In [1]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [2]:
# Sample scores: one row per (day, user) observation.
# Bound to `score_rows` rather than `list` so the built-in `list` type is not
# shadowed for the rest of the kernel session.
score_rows = [
    [0, 'a', 1.1],
    [0, 'b', 0.6],
    [1, 'b', 0.2],
    [1, 'c', 0.6],
    [2, 'c', 1.1],
    [3, 'a', 0.2],
    [3, 'b', 0.7],
]

df = spark.createDataFrame(score_rows, ['day', 'user', 'score'])

df.show()

In [3]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Expanding frame per user: every row from the user's first day up to the current row.
w = (
    Window()
    .partitionBy("user")
    .orderBy("day")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Overwrite `score` with the mean of the user's scores inside that frame.
running_mean = F.mean("score").over(w)
df.withColumn("score", running_mean).show()

In [4]:
# Without any filtering:
#+---+----+-------------------+
#|day|user|              score|
#+---+----+-------------------+
#|  0|   a|                1.1|
#|  0|   b|                0.6|
#|  1|   c|                0.6|
#|  1|   b|                0.5|
#|  1|   a|               0.75|
#|  2|   c| 1.4000000000000001|
#|  2|   b|               0.25|
#|  2|   a|              0.375|
#|  3|   c| 0.7000000000000001|
#|  3|   b|              0.825|
#|  3|   a|             0.3875|
#|  4|   c|0.35000000000000003|
#|  4|   b|             0.4125|
#|  4|   a|            0.19375|
#|  5|   c|0.17500000000000002|
#|  5|   b|            0.20625|
#|  5|   a|           0.096875|
#|  6|   c|0.08750000000000001|
#|  6|   b|           0.103125|
#|  6|   a|          0.0484375|
#+---+----+-------------------+

In [6]:
# Single-column sample frame of facility types.
# Bound to `type_rows` rather than `list` so the built-in `list` is not shadowed.
type_rows = [
    ['clinic'],
    ['office'],
]

input_df = spark.createDataFrame(type_rows, ['type_txt'])

input_df.show()

In [7]:
condition = "type_txt = 'clinic'"

# Categorise each row. The first branch is a SQL string predicate evaluated via
# F.expr; the second is an ordinary Column comparison. Anything else is "other".
is_clinic = F.expr(condition)
is_office = F.col("type_txt") == 'office'
category = (
    F.when(is_clinic, F.lit("clinic"))
    .when(is_office, F.lit("office"))
    .otherwise(F.lit("other"))
)
input_df1 = input_df.withColumn("prm_data_category", category)

input_df1.show()

In [9]:
# One row per array of string tokens; the single column is named `array`.
# Bound to `array_rows` rather than `list` so the built-in `list` is not shadowed.
array_rows = [
    [['1', '9', '1']],
    [['2', '2', '2', '1', '2']],
    [['3', '4', '4', '1', '4']],
    [['1', '4']],
    [['99', '99', '100']],
    [['92', '11', '92']],
    [['0', '0', '1']],
]

df = spark.createDataFrame(array_rows, ['array'])


df.show()
       

In [10]:
from pyspark.sql import functions as F
# Build a map from each distinct element of `array` to its number of occurrences:
#   keys   = array_distinct(array)
#   values = for every distinct x, size(filter(array, y -> y = x))
# Keys and values are both derived from array_distinct(array), so they share one order.
df\
  .withColumn("count",\
          F.expr("""map_from_arrays(array_distinct(array),transform(array_distinct(array),\
              x-> size(filter(array,y-> y=x))))"""))\
  .show(truncate=False)

#+---------------+------------------------+
#|array          |count                   |
#+---------------+------------------------+
#|[1, 9, 1]      |[1 -> 2, 9 -> 1]        |
#|[2, 2, 2, 1, 2]|[2 -> 4, 1 -> 1]        |
#|[3, 4, 4, 1, 4]|[3 -> 1, 4 -> 3, 1 -> 1]|
#|[1, 4]         |[1 -> 1, 4 -> 1]        |
#|[99, 99, 100]  |[99 -> 2, 100 -> 1]     |
#|[92, 11, 92]   |[92 -> 2, 11 -> 1]      |
#|[0, 0, 1]      |[0 -> 2, 1 -> 1]        |
#+---------------+------------------------+

In [11]:
df.show() #sample dataframe
#+---------------+
#|          array|
#+---------------+
#|      [1, 1, 1]|
#|[2, 1, 3, 3, 2]|
#|        [8, 99]|
#|      [9, 7, G]|
#|      [S, T, U]|
#|      [G, C, G]|
#+---------------+

from pyspark.sql import functions as F

# Count the occurrences of every distinct element, with keys in sorted order.
#
# `count[i]` must belong to `array_distinct(sort_array(array))[i]`, because that is
# the key array handed to map_from_arrays below. The counts are therefore computed
# by transforming that same sorted-distinct array. Sorting the counts on their own
# (array_sort over the counts) detaches them from their keys: [A, A, B] would yield
# A -> 1, B -> 2 instead of A -> 2, B -> 1.
(
    df
    .withColumn(
        "count",
        F.expr(
            "transform(array_distinct(sort_array(array)), "
            "x -> aggregate(array, 0, (acc, t) -> acc + IF(t = x, 1, 0)))"
        ),
    )
    .withColumn(
        "zip",
        F.map_from_arrays(F.array_distinct(F.sort_array(F.col("array"))), F.col("count")),
    )
    .show(truncate=False)
)

#+---------------+---------+------------------------+
#|array          |count    |zip                     |
#+---------------+---------+------------------------+
#|[A, A, B]      |[2, 1]   |[A -> 2, B -> 1]        |
#|[D, P, E, P, P]|[1, 1, 3]|[D -> 1, E -> 1, P -> 3]|
#|[H, X]         |[1, 1]   |[H -> 1, X -> 1]        |
#|[P, Q, G]      |[1, 1, 1]|[G -> 1, P -> 1, Q -> 1]|
#|[S, T, U]      |[1, 1, 1]|[S -> 1, T -> 1, U -> 1]|
#|[G, C, G]      |[1, 2]   |[C -> 1, G -> 2]        |
#+---------------+---------+------------------------+

In [12]:
#+---------------+
#|          array|
#+---------------+
#|      [1, 9, 1]|
#|[2, 2, 2, 1, 2]|
#|[3, 4, 4, 1, 4]|
#|         [1, 4]|
#|  [99, 99, 100]|
#|   [92, 11, 92]|
#|      [0, 0, 1]|
#+---------------+

from pyspark.sql import functions as F
# Map each distinct element of `array` to its occurrence count.
# For every distinct x, aggregate() folds over `array` starting from 0 and adds 1
# for each element t equal to x. Keys and counts both come from
# array_distinct(array), so they are in the same order.
df\
  .withColumn("count",\
          F.expr("""map_from_arrays(array_distinct(array),transform(array_distinct(array),\
              x-> aggregate(array, 0,(acc,t)->acc+IF(t=x,1,0))))"""))\
  .show(truncate=False)

#+---------------+------------------------+
#|array          |count                   |
#+---------------+------------------------+
#|[1, 9, 1]      |[1 -> 2, 9 -> 1]        |
#|[2, 2, 2, 1, 2]|[2 -> 4, 1 -> 1]        |
#|[3, 4, 4, 1, 4]|[3 -> 1, 4 -> 3, 1 -> 1]|
#|[1, 4]         |[1 -> 1, 4 -> 1]        |
#|[99, 99, 100]  |[99 -> 2, 100 -> 1]     |
#|[92, 11, 92]   |[92 -> 2, 11 -> 1]      |
#|[0, 0, 1]      |[0 -> 2, 1 -> 1]        |
#+---------------+------------------------+

In [13]:
from pyspark.sql import functions as F

elements=[1,9,2,3,4,99,100,92,11,0]

# Total occurrences of each candidate element across all rows of `df`.
# Per row, size(filter(array, x -> x = <element>)) counts the matches; F.sum then adds
# those per-row counts over the whole frame. One output column per element, named
# after the element (alias needs a string, hence str(y)).
# The expression reads the `array` column, which is the column `df` actually has.
total_counts = [
    F.sum(F.expr("size(filter(array,x->x={}))".format(y))).alias(str(y))
    for y in elements
]
df.select(*total_counts).show()

In [14]:
elements=[1,9,2,3,4,99,100,92,11,0]
from pyspark.sql import functions as F
# For every candidate in `elements`, count how often it occurs in `array`
# (size(filter(...))) and store each count in a nested struct named after the element.
# The counts are then read back through the `struct.<element>.col1` path, joined with
# "," by concat_ws and split on "," again, giving one `count` array per row in the
# order of `elements`.
# The chain ends in .show(), so `collected` holds show()'s return value, not a DataFrame.
collected=df.withColumn("struct", F.struct(*[(F.struct(F.expr("size(filter(array,x->x={}))"\
                                                    .format(y))).alias(str(y))) for y in elements]))\
            .select("array",F.split(F.concat_ws(",",*[(F.col("struct.{}.col1".format(x)).alias(str(x)+'count'))\
                                          for x in elements]).alias("count"),',').alias("count")).show(truncate=False)

In [15]:
elements=[1,9,2,3,4,99,100,92,11,0]
from pyspark.sql import functions as F

# Per row, pair every candidate element with its occurrence count in `array` and keep
# only the pairs whose count is non-zero.
#   struct : one nested struct per element holding size(filter(array, x -> x = element))
#   vals   : those counts collected into an array, in the order of `elements`
#   count  : arrays_zip(<literal elements>, vals), filtered on x.vals != 0
# The chain is wrapped in parentheses so no backslash continuations are needed; the
# incomplete `.withColumn("count")` step (no column expression) has been removed.
per_element_counts = F.struct(*[
    F.struct(F.expr("size(filter(array,x->x={}))".format(y))).alias(str(y))
    for y in elements
])
(
    df.withColumn("struct", per_element_counts)
      .withColumn("vals", F.array(*[F.col("struct.{}.col1".format(x)) for x in elements]))
      .select(
          "array",
          F.arrays_zip(F.array(*[F.lit(x) for x in elements]), F.col("vals")).alias("count"),
      )
      .withColumn("count", F.expr("""filter(count,x-> x.vals != 0)"""))
      .show(truncate=False)
)

In [16]:
elements=[1,9,2,3,4,99,100,92,11,0]
from pyspark.sql import functions as F
# Per row, build a map {element -> occurrence count} restricted to non-zero counts:
#   struct : one nested struct per element holding size(filter(array, x -> x = element))
#   vals   : the counts as an array, in the order of `elements`
#   elems  : the candidate elements as a literal array
#   count  : arrays_zip(elems, vals) filtered on x.vals != 0, turned into a map
# The chain ends in .show(), so `collected` holds show()'s return value, not a DataFrame.
collected=df.withColumn("struct", F.struct(*[(F.struct(F.expr("size(filter(array,x->x={}))"\
                                                    .format(y))).alias(str(y))) for y in elements]))\
            .withColumn("vals", F.array(*[(F.col("struct.{}.col1".format(x))) for x in elements]))\
            .withColumn("elems", F.array(*[F.lit(x) for x in elements]))\
            .withColumn("count", F.map_from_entries(F.expr("""filter(arrays_zip(elems,vals),x-> x.vals != 0)""")))\
            .select("array","count")\
            .show(truncate=False)


In [17]:
elements=[1,9,2,3,4,99,100,92,11,0]
from pyspark.sql import functions as F
# Variant that collects the per-element counts into an *array* of single-field structs
# (instead of a struct of structs), drops the entries whose `col1` is 0, and prints the
# resulting schema rather than the data.
# The chain ends in .printSchema(), so `collected` holds its return value, not a DataFrame.
collected=df.withColumn("struct", F.array(*[(F.struct(F.expr("size(filter(array,x->x={}))"\
                                                    .format(y))).alias(str(y))) for y in elements]))\
             .withColumn("struct", F.expr("""filter(struct,x-> x.col1!=0)"""))\
             .printSchema()

In [18]:
elements=[1,9,2,3,4,99,100,92,11,0]
from pyspark.sql import functions as F
# Variant that builds the element -> count map directly with map_from_arrays:
# keys are the literal `elements`, values are the counts read from the nested
# `struct.<element>.col1` fields. Zero counts are not filtered out in this version.
# Only the schema is printed; `collected` holds printSchema()'s return value.
collected=df.withColumn("struct", F.struct(*[(F.struct(F.expr("size(filter(array,x->x={}))"\
                                                    .format(y))).alias(str(y))) for y in elements]))\
            .select("array",F.map_from_arrays(F.array(*[F.lit(x) for x in elements]),\
                                                       F.array(*[(F.col("struct.{}.col1".format(x)))\
                                          for x in elements])))\
                    .printSchema()
            #.select("array", F.expr("""filter(count,x->x.)""".alias("count")).show(truncate=False)

In [19]:
# Student/project sample rows for `table1`.
# Bound to `student_rows` / `cohort_rows` rather than `list` / `list1` so the built-in
# `list` is not shadowed.
student_rows = [
    [13, 18, 'Name', 'project/sd-03-bloc...', 'true', 'standard', 1.0, 3],
    [13, 7, 'Name', 'project/sd-03-bloc...', 'true', 'standard', 1.0, 3],
    [13, 27, 'Name', 'project/sd-03-bloc...', 'true', 'standard', 1.0, 3],
]

table1 = spark.createDataFrame(
    student_rows,
    ['student_id', 'project_id', 'name', 'project_name', 'approved',
     'evaluation_type', 'grade', 'cohort_number'],
)


table1.show()


# (cohort_number, project_id) pairs for `table2`.
cohort_rows = [
    [3, 18],
    [3, 27],
    [4, 15],
    [3, 7],
    [3, 35],
]
table2 = spark.createDataFrame(cohort_rows, ['cohort_number', 'project_id'])

table2.show()


In [20]:
# Right-join the student rows onto the cohort/project pairs (project_id renamed to
# project_id2 to avoid a name clash), keep the first value of every table1 column per
# project_id2, and drop any resulting row that contains a null.
cohort_projects = table2.withColumnRenamed("project_id", "project_id2")
joined = table1.join(cohort_projects, ['cohort_number'], 'right')
first_of_each = [F.first(x).alias(x) for x in table1.columns]

joined.groupBy("project_id2").agg(*first_of_each).dropna().show()


In [21]:
# Input rows: (status, year, close_price).
# Bound to `close_rows` / `expected_rows` rather than `list` / `list1` so the built-in
# `list` is not shadowed.
close_rows = [
    ['s1', 0, 1.2],
    ['s1', 0, 2.2],
    ['s1', 1, 3.2],
    ['s1', 1, 4.2],
    ['s2', 1, 5.2],
    ['s1', 2, 6.2],
    ['s1', 2, 7.2],
]

df = spark.createDataFrame(close_rows, ['status', 'year', 'close_price'])

df.show()

# The same rows with an additional open_price column.
expected_rows = [
    ['s1', 0, 1.2, 0.0],
    ['s1', 0, 2.2, 0.0],
    ['s1', 1, 3.2, 1.2],
    ['s1', 1, 4.2, 2.2],
    ['s2', 1, 5.2, 0.0],
    ['s1', 2, 6.2, 3.2],
    ['s1', 2, 7.2, 4.2],
]

df1 = spark.createDataFrame(expected_rows, ['status', 'year', 'close_price', 'open_price'])

df1.show()

In [22]:
df.show() #sample data

#+------+----+-----------+
#|status|year|close_price|
#+------+----+-----------+
#|    s1|   0|        1.2|
#|    s1|   0|        2.2|
#|    s1|   1|        3.2|
#|    s1|   1|        4.2|
#|    s2|   1|        5.2|
#|    s1|   2|        6.2|
#|    s1|   2|        7.2|
#+------+----+-----------+


# Windows used below (the columns they name are created inside the chain):
#   w  - per status, ordered by mono_id
#   w1 - all rows, ordered by mono_id
#   w2 - per value of `sum`, ordered by mono_id
w=Window().partitionBy("status").orderBy("mono_id")
w1=Window().orderBy("mono_id")
w2=Window().partitionBy("sum").orderBy("mono_id")
# Derive open_price from earlier close_price values. Steps, in chain order:
#   mono_id    - monotonically_increasing_id(), used as the ordering key
#   rowNum     - row_number within each status (w)
#   sum        - sum over w1 of a 1/0 flag that is 1 where rowNum == 1
#   sum (2nd)  - reset to 1 on the first row (over w2) of the group where sum == 2
#   lag1, lag2 - close_price 2 and 3 rows back over w1
#   open_price - lag1 when sum == 1 and lag1 is not null; lag2 when sum != 1; else 0
#   open_price (2nd) - forced to 0 on the first row of each status (rowNum == 1)
# Finally rows are ordered by mono_id and the helper columns are dropped (`sum` is kept).
df.withColumn("mono_id", F.monotonically_increasing_id())\
  .withColumn("rowNum", F.row_number().over(w))\
  .withColumn("sum", F.sum(F.when(F.col("rowNum")==1, F.lit(1)).otherwise(F.lit(0))).over(w1))\
  .withColumn("sum", F.when((F.row_number().over(w2)==1) & (F.col("sum")==2), F.lit(1)).otherwise(F.col("sum")))\
    .withColumn("lag1", F.lag("close_price",2).over(w1))\
     .withColumn("lag2", F.lag("close_price",3).over(w1))\
  .withColumn("open_price", F.when((F.col("sum")==1)&(F.col("lag1").isNotNull()), F.col("lag1"))\
                             .when((F.col("sum")!=1),F.col("lag2"))\
                              .otherwise(F.lit(0)))\
 .withColumn("open_price", F.when(F.col("rowNum")==1, F.lit(0)).otherwise(F.col("open_price")))\
  .orderBy("mono_id").drop("mono_id","lag1","lag2","rowNum")\
  .show()

#+------+----+-----------+---+----------+
#|status|year|close_price|sum|open_price|
#+------+----+-----------+---+----------+
#|    s1|   0|        1.2|  1|       0.0|
#|    s1|   0|        2.2|  1|       0.0|
#|    s1|   1|        3.2|  1|       1.2|
#|    s1|   1|        4.2|  1|       2.2|
#|    s2|   1|        5.2|  1|       0.0|
#|    s1|   2|        6.2|  2|       3.2|
#|    s1|   2|        7.2|  2|       4.2|
#+------+----+-----------+---+----------+
 

In [23]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window



# w1: all rows ordered by mono_id; w2: the same ordering within each status.
w1=Window().orderBy(F.col("mono_id"))
w2=Window().partitionBy("status").orderBy("mono_id")


# open_price per row, evaluated top to bottom:
#   - 0 on the first row of each status (row_number over w2 == 1)
#   - lag1 (close_price 2 rows back) when lag2 is null but lag1 is not
#   - 0 when both lags are null
#   - lag2 (close_price 3 rows back) otherwise
# The chain is parenthesised so no backslash continuations (or stray characters
# between them) are needed.
(
    df.withColumn("mono_id", F.monotonically_increasing_id())
      .withColumn("lag1", F.lag("close_price",2).over(w1))
      .withColumn("lag2", F.lag("close_price",3).over(w1))
      .withColumn(
          "open_price",
          F.when(F.row_number().over(w2)==1, F.lit(0))
           .when((F.col("lag2").isNull())&(F.col("lag1").isNotNull()), F.col("lag1"))
           .when(F.col("lag2").isNull()&(F.col("lag1").isNull()), F.lit(0))
           .otherwise(F.col("lag2")),
      )
      .orderBy("mono_id")
      .drop("mono_id","lag1","lag2")
      .show()
)





In [24]:
# Display the column names of the current `df`.
df.columns

In [25]:


from pyspark.sql import functions as F
from pyspark.sql.window import Window


# w1 orders all rows by (year, mono_id); w2 applies the same ordering within each status.
w1=Window().orderBy(F.col("year"),F.col("mono_id"))
w2=Window().partitionBy("status").orderBy(F.col("year"),F.col("mono_id"))


# open_price is 0 on the first row of each status (row_number over w2 == 1) or when
# there is no row two positions back in w1; otherwise it is the close_price from two
# rows earlier in w1 (lag1).
# "lag2" is listed in drop() although this chain never creates that column.
df.withColumn("mono_id", F.monotonically_increasing_id())\
   .withColumn("lag1", F.lag("close_price",2).over(w1))\
              .withColumn("open_price",F.when(F.row_number().over(w2)==1,F.lit(0))\
                          .when(F.col("lag1").isNull(),F.lit(0))\
                                  .otherwise(F.col("lag1")))\
    .orderBy("year","mono_id")\
    .drop("mono_id","lag1","lag2").show()


# NOTE(review): the table below lists close_price1/open_price1 and no `year` column,
# which the chain above does not produce — it looks like output from a different
# input frame; confirm.
#+------+-----------+----------+------------+-----------+
#|status|close_price|open_price|close_price1|open_price1|
#+------+-----------+----------+------------+-----------+
#|    s1|        1.2|       0.0|         2.1|        0.0|
#|    s1|        2.2|       1.2|         3.1|        0.0|
#|    s1|        3.2|       0.0|         4.1|        3.1|
#|    s2|        4.2|       2.2|         5.1|        3.1|
#|    s2|        5.2|       3.2|         6.1|        4.1|
#|    s1|        6.2|       4.2|         7.1|        5.1|
#+------+-----------+----------+------------+-----------+

In [26]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window


# w1: all rows ordered by mono_id; w2: the same ordering within each status.
w1=Window().orderBy(F.col("mono_id"))
w2=Window().partitionBy("status").orderBy("mono_id")


# Applies the same lag-based rule twice: once for close_price -> open_price (lag1/lag2)
# and once for close_price1 -> open_price1 (lag3/lag4). In each case the result is 0 on
# the first row of a status, the shorter lag when only the longer one is null, 0 when
# both are null, and the longer lag otherwise.
# NOTE(review): lag3/lag4 read `close_price1`; the frame built earlier in this file has
# only status, year and close_price — confirm the intended input frame.
# NOTE(review): lag3 and lag4 both use offset 2 and are therefore identical, whereas
# lag1/lag2 use offsets 2 and 3 — confirm whether lag4 was meant to use 3.
df.withColumn("mono_id", F.monotonically_increasing_id())\
   .withColumn("lag1", F.lag("close_price",2).over(w1))\
     .withColumn("lag2", F.lag("close_price",3).over(w1))\
              .withColumn("open_price",F.when(F.row_number().over(w2)==1,\
                                   F.lit(0)).when((F.col("lag2").isNull())&(F.col("lag1").isNotNull()),F.col("lag1"))\
                                            .when(F.col("lag2").isNull()&(F.col("lag1").isNull()),F.lit(0))\
                                                 .otherwise(F.col("lag2"))).orderBy("mono_id")\
    .withColumn("lag3", F.lag("close_price1",2).over(w1))\
     .withColumn("lag4", F.lag("close_price1",2).over(w1))\
              .withColumn("open_price1",F.when(F.row_number().over(w2)==1,\
                                   F.lit(0)).when((F.col("lag4").isNull())&(F.col("lag3").isNotNull()),F.col("lag3"))\
                                            .when(F.col("lag4").isNull()&(F.col("lag3").isNull()),F.lit(0))\
                                                 .otherwise(F.col("lag4"))).orderBy("mono_id")\
                                                .drop("mono_id","lag1","lag2","lag3","lag4")\
      .show()
  

#+------+-----------+----------+------------+-----------+
#|status|close_price|open_price|close_price1|open_price1|
#+------+-----------+----------+------------+-----------+
#|    s1|        1.2|       0.0|         2.1|        0.0|
#|    s1|        2.2|       0.0|         3.1|        2.1|
#|    s1|        3.2|       1.2|         4.1|        0.0|
#|    s2|        4.2|       0.0|         5.1|        3.1|
#|    s2|        5.2|       2.2|         6.1|        4.1|
#|    s1|        6.2|       3.2|         7.1|        5.1|
#|    s2|        7.2|       4.2|         7.1|        5.1|
#+------+-----------+----------+------------+-----------+

In [27]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window


# w1: all rows ordered by mono_id; w2: the same ordering within each status.
w1=Window().orderBy(F.col("mono_id"))
w2=Window().partitionBy("status").orderBy("mono_id")


# open_price per row, evaluated top to bottom:
#   - 0 on the first row of each status (row_number over w2 == 1)
#   - 0 when either lag is null (lag1 = 2 rows back, lag2 = 3 rows back over w1)
#   - lag2 otherwise
# The null test is a single Column expression (`isNull() | isNull()`); the value for
# that branch is passed as the second argument of when().
(
    df.withColumn("mono_id", F.monotonically_increasing_id())
      .withColumn("lag1", F.lag("close_price",2).over(w1))
      .withColumn("lag2", F.lag("close_price",3).over(w1))
      .withColumn(
          "open_price",
          F.when(F.row_number().over(w2)==1, F.lit(0))
           .when(F.col("lag2").isNull() | F.col("lag1").isNull(), F.lit(0))
           .otherwise(F.col("lag2")),
      )
      .orderBy("mono_id")
      .drop("mono_id","lag1","lag2")
      .select("status","close_price","open_price")
      .show()
)


#+------+-----------+----------+------------+-----------+
#|status|close_price|open_price|close_price1|open_price1|
#+------+-----------+----------+------------+-----------+
#|    s1|        1.2|       0.0|         2.1|        0.0|
#|    s1|        2.2|       0.0|         3.1|        2.1|
#|    s2|        3.2|       1.2|         4.1|        0.0|
#|    s2|        4.2|       0.0|         5.1|        3.1|
#|    s1|        5.2|       2.2|         6.1|        4.1|
#|    s1|        6.2|       3.2|         7.1|        5.1|
#|    s1|        7.2|       4.2|         7.1|        5.1|
#+------+-----------+----------+------------+-----------+

In [28]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
    
    
# w1: all rows ordered by mono_id; w2: the same ordering within each status.
w1 = Window().orderBy(F.col("mono_id"))
w2 = Window().partitionBy("status").orderBy("mono_id")

# Branches are checked in order: first row of a status -> 0; only lag2 missing -> lag1;
# both lags missing -> 0; otherwise lag2.
lag1_col = F.col("lag1")
lag2_col = F.col("lag2")
open_price_rule = (
    F.when(F.row_number().over(w2) == 1, F.lit(0))
    .when(lag2_col.isNull() & lag1_col.isNotNull(), lag1_col)
    .when(lag2_col.isNull() & lag1_col.isNull(), F.lit(0))
    .otherwise(lag2_col)
)

(
    df.withColumn("mono_id", F.monotonically_increasing_id())
      .withColumn("lag1", F.lag("close_price", 2).over(w1))
      .withColumn("lag2", F.lag("close_price", 3).over(w1))
      .withColumn("open_price", open_price_rule)
      .orderBy("mono_id")
      .drop("mono_id", "lag1", "lag2")
      .select("status", "close_price", "open_price")
      .show()
)

In [29]:
#+------+-----------+----------+------------+-----------+
#|status|close_price|open_price|close_price1|open_price1|
#+------+-----------+----------+------------+-----------+
#|    s1|        1.2|       0.0|         2.1|        0.0|
#|    s1|        2.2|       0.0|         3.1|        2.1|
#|    s2|        3.2|       1.2|         4.1|        0.0|
#|    s2|        4.2|       0.0|         5.1|        3.1|
#|    s1|        5.2|       2.2|         6.1|        4.1|
#|    s1|        6.2|       3.2|         7.1|        5.1|
#|    s1|        7.2|       4.2|         7.1|        5.1|
#+------+-----------+----------+------------+-----------+

In [30]:
# Position samples with missing values (None) in latitude, longitude and timestamp.
# Bound to `position_rows` rather than `list` so the built-in `list` is not shadowed.
position_rows = [
    [None, 4.905615, '2019-08-01 00:00:00', 1],
    [51.819645, None, '2019-08-01 00:00:00', 1],
    [51.81964, 4.961713, '2019-08-01 00:00:00', 2],
    [None, None, '2019-08-01 00:00:00', 3],
    [51.82918, 4.911187, None, 3],
    [51.82385, 4.901488, '2019-08-01 00:00:03', 5],
]


df = spark.createDataFrame(position_rows, ['latitude', 'longitude', 'timestamplast', 'name'])

df.show()


In [31]:
# Rows are grouped by `name`; the ordering key is a constant.
w=Window().partitionBy("name").orderBy(F.lit(1))


# F.when() needs both a condition and a value to use when the condition holds.
# NOTE(review): the fill value is an assumption — a null latitude is replaced by the
# first non-null latitude found over `w` (same `name`), and stays null if the group
# has none. Confirm this is the intended rule.
group_latitude = F.first("latitude", ignorenulls=True).over(w)
df.withColumn(
    "latitude",
    F.when(F.col("latitude").isNull(), group_latitude).otherwise(F.col("latitude")),
)

In [32]:
# Event rows: (timestamp string, name, time_d).
# Bound to `event_rows` rather than `list` so the built-in `list` is not shadowed.
event_rows = [
    ['2019-08-01 00:00:00', 1, 0],
    ['2019-08-01 00:01:00', 1, 60],
    ['2019-08-01 00:01:15', 1, 15],
    ['2019-08-01 03:00:00', 2, 0],
    ['2019-08-01 04:00:00', 2, 3600],
    ['2019-08-01 00:15:00', 3, 0],
]

df = spark.createDataFrame(event_rows, ['timestamplast', 'name', 'time_d'])

df.show()


In [33]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Per `name`, events ordered by their timestamp string.
w = Window().partitionBy("name").orderBy(F.col("timestamplast"))

# Step 1 stores the previous event's unix timestamp in `time_d`.
# Step 2 replaces it with (current - previous) seconds, or 0 where there is no
# previous event in the group.
event_ts = F.unix_timestamp("timestamplast")
previous_ts = F.lag(event_ts).over(w)
elapsed = (
    F.when(F.col("time_d").isNotNull(), event_ts - F.col("time_d"))
    .otherwise(F.lit(0))
)

(
    df.withColumn("time_d", previous_ts)
      .withColumn("time_d", elapsed)
      .orderBy("name", "timestamplast")
      .show()
)

#+-------------------+----+------+
#|      timestamplast|name|time_d|
#+-------------------+----+------+
#|2019-08-01 00:00:00|   1|     0|
#|2019-08-01 00:01:00|   1|    60|
#|2019-08-01 00:01:15|   1|    15|
#|2019-08-01 03:00:00|   2|     0|
#|2019-08-01 04:00:00|   2|  3600|
#|2019-08-01 00:15:00|   3|     0|
#+-------------------+----+------+

In [34]:
# One-row, one-column frame (`data` = 'ab') used by the array_repeat examples.
dftmp = spark.createDataFrame([('ab',)], ['data'])

In [35]:
from pyspark.sql import functions as F

# Repeat `data` as many times as it has characters.
# The repeat count must be an expression over existing columns: `dftmp` has no `len`
# column, so the count is computed with the SQL function length(data).
dftmp.withColumn('repeat', F.expr("""array_repeat(data, length(data))""")).show()

In [36]:
# Add `repeat`: an array holding `data` repeated length(data) times.
dftmp.withColumn('repeat', F.expr("""array_repeat(data, length(data))""")).show()

In [37]:
from pyspark.sql import functions as F
# Small sales-by-region sample frame.
df = spark.createDataFrame(
    [
        (5000, 'US'),
        (2500, 'IN'),
        (4500, 'AU'),
        (4500, 'NZ'),
    ],
    ["Sales", "Region"],
)
df.show()

In [38]:
# Add two columns whose values depend on one condition.
# A DataFrame has no `when` method: conditional values are built per column with
# F.when(...).otherwise(...) inside withColumn.
# NOTE(review): the original referenced an undefined `condition1`; Region == 'US' is
# assumed here — confirm the intended condition.
condition1 = F.col("Region") == 'US'

(
    df.withColumn('This', F.when(condition1, F.lit("yes")).otherwise(F.lit("no")))
      .withColumn('That', F.when(condition1, F.lit("also yes")).otherwise(F.lit("also no")))
      .show()
)

In [39]:
#+-----+------+
#|Sales|Region|
#+-----+------+
#| 5000|    US|
#| 2500|    IN|
#| 4500|    AU|
#| 4500|    NZ|
#+-----+------+

from pyspark.sql import functions as F

# Produce both new columns from a single when/otherwise by packing them into a struct
# (fields `This` and `That`), then expanding the struct with "col.*".
is_us = F.col("Region")=='US'
yes_pair = F.struct(F.lit("yes").alias("This"), F.lit("also yes").alias("That"))
no_pair = F.struct(F.lit("no").alias("This"), F.lit("also no").alias("That"))

(
    df.withColumn("col", F.when(is_us, yes_pair).otherwise(no_pair))
      .select(*df.columns, "col.*")
      .show()
)

#+-----+------+----+--------+
#|Sales|Region|This|    That|
#+-----+------+----+--------+
#| 5000|    US| yes|also yes|
#| 2500|    IN|  no| also no|
#| 4500|    AU|  no| also no|
#| 4500|    NZ|  no| also no|
#+-----+------+----+--------+

In [40]:
from pyspark.sql import functions as F
import datetime
# Two sample rows (OriginTz, Time) plus a constant string column `y`.
df = spark.createDataFrame(
    [
        ('America/NewYork', '2020-02-01 10:00:00'),
        ('Africa/Nairobi', '2020-02-01 10:00:00'),
    ],
    ["OriginTz", "Time"],
).withColumn("y", F.lit('2020-01-01'))

df.show()

In [41]:
from pyspark.sql import functions as F
# Add `new_date`: Time plus 14 hours when Time < y, otherwise Time plus 10 hours
# (SQL IF expression evaluated per row).
df.withColumn("new_date", F.expr("""IF(Time<y, Time + interval 14 hours, Time + interval 10 hours)""")).show()

In [42]:
# Sample people rows whose `name` values are wrapped in (and contain) double quotes.
# Bound to `person_rows` rather than `list` so the built-in `list` is not shadowed.
person_rows = [
    ['"alex"john"', 30, 'burlington', 'nj', 'usa'],
    ['"mohammad"hashmi"', 30, 'burlington', 'nj', 'usa'],
]

df = spark.createDataFrame(person_rows, ['name', 'age', 'county', 'state', 'country'])

df.show(truncate=False)

In [43]:
# For every column, remove a double quote at the very start and at the very end of
# the value; quotes in the middle are left alone. Column names are preserved.
unquoted_columns = [F.regexp_replace(x,'^\"|\"$','').alias(x) for x in df.columns]
df.select(*unquoted_columns).show()

In [44]:
# Drop the first and last character of `name`: substring starts at position 2 and
# takes length(name)-2 characters.
df.withColumn("name", F.expr("""substring(name,2,length(name)-2)""")).show()

#+---------+---+----------+-----+-------+
#|name     |age|county    |state|country|
#+---------+---+----------+-----+-------+
#|alex"john|30 |burlington|nj   |usa    |
#+---------+---+----------+-----+-------+

In [45]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Rows of (AsofDate, oneMonthAgo, value) as date strings plus an integer value.
# Bound to `value_rows` rather than `list` so the built-in `list` is not shadowed.
value_rows = [
    ['2019-02-23', '2019-02-20', 2],
    ['2019-03-20', '2019-02-20', 7],
    ['2019-03-21', '2019-02-21', 12],
    ['2019-03-22', '2019-02-22', 27],
    ['2019-03-23', '2019-02-23', 91],
]

df = spark.createDataFrame(value_rows, ['AsofDate', 'oneMonthAgo', 'value'])

df.show()

In [46]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Group rows by day-of-month, order them by AsofDate as epoch seconds, and look back
# over a range of 86400*30 seconds (30 days) up to the current row.
w=Window().partitionBy(F.dayofmonth("AsofDate"))\
          .orderBy(F.to_timestamp("AsofDate").cast("long"))\
          .rangeBetween(86400*-30,0)

# First `value` inside that frame.
first=F.first("value").over(w)

# 1MonthAgoValue is that first value when it differs from the row's own value, else null.
# NOTE(review): a genuine earlier row with the same value would also yield null — confirm
# this is acceptable.
df.withColumn("1MonthAgoValue", F.when(first!=F.col("value"), first)\
                                 .otherwise(F.lit(None))).show()

In [47]:
from pyspark.sql import functions as F
from pyspark.sql import Window

# Rows sharing the same year and day-of-month form one group, ordered by month, so the
# previous row in a group is the same day in an earlier month.
same_day_keys = [F.year('AsofDate'), F.dayofmonth('AsofDate')]
w = Window.partitionBy(*same_day_keys).orderBy(F.month('AsofDate'))

previous_value = F.lag('value').over(w)
df.withColumn('1MonthAgoValue', previous_value).show()