In [1]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window


In [2]:
# Sample rows: each record holds two arrays of strings (subject_1, subject_2).
# Named `rows` rather than `list` so the builtin `list` type is not shadowed
# for the rest of the kernel session.
rows = [
    [["A", "B"], ["A", "B", "C"]],
    [["A", "C"], ["A", "B", "C"]],
    [["A", "B", "D"], ["A", "B", "E"]],
]

df = spark.createDataFrame(rows, ["subject_1", "subject_2"])

# truncate=False prints the full cell contents instead of abbreviating them.
df.show(truncate=False)

In [3]:
# Sample rows: (core_id, year_month as 'yyyy_MM' text, month_sum_detailaction).
# Named `rows` rather than `list` so the builtin `list` type is not shadowed.
rows = [
    ["000006c9-d42b-4fe...", "2019_09", 3],
    ["000006c9-d42b-4fe...", "2020_01", 2],
    ["000006c9-d42b-4fe...", "2020_02", 6],
]

df = spark.createDataFrame(rows, ["core_id", "year_month", "month_sum_detailaction"])

df.show()

In [4]:
from pyspark.sql import functions as F

# For each core_id, emit one row per month from 2019_05 through 2020_06,
# using 0 for months that have no recorded month_sum_detailaction.
(
    df
    .groupBy("core_id")
    .agg(
        # Parse the 'yyyy_MM' text into dates so it can be compared with the generated sequence.
        F.collect_list(F.to_date("year_month", "yyyy_MM")).alias("year_month"),
        F.collect_list("month_sum_detailaction").alias("month_sum_detailaction"),
    )
    # One date per month across the whole window.
    .withColumn(
        "seq",
        F.expr("""sequence(to_date('2019_05','yyyy_MM'),to_date('2020_06','yyyy_MM'),interval 1 month)"""),
    )
    # Observed months first, followed by the months of the window that were not observed.
    .withColumn(
        "year_month",
        F.flatten(F.array("year_month", F.array_except("seq", "year_month"))),
    )
    # The values array is shorter than the months array; arrays_zip fills the
    # missing positions with null, which fillna(0) below turns into 0.
    .withColumn("zip", F.explode(F.arrays_zip("year_month", "month_sum_detailaction")))
    .select("core_id", F.col("zip.*"))
    # Back to the original 'yyyy_MM' text representation.
    .withColumn("year_month", F.date_format("year_month", "yyyy_MM"))
    .orderBy("year_month")
    .fillna(0)
    .show()
)

#+--------------------+----------+----------------------+
#|             core_id|year_month|month_sum_detailaction|
#+--------------------+----------+----------------------+
#|000006c9-d42b-4fe...|   2019_05|                     0|
#|000006c9-d42b-4fe...|   2019_06|                     0|
#|000006c9-d42b-4fe...|   2019_07|                     0|
#|000006c9-d42b-4fe...|   2019_08|                     0|
#|000006c9-d42b-4fe...|   2019_09|                     3|
#|000006c9-d42b-4fe...|   2019_10|                     0|
#|000006c9-d42b-4fe...|   2019_11|                     0|
#|000006c9-d42b-4fe...|   2019_12|                     0|
#|000006c9-d42b-4fe...|   2020_01|                     2|
#|000006c9-d42b-4fe...|   2020_02|                     6|
#|000006c9-d42b-4fe...|   2020_03|                     0|
#|000006c9-d42b-4fe...|   2020_04|                     0|
#|000006c9-d42b-4fe...|   2020_05|                     0|
#|000006c9-d42b-4fe...|   2020_06|                     0|
#+--------------------+----------+----------------------+

In [5]:
+---------------+
|min(year_month)|
+---------------+
|        2019_05|
+---------------+
+---------------+
|max(year_month)|
+---------------+
|        2020_06|
+---------------+

In [6]:
from pyspark.sql import functions as F

# An earlier cell rebinds `df` to the core_id/year_month frame, so on a
# top-to-bottom run `df` no longer has subject_1/subject_2 when this cell
# executes. Build the subjects frame locally so the cell does not depend on
# whichever frame `df` currently points at.
subjects_df = spark.createDataFrame(
    [
        [["A", "B"], ["A", "B", "C"]],
        [["A", "C"], ["A", "B", "C"]],
        [["A", "B", "D"], ["A", "B", "E"]],
    ],
    ["subject_1", "subject_2"],
)

# both   -> elements present in both arrays
# only_1 -> elements of subject_1 that are not in subject_2
# only_2 -> elements of subject_2 that are not in subject_1
(
    subjects_df
    .withColumn("both", F.array_intersect("subject_1", "subject_2"))
    .withColumn("only_1", F.array_except("subject_1", "subject_2"))
    .withColumn("only_2", F.array_except("subject_2", "subject_1"))
    .show()
)

#+---------+---------+------+------+------+
#|subject_1|subject_2|  both|only_1|only_2|
#+---------+---------+------+------+------+
#|   [A, B]|[A, B, C]|[A, B]|    []|   [C]|
#|   [A, C]|[A, B, C]|[A, C]|    []|   [B]|
#|[A, B, D]|[A, B, E]|[A, B]|   [D]|   [E]|
#+---------+---------+------+------+------+

In [7]:
# Sample rows: (Date as text, Name, Tasks).
# Named `rows` rather than `list` so the builtin `list` type is not shadowed.
rows = [
    ["01-03-20", "Bob", 3],
    ["01-04-20", "Bob", 2],
    ["01-06-20", "Bob", 9],
    ["01-02-20", "Alice", 7],
    ["01-03-20", "Alice", 5],
    ["01-04-20", "Alice", 4],
]

df = spark.createDataFrame(rows, ["Date", "Name", "Tasks"])

df.show()

In [8]:
# Convert `depdate` from a day offset into a calendar date by adding it to
# 1960-01-01. `df_immigration` is expected to be defined elsewhere.
#
# Fixes relative to the previous version:
#   * `.show()` returns None, so chaining it into the assignment left
#     `df_immigration_new` as None; the frame is now assigned first and
#     displayed in a separate statement.
#   * `F.to_date('1960-01-01')` interprets the string as a column NAME; the
#     base date is now a SQL literal inside `F.expr`, which also lets the
#     number of days come from a column.
#   * The cast uses the type name "int", so no `T` alias is required.
df_immigration_new = (
    df_immigration
    .withColumn("depdate2", F.col("depdate").cast("int"))
    .withColumn("depdate", F.expr("date_add(to_date('1960-01-01'), depdate2)"))
)

df_immigration_new.show(n=1, truncate=False)

In [9]:
# Review rows: an id, the raw comment text, and that text split into sentences.
# NOTE: the name `list` shadows the builtin; it is kept here because the
# following cell passes `list` to spark.createDataFrame.
list = [
    [1, "Hi. Sent1. Sent2.", ["Hi.", "Sent1.", "Sent2."]],
    [2, "Yeah. Ok.", ["Yeah.", "Ok."]],
]



In [10]:
# Build the reviews frame from the `list` rows defined in the preceding cell.
df = spark.createDataFrame(
    list,
    schema=["REVIEW_ID", "REVIEW_COMMENTS", "SENTENCES_LIST"],
)

df.show()

In [11]:
from pyspark.sql import functions as F

# Produce one row per sentence, numbered from 1.
# The SQL `transform` lambda receives (element, zero-based index) and wraps
# them in a struct; `explode` then emits one row per struct, and "list.*"
# expands the struct fields into SENTENCE and SENT_NUMBER columns.
(
    df
    .withColumn(
        "list",
        F.explode(
            F.expr("""transform(SENTENCES_LIST,(x,i)-> struct(x as SENTENCE,(i+1) as SENT_NUMBER))""")
        ),
    )
    .select("*", "list.*")
    .show()
)

In [12]:
# Sample rows: (id, date, group, start_time, end_time, duration).
# Timestamps are kept as 'yyyy-MM-dd HH:mm:ss' text.
# Named `rows` rather than `list` so the builtin `list` type is not shadowed.
rows = [
    ["78aa", "2020-04-14", 3, "2020-04-14 19:00:00", "2020-04-14 19:23:59", 24],
    ["78aa", "2020-04-14", 3, "2020-04-14 19:24:00", "2020-04-14 19:26:59", 4],
    ["78aa", "2020-04-14", 3, "2020-04-14 19:27:00", "2020-04-14 19:35:59", 8],
    ["78aa", "2020-04-14", 3, "2020-04-14 19:36:00", "2020-04-14 19:55:00", 19],
    ["25aa", "2020-04-15", 7, "2020-04-15 08:00:00", "2020-04-15 08:02:59", 3],
    ["25aa", "2020-04-15", 7, "2020-04-15 11:03:00", "2020-04-15 11:11:59", 9],
    ["25aa", "2020-04-15", 7, "2020-04-15 11:12:00", "2020-04-15 11:45:59", 34],
    ["25aa", "2020-04-15", 7, "2020-04-15 11:46:00", "2020-04-15 11:47:00", 1],
]

df = spark.createDataFrame(rows, ["id", "date", "group", "start_time", "end_time", "duration"])

df.show()

In [13]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Merge consecutive intervals of the same (id, date, group) when the next one
# starts at most 1 second after the previous one ends.
w = Window.partitionBy("id", "date", "group").orderBy("start_time")

# Seconds between this row's start and the previous row's end. The first row
# of each partition has no predecessor, so the gap is null and the row falls
# through to otherwise(0).
gap_seconds = F.unix_timestamp("start_time") - F.lag(F.unix_timestamp("end_time")).over(w)
starts_new_run = F.when(gap_seconds > 1, F.lit(1)).otherwise(F.lit(0))

(
    df
    # Running count of "gap" rows: every row of one contiguous run shares the same value.
    .withColumn("check", F.sum(starts_new_run).over(w))
    .groupBy("date", "id", "group", "check")
    .agg(
        # min/max instead of first/last: first/last depend on row order, which
        # is not guaranteed after the groupBy shuffle. The timestamps are
        # 'yyyy-MM-dd HH:mm:ss' text, so string ordering matches time ordering.
        F.min("start_time").alias("start_time"),
        F.max("end_time").alias("end_time"),
        F.sum("duration").alias("duration"),
    )
    .drop("check")
    # Explicit ordering so the displayed rows are reproducible.
    .orderBy("date", "start_time")
    .show()
)

#+----------+----+-----+-------------------+-------------------+--------+
#|      date|  id|group|         start_time|           end_time|duration|
#+----------+----+-----+-------------------+-------------------+--------+
#|2020-04-14|78aa|    3|2020-04-14 19:00:00|2020-04-14 19:55:00|      55|
#|2020-04-15|25aa|    7|2020-04-15 08:00:00|2020-04-15 08:02:59|       3|
#|2020-04-15|25aa|    7|2020-04-15 11:03:00|2020-04-15 11:47:00|      44|
#+----------+----+-----+-------------------+-------------------+--------+

In [14]:
# Aggregate intervals per (date, id, group).
# The previous version never closed the `withColumn(` call, so the cell
# raised a SyntaxError before anything ran.
w = Window.partitionBy("id", "date", "group").orderBy("start_time")

# Seconds between this row's start and the previous row's end (null on the
# first row of each partition, which then falls through to otherwise(0)).
gap_seconds = F.unix_timestamp("start_time") - F.lag(F.unix_timestamp("end_time")).over(w)

(
    df
    .withColumn("check", F.when(gap_seconds > 1, F.lit(1)).otherwise(F.lit(0)))
    # NOTE(review): "check" is not one of the grouping keys, so it has no
    # effect on the result; every (date, id, group) collapses into a single
    # row even when there is a gap between intervals. Confirm this is intended.
    .groupBy("date", "id", "group")
    .agg(
        # min/max instead of first/last: first/last depend on row order, which
        # is not guaranteed after the groupBy shuffle. The timestamps are
        # 'yyyy-MM-dd HH:mm:ss' text, so string ordering matches time ordering.
        F.min("start_time").alias("start_time"),
        F.max("end_time").alias("end_time"),
        F.sum("duration").alias("duration"),
    )
    .show()
)

In [15]:
+--------+--------------+------+-----------------------+-------------------+---------+
|id      |  date        |group |start_time             | end_time          | duration|
+--------+--------------+------+-----------------------+-------------------+---------+
|    78aa|  2020-04-14  | 3    |    2020-04-14 19:00:00|2020-04-14 19:55:00|55       |
|    25aa|  2020-04-15  | 7    |    2020-04-15 08:00:00|2020-04-15 08:02:59|3        |
|    25aa|  2020-04-15  | 7    |    2020-04-15 11:03:00|2020-04-15 11:47:00|44       |
+--------+--------------+------+-----------------------+-------------------+---------+

In [16]:
 list=[[   'B'   ,     'c'   ],
      ['K'   ,    'G'    ],
      ['g'   ,    'A'    ],
      [None  ,   None]]
df=spark.createDataFrame(list,['A','B'])

df.show()

In [17]:
# Put A and B into an array, sort it ascending, and keep the first element as AB.
(
    df
    .select(F.sort_array(F.array("A", "B"))[0].alias("AB"))
    .show()
)

In [18]:
from pyspark.sql.functions import *

# AB is A when A < B, otherwise B. When the comparison is null (either side
# is null) the condition does not match, so the otherwise branch (B) is used.
# Function names are qualified with `F.` like the other cells in this notebook.
(
    df
    .withColumn(
        "AB",
        F.when(F.col("A") < F.col("B"), F.col("A")).otherwise(F.col("B")),
    )
    .show()
)

In [19]:
# Sample rows: a single text column holding 'yyyy-MM-dd' dates.
# Named `rows` rather than `list` so the builtin `list` type is not shadowed.
rows = [
    ["2019-08-01"],
    ["2019-09-01"],
]

df = spark.createDataFrame(rows, ["col1"])

# Convert the text column to a date column; printSchema shows the resulting type.
df1 = df.withColumn("col1", F.to_date("col1"))

df1.printSchema()

In [20]:
# Largest value of col1, returned as the first Row of the collected result.
(
    df1
    .select(F.max("col1"))
    .collect()[0]
)