In [1]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *

In [2]:
# Sample rows: one float value followed by two string labels.
# Bound to `rows` rather than `list` so the builtin `list` type is not
# shadowed for the rest of the kernel session.
rows = [
    [324.456, 'hi', 'test'],
    [453.987, 'hello', 'python'],
    [768.66, 'test', 'java'],
]

df = spark.createDataFrame(rows, ['col1', 'col2', 'col3'])

# Print the column types Spark inferred from the Python values.
df.printSchema()

In [3]:
# Keep col1 up to two digits past the decimal point: instr() gives the position
# of the '.', and +2 extends the substring length over the two digits after it.
# The previous expression had unbalanced parentheses and passed a single
# argument to instr(), so it could not be parsed.
df.withColumn("col4", F.expr("""substring(col1, 0, instr(col1, '.') + 2)""")).show()

In [4]:
# Extract the leading digits, the decimal point, and exactly two digits after it.
# The pattern is a raw string so that `\d` is handed to the regex engine as-is
# instead of being parsed by Python as an (invalid) string escape sequence.
df.withColumn('test', F.regexp_extract("col1", r'\d+[.]\d{2}', 0)).show()

In [5]:
# When length(col1) is exactly 7, drop the last character of col1;
# every other value is passed through unchanged.
df.withColumn(
    "col4",
    F.expr("""IF(length(col1)==7, substring(col1,0,length(col1)-1),col1)"""),
).show()

In [6]:
col1     col2   col3   col4
324.456  hi     test   324.45
453.987  hello  python 453.98
768.66   test   java   768.66

In [7]:
# Each row is a time label paired with the list of messages seen at that time.
# Bound to `message_rows` rather than `list` so the builtin `list` type is not
# shadowed for the rest of the kernel session.
message_rows = [
    ['t1', ['m1']],
    ['t3', ['m1', 'm2']],
    ['t4', ['m1', 'm2']],
    ['t6', ['m2']],
    ['t7', ['m3']],
    ['t8', ['m3']],
    ['t9', ['m1']],
]

df = spark.createDataFrame(message_rows, ['time', 'message'])

df.show()

In [8]:
# Window partitioned by message; it is defined here but the expression below
# does not reference it.
w = Window().partitionBy("message")

# Attach a generated id to every row, then emit one output row per element of
# the `message` array.
(
    df
    .withColumn("mono", F.monotonically_increasing_id())
    .withColumn("message", F.explode("message"))
    .show()
)
  

In [9]:
+--------+-------+-----+
| message| start | end |
+--------+-------+-----+
|   m1   |  t1   |  t4 |
|   m2   |  t3   |  t6 |
|   m3   |  t7   |  t8 |
|   m1   |  t9   |  t9 |
+--------+-------+-----+

In [10]:
# Rows of (id, calendarday, last_monday, indexCP); dates are kept as
# 'yyyy-MM-dd' strings.
# Bound to `index_rows` rather than `list` so the builtin `list` type is not
# shadowed for the rest of the kernel session.
index_rows = [
    [1, '2015-01-05', '2015-01-05', 0.0076],
    [1, '2015-01-06', '2015-01-05', 0.0026],
    [1, '2015-01-07', '2015-01-05', 0.0016],
    [1, '2015-01-08', '2015-01-05', 0.0006],
    [2, '2015-01-09', '2015-01-05', 0.0012],
    [2, '2015-01-10', '2015-01-05', 0.0014],
    [1, '2015-01-12', '2015-01-12', 0.0026],
    [1, '2015-01-13', '2015-01-12', 0.0086],
    [1, '2015-01-14', '2015-01-12', 0.0046],
    [1, '2015-01-15', '2015-01-12', 0.0021],
    [2, '2015-01-16', '2015-01-12', 0.0042],
    [2, '2015-01-17', '2015-01-12', 0.0099],
]

df = spark.createDataFrame(index_rows, ['id', 'calendarday', 'last_monday', 'indexCP'])

df.show()

In [11]:
# One partition per last_monday value, ordered by the parsed calendar date,
# with a row frame running from the start of the partition to the current row.
w = (
    Window()
    .partitionBy("last_monday")
    .orderBy(F.to_date("calendarday", "yyyy-MM-dd"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# first() over that frame repeats the indexCP of the earliest date in each
# partition on every row of the partition.
df.withColumn("PreviousYearUnique", F.first("indexCP").over(w)).show()

#+---+-----------+-----------+-------+------------------+
#| id|calendarday|last_monday|indexCP|PreviousYearUnique|
#+---+-----------+-----------+-------+------------------+
#|  1| 2015-01-05| 2015-01-05| 0.0076|            0.0076|
#|  1| 2015-01-06| 2015-01-05| 0.0026|            0.0076|
#|  1| 2015-01-07| 2015-01-05| 0.0016|            0.0076|
#|  1| 2015-01-08| 2015-01-05| 6.0E-4|            0.0076|
#|  2| 2015-01-09| 2015-01-05| 0.0012|            0.0076|
#|  2| 2015-01-10| 2015-01-05| 0.0014|            0.0076|
#|  1| 2015-01-12| 2015-01-12| 0.0026|            0.0026|
#|  1| 2015-01-13| 2015-01-12| 0.0086|            0.0026|
#|  1| 2015-01-14| 2015-01-12| 0.0046|            0.0026|
#|  1| 2015-01-15| 2015-01-12| 0.0021|            0.0026|
#|  2| 2015-01-16| 2015-01-12| 0.0042|            0.0026|
#|  2| 2015-01-17| 2015-01-12| 0.0099|            0.0026|
#+---+-----------+-----------+-------+------------------+

In [12]:
# Event log without timestamps: (session_id, event) pairs.
df = spark.createDataFrame(
    [
        [1, 'foo'], [1, 'bar'], [1, 'foo'], [1, 'foo'],
        [2, 'bar'], [2, 'foo'], [2, 'bar'], [2, 'foo'],
        [3, 'foo'],
    ],
    schema=['session_id', 'event'],
)
df.show()

In [13]:
# Event log with timestamps: (session_id, timestamp string, event) triples,
# one row per line.
df = spark.createDataFrame(
    [
        [1, '2020-01-01 12:30:00.000', 'foo'],
        [1, '2020-01-01 12:31:00.000', 'bar'],
        [1, '2020-01-01 12:32:00.000', 'foo'],
        [1, '2020-01-01 12:33:00.000', 'foo'],
        [2, '2020-01-01 13:00:00.000', 'bar'],
        [2, '2020-01-01 13:01:00.000', 'foo'],
        [2, '2020-01-01 13:02:00.000', 'bar'],
        [2, '2020-01-01 13:03:00.000', 'foo'],
    ],
    schema=['session_id', 'timestamp', 'event'],
)
# truncate=False keeps the full timestamp strings visible.
df.show(truncate=False)

In [14]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# w  : global ordering by timestamp, used for the running count of 'bar' events.
# w2 : per-session ordering, used to number the rows inside each session.
# w3 : per-session, unordered, used to get the row count of each session.
w = Window().orderBy("timestamp")
w2 = Window().partitionBy("session_id").orderBy("timestamp")
w3 = Window().partitionBy("session_id")

(
    df
    .withColumn("timestamp", F.to_timestamp("timestamp", 'yyyy-MM-dd HH:mm:ss.SSS'))
    # Replace session_id with a running count of 'bar' events, so every 'bar'
    # opens a new session and the rows before the first 'bar' land in session 0.
    .withColumn(
        "session_id",
        F.sum(F.when(F.col("event") == 'bar', F.lit(1)).otherwise(F.lit(0))).over(w),
    )
    .withColumn("rowNum", F.row_number().over(w2))
    # The highest row number in a session is that session's row count.
    .withColumn("max", F.max("rowNum").over(w3))
    # Flag rows that are the first row of their session and are a 'foo' event.
    .withColumn(
        "first",
        F.when((F.col("rowNum") == 1) & (F.col("event") == 'foo'), F.lit(1)).otherwise(F.lit(0)),
    )
    # Keep sessions with at least two rows, minus any flagged leading 'foo' row.
    .filter('max>=2 and first=0')
    # Drop only the helper columns created above; the previous list also named
    # 'sample_timestamp', which this pipeline never creates.
    .drop('rowNum', 'max', 'first')
    .show()
)

#+----------+-------------------+-----+
#|session_id|          timestamp|event|
#+----------+-------------------+-----+
#|         1|2020-01-01 12:31:00|  bar|
#|         1|2020-01-01 12:32:00|  foo|
#|         1|2020-01-01 12:33:00|  foo|
#|         2|2020-01-01 13:00:00|  bar|
#|         2|2020-01-01 13:01:00|  foo|
#|         3|2020-01-01 13:02:00|  bar|
#|         3|2020-01-01 13:03:00|  foo|
#+----------+-------------------+-----+



In [15]:
+----------+-----+
|session_id|event|
+----------+-----+
|         1|  bar|
|         1|  foo|
|         1|  foo|
|         2|  bar|
|         2|  foo|
|         3|  bar|
|         3|  foo|
+----------+-----+