In [None]:
!pip install pandas
!pip install pyspark
!pip install pyarrow

In [None]:
from pyspark.sql import SparkSession
 
# Obtain a Spark session (an already-running one is reused) registered
# under the application name "pandas to spark".
spark = (
    SparkSession.builder
    .appName("pandas to spark")
    .getOrCreate()
)

# Enable Arrow for the pandas <-> Spark DataFrame conversions done below.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [3]:
import pandas as pd

# Sample rows, one per recorded event:
# [player_id, device_id, event_date, games_played]
data = [
    [1, 2, '2016-03-01', 5],
    [1, 2, '2016-03-02', 6],
    [2, 3, '2017-06-25', 1],
    [3, 1, '2016-03-02', 0],
    [3, 4, '2018-07-03', 5],
]

# Column name -> pandas dtype. Integer columns use the nullable Int64 dtype
# and the date strings are parsed into nanosecond-resolution timestamps.
activity_dtypes = {
    'player_id': 'Int64',
    'device_id': 'Int64',
    'event_date': 'datetime64[ns]',
    'games_played': 'Int64',
}

activity = pd.DataFrame(data, columns=list(activity_dtypes)).astype(activity_dtypes)

In [4]:
# Convert the pandas frame into a Spark DataFrame, keeping the name `activity`
# because the following cell reads it as a Spark DataFrame.
# The conversion is guarded so the cell can be re-run safely: after the first
# run `activity` is already a Spark DataFrame, and passing that back to
# spark.createDataFrame raises an error instead of converting again.
if isinstance(activity, pd.DataFrame):
    activity = spark.createDataFrame(activity)

# Print the rows to confirm the data arrived in Spark.
activity.show()

+---------+---------+-------------------+------------+
|player_id|device_id|         event_date|games_played|
+---------+---------+-------------------+------------+
|        1|        2|2016-03-01 00:00:00|           5|
|        1|        2|2016-03-02 00:00:00|           6|
|        2|        3|2017-06-25 00:00:00|           1|
|        3|        1|2016-03-02 00:00:00|           0|
|        3|        4|2018-07-03 00:00:00|           5|
+---------+---------+-------------------+------------+



In [13]:
from pyspark.sql.functions import lead, col, row_number, date_diff, when, sum, count, round
from pyspark.sql.window import Window

# Fraction of players whose event right after their first one happened
# exactly one day later, rounded to two decimals.
# NOTE: `sum` and `round` here are the pyspark.sql.functions versions imported
# at the top of this cell, not the Python builtins.

# Each player's events in chronological order.
events_per_player = Window.partitionBy('player_id').orderBy('event_date')

# Attach to every event the date of the player's following event, the event's
# position in the player's timeline, and the day gap to the following event.
sequenced_events = (
    activity
    .withColumn('next_event_date', lead(col('event_date'), 1).over(events_per_player))
    .withColumn('row_number', row_number().over(events_per_player))
    .withColumn('difference', date_diff(col('next_event_date'), col('event_date')))
)

# Keep only the first event of each player (one row per player).
first_events = sequenced_events.where('row_number == 1')

# 1 when the following event is exactly one day later, otherwise 0
# (this includes players that have no following event at all).
returned_next_day = when(col('difference') == 1, 1).otherwise(0)

(
    first_events
    # Every remaining row has row_number == 1, so this is a single group.
    .groupby('row_number')
    .agg(
        sum(returned_next_day).alias('no_of_players'),
        count(col('player_id')).alias('total_no_of_players'),
    )
    .withColumn('fraction', round(col('no_of_players') / col('total_no_of_players'), 2))
    .select('fraction')
    .show()
)


+--------+
|fraction|
+--------+
|    0.33|
+--------+

