In [1]:
import pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame

from functools import partial

In [3]:
spark = SparkSession.Builder().appName('test').master("local[*]").getOrCreate()

In [4]:
schema = StructType([
         StructField('date', TimestampType(), True),
         StructField('event', StringType(), True),
         StructField('os_family', StringType(), True),
         StructField('Num', IntegerType(), True)])

df = spark.read\
     .schema(schema)\
     .csv('os_family*')

In [5]:
df.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- event: string (nullable = true)
 |-- os_family: string (nullable = true)
 |-- Num: integer (nullable = true)



In [5]:
df.rdd.getNumPartitions()

2

In [6]:
df.sample(0.1, False).show()

+-------------------+-----+-------------+-------+
|               date|event|    os_family|    Num|
+-------------------+-----+-------------+-------+
|2017-11-01 00:00:00| 0042|          VRE|      7|
|2017-10-01 00:00:00| 0042|        Other|   5816|
|2018-01-01 00:00:00| 0042|     Mac OS X|  12369|
|2017-11-01 00:00:00| 0550|      Android|5331398|
|2017-06-01 00:00:00| 0042|        Tizen|      2|
|2017-07-01 00:00:00| 0042|        Other|   6293|
|2017-10-01 00:00:00| 0042|       Fedora|     24|
|2017-07-01 00:00:00| 0042|        Linux|   6566|
|2017-12-01 00:00:00| 0550|      Android|5278654|
|2017-07-01 00:00:00| 0042|     Mac OS X|  13289|
|2017-06-01 00:00:00| 0042|     Mac OS X|  11477|
|2017-07-01 00:00:00| 1002|          iOS|4251549|
|2017-07-01 00:00:00| 1002|        Other|      3|
|2018-07-01 00:00:00| 0042|BlackBerry OS|     12|
|2018-09-01 00:00:00| 0042|     Mac OS X|   8853|
|2018-12-01 00:00:00| 0042|     Mac OS X|   8054|
|2018-11-01 00:00:00| 0042|    Symbian^3|      3|


In [7]:
def transform(self, f):
    return f(self)

DataFrame.transform = transform

In [8]:
def filterfn(df, dt,operator):
    if operator == '>':
        df =df.filter(col('date')>dt)\
          .withColumnRenamed('Num', 'Num_18')\
          .withColumnRenamed('os_family', 'os_family_18')\
          .withColumnRenamed('event', 'event_18')\
          .withColumnRenamed('date', 'date_18')
        return df
    else:
        df = df.filter(col('date')<dt)\
           .withColumnRenamed('Num', 'Num_17')\
           .withColumnRenamed('os_family', 'os_family_17')\
          .withColumnRenamed('event', 'event_17')\
           .withColumnRenamed('date', 'date_17')
        return df
        

def filter17(df, dt):
    df = df.filter(col('date')<dt)\
           .withColumnRenamed('Num', 'Num_17')\
           .withColumnRenamed('os_family', 'os_family_17')\
          .withColumnRenamed('event', 'event_17')\
           .withColumnRenamed('date', 'date_17')
    return df

def _insometh(df):
    df =df.where((col('os_family')=='Android') | (col('os_family')=='iOS'))\
        .sort(col('date'), col('event'), col('os_family'))
    return df

In [17]:
df_18 = df.transform(lambda df: filterfn(df, lit('2018-01-01 00:00:00'), '>'))
df_18 =df_18.where((col('os_family')=='Android') | (col('os_family')=='iOS')).sort(col('date'), col('event'), col('os_family'))

+-------------------+--------+------------+--------+
|            date_18|event_18|os_family_18|  Num_18|
+-------------------+--------+------------+--------+
|2018-06-01 00:00:00|    0042|     Android| 8707778|
|2018-06-01 00:00:00|    0042|         iOS|13134720|
|2018-06-01 00:00:00|    0550|     Android| 5013732|
|2018-06-01 00:00:00|    0550|         iOS| 8534062|
|2018-06-01 00:00:00|    1002|     Android| 2089562|
|2018-06-01 00:00:00|    1002|         iOS| 8978175|
|2018-07-01 00:00:00|    0042|     Android| 9484852|
|2018-07-01 00:00:00|    0042|         iOS|14067372|
|2018-07-01 00:00:00|    0550|     Android| 5750525|
|2018-07-01 00:00:00|    0550|         iOS| 9480238|
|2018-07-01 00:00:00|    1002|     Android| 2488602|
|2018-07-01 00:00:00|    1002|         iOS|10069556|
|2018-08-01 00:00:00|    0042|     Android| 9412543|
|2018-08-01 00:00:00|    0042|         iOS|14038098|
|2018-08-01 00:00:00|    0550|     Android| 5905011|
|2018-08-01 00:00:00|    0550|         iOS| 96

In [30]:
df_17 = df.transform(lambda df: filterfn(df, lit('2018-01-01 00:00:00'), "<"))
df_17_a = df_17.where((col('os_family')=='Android') | (col('os_family')=='iOS')).sort(col('date'), col('event'), col('os_family'))

In [24]:
df_17.show()

+-------------------+--------+------------+--------+
|            date_17|event_17|os_family_17|  Num_17|
+-------------------+--------+------------+--------+
|2017-06-01 00:00:00|    0042|     Android| 7327201|
|2017-06-01 00:00:00|    0042|         iOS|11899555|
|2017-06-01 00:00:00|    0550|     Android| 5377300|
|2017-06-01 00:00:00|    0550|         iOS| 9498861|
|2017-06-01 00:00:00|    1002|     Android|  481294|
|2017-06-01 00:00:00|    1002|         iOS| 3697954|
|2017-07-01 00:00:00|    0042|     Android| 8309729|
|2017-07-01 00:00:00|    0042|         iOS|13216455|
|2017-07-01 00:00:00|    0550|     Android| 5928348|
|2017-07-01 00:00:00|    0550|         iOS|10433398|
|2017-07-01 00:00:00|    1002|     Android|  619791|
|2017-07-01 00:00:00|    1002|         iOS| 4251549|
|2017-08-01 00:00:00|    0042|     Android| 8406219|
|2017-08-01 00:00:00|    0042|         iOS|13281837|
|2017-08-01 00:00:00|    0550|     Android| 5845129|
|2017-08-01 00:00:00|    0550|         iOS|102

In [31]:
df_new = df_17.join(df_18, "id","inner")\
              .drop("id")\
              .withColumn("change", col('Num_18')-col('Num_17'))\
              .withColumn("%change", round((col("change")/col("Num_17")),2))

df_new.select('date_18','date_17','event_18','event_17','os_family_18','os_family_17','Num_18','Num_17','change', '%change')\
.show(10)

In [31]:
df_18_os_family = (df.transform(lambda df: filterfn(df, lit('2018-01-01 00:00:00'),">")))\
             .select('os_family_18').distinct()
df_17_os_family = (df.transform(lambda df: filterfn(df, lit('2018-01-01 00:00:00'), "<")))\
              .select('os_family_17').distinct()

df_18_os_family_g = (df.transform(lambda df: filterfn(df, lit('2018-01-01 00:00:00'), ">")))\
             .groupby('os_family_18').sum(col('Num_18'))
df_17_os_family_g = (df.transform(lambda df: filterfn(df, lit('2018-01-01 00:00:00'), "<")))\
               .groupby('os_family_18').sum(col('Num_17'))

In [33]:
df_17_os_family.show()

+--------------------+
|        os_family_17|
+--------------------+
|          Symbian OS|
|                 iOS|
|BlackBerry Tablet OS|
|               Linux|
|           Symbian^3|
|               MeeGo|
|               Other|
|              Fedora|
|             OpenBSD|
|               Maemo|
|           Chrome OS|
|       BlackBerry OS|
|              Ubuntu|
|              NetBSD|
|       Windows Phone|
|          Firefox OS|
|             FreeBSD|
|               Tizen|
|             Android|
|            Mac OS X|
+--------------------+
only showing top 20 rows



In [34]:
df_18_os_family.show()

+--------------------+
|        os_family_18|
+--------------------+
|          Symbian OS|
|                 iOS|
|BlackBerry Tablet OS|
|          Windows CE|
|                BREW|
|               Linux|
|           Symbian^3|
|               MeeGo|
|               Other|
|              Fedora|
|             OpenBSD|
|               Maemo|
|           Chrome OS|
|       BlackBerry OS|
|              Ubuntu|
|          Chromecast|
|       Windows Phone|
|          Firefox OS|
|             FreeBSD|
|               Tizen|
+--------------------+
only showing top 20 rows



In [37]:
df_18_os_family.exceptAll(df_17_os_family).show()

+------------+
|os_family_18|
+------------+
|  Windows CE|
|        BREW|
|  Chromecast|
+------------+



In [38]:
df_17_os_family.exceptAll(df_18_os_family).show()

+--------------+
|  os_family_17|
+--------------+
|        NetBSD|
|Windows NT 4.0|
+--------------+



In [83]:
df_new.show()

+-------------------+-----+---------+--------+-------------------+-----+---------+--------+
|               date|event|os_family|     Num|               date|event|os_family|     Num|
+-------------------+-----+---------+--------+-------------------+-----+---------+--------+
|2017-08-01 00:00:00| 0550|      iOS|10291916|2018-08-01 00:00:00| 0550|      iOS| 9685755|
|2017-06-01 00:00:00| 0042|      iOS|11899555|2018-06-01 00:00:00| 0042|      iOS|13134720|
|2017-06-01 00:00:00| 0042|  Android| 7327201|2018-06-01 00:00:00| 0042|  Android| 8707778|
|               null| null|     null|    null|2019-01-01 00:00:00| 0042|  Android| 9068509|
|2017-09-01 00:00:00| 1002|  Android| 1002113|2018-09-01 00:00:00| 1002|  Android| 2582013|
|2017-10-01 00:00:00| 0550|  Android| 5368162|2018-10-01 00:00:00| 0550|  Android| 6168170|
|2017-09-01 00:00:00| 0042|      iOS|12152338|2018-09-01 00:00:00| 0042|      iOS|12930658|
|2017-08-01 00:00:00| 1002|      iOS| 5024174|2018-08-01 00:00:00| 1002|      iO

In [84]:
df_new.printSchem

root
 |-- date: timestamp (nullable = true)
 |-- event: string (nullable = true)
 |-- os_family: string (nullable = true)
 |-- Num: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- event: string (nullable = true)
 |-- os_family: string (nullable = true)
 |-- Num: integer (nullable = true)

