Some Resources:
- paper explaining some of the variables on pg 16.3: [link](https://ijssst.info/Vol-16/No-5/paper16.pdf)
- random assignment based on this dataset, might give us some good questions to ask: [link](http://cse.ucdenver.edu/~biswasa/ml-s18/files/assignments/assignment-01/Programming-Assignment-1.pdf)

**data import and preprocessing**

In [1]:
# import modules
import pandas as pd
from pyspark.sql import SparkSession

In [2]:
# build spark session and spark context
spark = SparkSession.builder \
        .appName("fb") \
        .getOrCreate()
sc = spark.sparkContext

In [39]:
#Need to change path when running
train_path='/home/jovyan/Project/Training/'

training_file=['Features_Variant_1.csv','Features_Variant_2.csv','Features_Variant_3.csv','Features_Variant_4.csv','Features_Variant_5.csv']

fb = spark.read.csv(train_path+training_file[0])


In [6]:
fb.show(1)

+------+---+---+---+---+-----+------------------+---+-----------------+---+-----+-----------------+----+---------------+----+----+------------------+----+----------------+----+-----+------------------+----+-----------------+-----+-----+-----------------+----+-----------------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|   _c0|_c1|_c2|_c3|_c4|  _c5|               _c6|_c7|              _c8|_c9| _c10|             _c11|_c12|           _c13|_c14|_c15|              _c16|_c17|            _c18|_c19| _c20|              _c21|_c22|             _c23| _c24| _c25|             _c26|_c27|             _c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|_c41|_c42|_c43|_c44|_c45|_c46|_c47|_c48|_c49|_c50|_c51|_c52|_c53|
+------+---+---+---+---+-----+------------------+---+-----------------+---+-----+-----------------+----+---------------+----+----+------------------+----+----------------+----+-----+------------

In [8]:
# rename columns 
df = (fb.withColumnRenamed('_c0', 'PageLikes')              # page likes
             .withColumnRenamed('_c1', 'PageCheckin')       # page's "Check-in" count
             .withColumnRenamed('_c2', 'PageTalking')       # page's "Talking About" count
             .withColumnRenamed('_c3', 'PageCategory')      # page category
             
            # C1: total comment count before selected base date/time
             .withColumnRenamed('_c4', 'C1min')             # min of C1
             .withColumnRenamed('_c5', 'C1max')             # max of C1
             .withColumnRenamed('_c6', 'C1avg')             # avg of C1
             .withColumnRenamed('_c7', 'C1med')             # median of C1
             .withColumnRenamed('_c8', 'C1std')             # standard deviation of C1
      
            # C2: comment count in last 24 hrs w.r.t base date/time
             .withColumnRenamed('_c9', 'C2min')             # min of C2
             .withColumnRenamed('_c10', 'C2max')            # max of C2
             .withColumnRenamed('_c11', 'C2avg')            # avg of C2
             .withColumnRenamed('_c12', 'C2med')            # median of C2
             .withColumnRenamed('_c13', 'C2std')            # standard deviation of C2
    
            # C3: comment count from last 48 to last 24 hrs w.r.t base date/time
             .withColumnRenamed('_c14', 'C3min')            # min of C3
             .withColumnRenamed('_c15', 'C3max')            # max of C3
             .withColumnRenamed('_c16', 'C3avg')            # avg of C3
             .withColumnRenamed('_c17', 'C3med')            # median of C3
             .withColumnRenamed('_c18', 'C3std')            # standard deviation of C3

            # C4: comment count in first 24 hrs after publishing document but before base date/time
             .withColumnRenamed('_c19', 'C4min')            # min of C4
             .withColumnRenamed('_c20', 'C4max')            # max of C4
             .withColumnRenamed('_c21', 'C4avg')            # avg of C4
             .withColumnRenamed('_c22', 'C4med')            # median of C4
             .withColumnRenamed('_c23', 'C4std')            # standard deviation of C4
      
            # C5: difference between C2 and C3
             .withColumnRenamed('_c24', 'C5min')            # min of C5
             .withColumnRenamed('_c25', 'C5max')            # max of C5
             .withColumnRenamed('_c26', 'C5avg')            # avg of C5
             .withColumnRenamed('_c27', 'C5med')            # median of C5
             .withColumnRenamed('_c28', 'C5std')            # standard deviation of C5
      
            # Other features
             .withColumnRenamed('_c29', 'CC1')              # total number of comments before selected base date/time
             .withColumnRenamed('_c30', 'CC2')              # number of comments in last 24 hours relative to base date/time
             .withColumnRenamed('_c31', 'CC3')              # number of comments in last 48 hours to last 24 hours relative to base date/time
             .withColumnRenamed('_c32', 'CC4')              # number of comments in the first 24 hours after the publication of post but before base date/time
             .withColumnRenamed('_c33', 'CC5')              # difference between CC2 and CC3
             .withColumnRenamed('_c34', 'BaseTime')         # selected time in order to simulate scenario (this is a decimal between 0-71, not sure what the number specifically means)
             .withColumnRenamed('_c35', 'PostLength')       # character count in post
             .withColumnRenamed('_c36', 'PostShare')        # number of shares of the post, how many people shared this post to their timeline
             .withColumnRenamed('_c37', 'PagePromote')      # if the original poster "promoted" this post
             .withColumnRenamed('_c38', 'HLocal')           # describes the hours past for target variable
             
            # Binary variables that represent the day(Sun-Sat) on which the post was published
             .withColumnRenamed('_c39', 'PostSun')         
             .withColumnRenamed('_c40', 'PostMon')          
             .withColumnRenamed('_c41', 'PostTue') 
             .withColumnRenamed('_c42', 'PostWed') 
             .withColumnRenamed('_c43', 'PostThu') 
             .withColumnRenamed('_c44', 'PostFri')   
             .withColumnRenamed('_c45', 'PostSat') 
      
            # Binary variables that represent the day(Sun-Sat) of selected base Date/Time
             .withColumnRenamed('_c46', 'BaseSun')          
             .withColumnRenamed('_c47', 'BaseMon') 
             .withColumnRenamed('_c48', 'BaseTue')   
             .withColumnRenamed('_c49', 'BaseWed') 
             .withColumnRenamed('_c50', 'BaseThu')   
             .withColumnRenamed('_c51', 'BaseFri')
             .withColumnRenamed('_c52', 'BaseSat') 
      
            # Target Variable: number of comments in next H hrs after comment was posted
             .withColumnRenamed('_c53', 'Comments'))       


In [26]:
# cast data types to numerical values, save as new df named "df_casted"
df_casted = df.select(
                df.PageLikes.cast('float'),
                df.PageCheckin.cast('float'),
                df.PageTalking.cast('float'),
                df.PageCategory.cast('int'),  # number corresponds to categories in pdf
                df.C1min.cast('float'),
                df.C1max.cast('float'),
                df.C1avg.cast('float'),
                df.C1med.cast('float'),
                df.C1std.cast('float'),
                df.C2min.cast('float'),
                df.C2max.cast('float'),
                df.C2avg.cast('float'),
                df.C2med.cast('float'),
                df.C2std.cast('float'),
                df.C3min.cast('float'),
                df.C3max.cast('float'),
                df.C3avg.cast('float'),
                df.C3med.cast('float'),
                df.C3std.cast('float'),
                df.C4min.cast('float'),
                df.C4max.cast('float'),
                df.C4avg.cast('float'),
                df.C4med.cast('float'),
                df.C4std.cast('float'),
                df.C5min.cast('float'),
                df.C5max.cast('float'),
                df.C5avg.cast('float'),
                df.C5med.cast('float'),
                df.C5std.cast('float'),
                df.CC1.cast('float'),
                df.CC2.cast('float'),
                df.CC3.cast('float'),
                df.CC4.cast('float'),
                df.CC5.cast('float'),
                df.BaseTime.cast('float'),
                df.PostLength.cast('float'),
                df.PostShare.cast('float'),
                df.PagePromote.cast('int'),  # integer encoded
                df.HLocal.cast('float'),
                df.PostSun.cast('int'),
                df.PostMon.cast('int'),
                df.PostTue.cast('int'),
                df.PostWed.cast('int'),
                df.PostThu.cast('int'),
                df.PostFri.cast('int'),
                df.PostSat.cast('int'),
                df.BaseSun.cast('int'),
                df.BaseMon.cast('int'),
                df.BaseTue.cast('int'),
                df.BaseThu.cast('int'),
                df.BaseFri.cast('int'),
                df.BaseSat.cast('int'),
                df.Comments.cast('float')
)

In [27]:
df_casted.printSchema()

root
 |-- PageLikes: float (nullable = true)
 |-- PageCheckin: float (nullable = true)
 |-- PageTalking: float (nullable = true)
 |-- PageCategory: integer (nullable = true)
 |-- C1min: float (nullable = true)
 |-- C1max: float (nullable = true)
 |-- C1avg: float (nullable = true)
 |-- C1med: float (nullable = true)
 |-- C1std: float (nullable = true)
 |-- C2min: float (nullable = true)
 |-- C2max: float (nullable = true)
 |-- C2avg: float (nullable = true)
 |-- C2med: float (nullable = true)
 |-- C2std: float (nullable = true)
 |-- C3min: float (nullable = true)
 |-- C3max: float (nullable = true)
 |-- C3avg: float (nullable = true)
 |-- C3med: float (nullable = true)
 |-- C3std: float (nullable = true)
 |-- C4min: float (nullable = true)
 |-- C4max: float (nullable = true)
 |-- C4avg: float (nullable = true)
 |-- C4med: float (nullable = true)
 |-- C4std: float (nullable = true)
 |-- C5min: float (nullable = true)
 |-- C5max: float (nullable = true)
 |-- C5avg: float (nullable = true

In [28]:
df_casted.show(5)

+---------+-----------+-----------+------------+-----+-----+---------+-----+--------+-----+-----+--------+-----+--------+-----+-----+---------+-----+--------+-----+-----+---------+-----+--------+-----+-----+--------+-----+--------+---+---+---+---+----+--------+----------+---------+-----------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+
|PageLikes|PageCheckin|PageTalking|PageCategory|C1min|C1max|    C1avg|C1med|   C1std|C2min|C2max|   C2avg|C2med|   C2std|C3min|C3max|    C3avg|C3med|   C3std|C4min|C4max|    C4avg|C4med|   C4std|C5min|C5max|   C5avg|C5med|   C5std|CC1|CC2|CC3|CC4| CC5|BaseTime|PostLength|PostShare|PagePromote|HLocal|PostSun|PostMon|PostTue|PostWed|PostThu|PostFri|PostSat|BaseSun|BaseMon|BaseTue|BaseThu|BaseFri|BaseSat|Comments|
+---------+-----------+-----------+------------+-----+-----+---------+-----+--------+-----+-----+--------+-----+--------+-----+-----+---------+-----+--------+-----+-----+

One-hot encode PageCategory column

In [29]:
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel


In [36]:
ohe1 = OneHotEncoder()
ohe1.setInputCols(["PageCategory"])
ohe1.setOutputCols(["PageCategory_onehot"])
model1 = ohe1.fit(df_casted)
model1.setOutputCols(["PageCategory_onehot"])
df_oh=model1.transform(df_casted)

In [38]:
df_oh.show(1)

+---------+-----------+-----------+------------+-----+-----+---------+-----+--------+-----+-----+--------+-----+--------+-----+-----+---------+-----+--------+-----+-----+---------+-----+--------+-----+-----+--------+-----+--------+---+---+---+---+---+--------+----------+---------+-----------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+-------------------+
|PageLikes|PageCheckin|PageTalking|PageCategory|C1min|C1max|    C1avg|C1med|   C1std|C2min|C2max|   C2avg|C2med|   C2std|C3min|C3max|    C3avg|C3med|   C3std|C4min|C4max|    C4avg|C4med|   C4std|C5min|C5max|   C5avg|C5med|   C5std|CC1|CC2|CC3|CC4|CC5|BaseTime|PostLength|PostShare|PagePromote|HLocal|PostSun|PostMon|PostTue|PostWed|PostThu|PostFri|PostSat|BaseSun|BaseMon|BaseTue|BaseThu|BaseFri|BaseSat|Comments|PageCategory_onehot|
+---------+-----------+-----------+------------+-----+-----+---------+-----+--------+-----+-----+--------+-----+--------+-----+-----