In [56]:
import pandas as pd
import numpy as np
import pyspark as ps
import pyspark.sql.functions as f

In [57]:
## start (or reuse) a local Spark session registered as "user-pipeline"
spark = ps.sql.SparkSession.builder \
    .master("local") \
    .appName("user-pipeline") \
    .getOrCreate()

## keep a handle on the underlying SparkContext and set its log level to WARN
sc = spark.sparkContext
sc.setLogLevel("WARN")

In [58]:
## loading responses with survey data into a spark dataframe
## only a fraction (0.1) of the rows is kept, sampled without replacement
path = "../data/SharedResponsesSurvey_10000.csv"

## fixed seed: re-running the notebook on a fresh kernel must yield the same
## sample, otherwise every downstream output changes from run to run
SAMPLE_SEED = 42

responses = spark.read.csv(path, header=True).sample(
    withReplacement=False,
    fraction=0.1,
    seed=SAMPLE_SEED,
)

In [59]:
## list the column names of the sampled `responses` dataframe
responses.columns

['ResponseID',
 'ExtendedSessionID',
 'UserID',
 'ScenarioOrder',
 'Intervention',
 'PedPed',
 'Barrier',
 'CrossingSignal',
 'AttributeLevel',
 'ScenarioTypeStrict',
 'ScenarioType',
 'DefaultChoice',
 'NonDefaultChoice',
 'DefaultChoiceIsOmission',
 'NumberOfCharacters',
 'DiffNumberOFCharacters',
 'Saved',
 'Template',
 'DescriptionShown',
 'LeftHand',
 'UserCountry3',
 'Review_age',
 'Review_education',
 'Review_gender',
 'Review_income',
 'Review_political',
 'Review_religious']

In [60]:
## some EDA stuff
## for users whose UserCountry3 is 'USA': one row per distinct combination of
## UserID + demographic answers, with the number of sampled rows in that group
user_profile_cols = [
    "UserID", "UserCountry3",
    "Review_age", "Review_education",
    "Review_gender", "Review_income",
    "Review_political", "Review_religious",
]

usa_rows = responses.select(user_profile_cols).filter("UserCountry3 = 'USA' ")
us_users = usa_rows.groupby(user_profile_cols).agg({"UserID": "count"})

us_users.show(20)

+----------------+------------+----------+----------------+-------------+-------------+----------------+----------------+-------------+
|          UserID|UserCountry3|Review_age|Review_education|Review_gender|Review_income|Review_political|Review_religious|count(UserID)|
+----------------+------------+----------+----------------+-------------+-------------+----------------+----------------+-------------+
| 809483690453245|         USA|        16|       underHigh|       female|      default|             0.5|             0.5|            1|
|9844649455077370|         USA|        15|          others|         male|      default|             0.5|            0.71|            1|
| 172458103030324|         USA|        16|       underHigh|         male|    under5000|             0.1|            0.76|            2|
|5507122359435470|         USA|        36|        bachelor|       female|  above100000|            0.83|             0.1|            1|
| 305832603477678|         USA|      null|      

In [66]:
## Review_age of the US users as a flat integer numpy array
## (only rows that pass the `!= 'null'` comparison are collected)
age_rows = (
    us_users
    .filter(us_users["Review_age"] != 'null')
    .select("Review_age")
    .collect()
)
## each collected Row holds a single value; pull it out to get a 1-D array
ages = np.array([row["Review_age"] for row in age_rows], dtype=int)
ages

array([16, 15, 16, 36, 41, 13, 56, 30, 16, 13, 46, 26, 16, 69, 36, 18, 24,
       16, 17, 14, 73, 12, 18, 19, 16, 20, 22, 17, 22, 22, 17, 34, 17, 13,
       24, 55, 19, 30, 43, 26, 19, 31, 15, 17, 33, 22, 21, 10, 16, 43, 19,
       17, 25, 17, 22, 19, 12, 50, 25, 16, 17, 17, 11, 19, 14, 23, 17, 25,
       22, 28, 29, 14, 26, 21, 11, 28, 20, 12, 63, 15, 44, 11, 19, 40, 47,
       13, 50, 42, 22, 14,  3, 18, 12, 73, 18, 14, 29,  1, 14, 12, 40, 15,
       19, 30, 14, 18, 46, 19, 16, 22, 48, 16, 16, 13, 19, 15, 69, 25, 17,
       16, 20, 17, 17, 15, 13, 38])

In [67]:
## Review_gender of the US users as a flat numpy array of strings
## (only rows that pass the `!= 'null'` comparison are collected)
## fix: the call was written as `..collect()`, which is a SyntaxError
genders = us_users.select("Review_gender").filter(us_users["Review_gender"] != 'null').collect()
## collect() returns one single-field Row per user; flatten (n, 1) -> (n,)
genders = np.array(genders, dtype=str).reshape(np.shape(genders)[0])
genders

array(['female', 'male', 'male', 'female', 'default', 'male', 'default',
       'female', 'male', 'female', 'female', 'female', 'male', 'male',
       'male', 'male', 'others', 'male', 'male', 'male', 'female',
       'female', 'male', 'default', 'male', 'male', 'female', 'male',
       'female', 'others', 'male', 'male', 'male', 'default', 'male',
       'female', 'default', 'female', 'female', 'female', 'female',
       'male', 'female', 'male', 'female', 'female', 'male', 'male',
       'default', 'male', 'male', 'default', 'others', 'male', 'default',
       'male', 'male', 'female', 'male', 'female', 'male', 'default',
       'male', 'default', 'female', 'female', 'male', 'others', 'male',
       'male', 'male', 'female', 'female', 'male', 'female', 'male',
       'female', 'male', 'male', 'male', 'male', 'male', 'female', 'male',
       'male', 'female', 'male', 'male', 'female', 'male', 'female',
       'male', 'male', 'male', 'female', 'male', 'male', 'default',
       'male', 

In [None]:
## user demographics
## NOTE(review): `users` is not defined in any cell visible here — confirm it
## is created before this cell runs on a fresh kernel
## NOTE(review): this rebinds `us_users`, `genders` and `ages`, which earlier
## cells bound to other objects (`ages`/`genders` were numpy arrays)
us_users = users.filter("UserCountry3 = 'USA' ")
n = us_users.count()
print(n)

## number of rows per distinct gender / age answer
genders = us_users.groupby("Review_gender").agg({"Review_gender": "count"})
ages = us_users.groupby("Review_age").agg({"Review_age": "count"})

## political answers are kept as a single-column dataframe, not aggregated
political = us_users.select("Review_political")

In [None]:
## count of UserID values per UserCountry3
## NOTE(review): `users` is not defined in any cell visible here — confirm it
## is created before this cell runs on a fresh kernel
users_by_country = users.select(["UserCountry3","UserID"]).groupby("UserCountry3").agg({"UserID": "count"})

## countries with more than 20 counted rows, largest first, capped at 50.
## The aggregate column is named "count(UserID)"; use that exact spelling in
## every reference so the lookup does not rely on Spark's case-insensitive
## column resolution (the sort key was previously spelled "count(userID)").
top_countries = (
    users_by_country
    .filter(users_by_country["count(UserID)"] > 20)
    .orderBy("count(UserID)", ascending=False)
    .limit(50)
)