In [80]:
import pandas as pd
import timeit
import boto3
import os
import pyspark as ps
import pyspark.sql.functions as f
import pyspark.sql.types as sparktypes
from pyspark.sql.functions import udf, col

In [45]:
SCRIPT_DIRECTORY = os.path.realpath("")
HOME_DIR = os.path.split(SCRIPT_DIRECTORY)[0]
DATA_DIR = os.path.join(HOME_DIR, "data")
print(SCRIPT_DIRECTORY)

/home/jovyan/work/dsi/capstones/icapstone/notebooks


In [60]:
spark = (ps.sql.SparkSession.builder 
    .master("local[*]") 
    .appName("country-pipeline")
    .getOrCreate()
    )
sc = spark.sparkContext
sc.setLogLevel("WARN")

In [71]:
fname = 'SharedResponsesSurvey.csv'
path = f'{DATA_DIR}/{fname}'
responses = spark.read.csv(path, header=True)
responses = responses.select(["UserID", "UserCountry3", "Saved", "Intervention", "CrossingSignal",\
    "PedPed", "ScenarioType", "AttributeLevel", "Review_age","Review_education", \
    "Review_gender", "Review_income", "Review_political" ,"Review_religious"])

In [49]:
fname = 'country_cluster_map.csv'
path = f'{DATA_DIR}/{fname}'
s3_client.download_file('paxton-dsi-capstone-i',\
                        fname,\
                        path)
countries = spark.read.csv(path, header=True).select('ISO3')

In [50]:
def p_intervention(dataf):
    '''
    Returns the proportion of choices in dataf that favored
    intervention over non-intervention, and n the number of choices analyzed.
        Params: dataf (Spark Dataframe)
        Returns: p (float), n (int)
    '''
    # probability of having chosen commission
    commits = dataf.filter("Saved = 1 AND Intervention = 1").count()
    # probability of having not chosen omission, meaning that the user must have chosen
    # commission in the scenario
    omits = dataf.filter("Saved = 0 AND Intervention = 0").count()
    n = dataf.count()
    try:
        return (round((commits + omits) / n, 4), n)
    except ZeroDivisionError:
        print("p_intervention received a dataframe without revelant entries.")
        return None

In [51]:
def p_legality(dataf):
    '''
    Returns p the proportion of choices from data in dataf that favored saving pedestrians
    crossing legally, and n the number of choices with the legal dimension.
        Params: dataf (Spark Dataframe)
        Returns: tuple: (p (float), n (int))
    '''
    legality = dataf.filter("CrossingSignal != 0 AND PedPed = 1")
    ## above line credit Edmond Awad, MMFunctionsShared.R
    ## found at: https://osf.io/3hvt2/files/
    n = legality.count()
    
    # probability of having chosen to save law-abiding
    peds = legality.filter("Saved = 1 AND CrossingSignal = 1").count()
    # probability of having chosen to not save non-law-abiding
    jwalkers = legality.filter("Saved = 0 AND CrossingSignal = 2").count()
    
    try:
        return (round((peds + jwalkers) / n, 4), n)
    except ZeroDivisionError:
        print("p_legality received a dataframe without revelant entries.")
        return None    

In [52]:
def p_factor(dataf, attribute):
    '''
       Returns the proportion of choices from data in dataf that favored the default choice, 
    default (str) the default choice for the factor, nondefault (str) the alternative choice
    for the factor, and n (int) the number of choices analyzed with the factor corresponding
    to the dimension.
        Parameters: dataf (Spark Dataframe), attribute (str)
        Returns: tuple: p (float), n (int), default (str), nondefault (str),
    '''
    attr = {"Utilitarian" : ['More', 'Less']\
              , "Gender" : ['Male', 'Female']\
              , "Social Status" : ['High', 'Low']\
              , "Age" : ['Young', 'Old']\
             , "Species" : []\
             , "Fitness" : []}
    try:
        default, nondefault = attr[attribute]
    except KeyError:
        print("p_factor received an invalid attribute.")
        return None  
    
    factor = dataf.filter(f"ScenarioType = '{attribute}' ")
    n = factor.count()
    # probability of having chosen the default
    defs = factor.filter(f"Saved = 1 AND AttributeLevel = '{default}'").count()
    # probability of having not chosen the nondefault
    nonnondefs = factor.filter(f"Saved = 0 AND AttributeLevel = '{nondefault}'").count()
    try:
        return ( round((defs + nonnondefs) / n, 4), n, default, nondefault )
    except ZeroDivisionError:
        print("p_factor received a dataframe without revelant entries.")
        return None   

In [53]:
## creating a pandas dataframe to hold preferences by country
pandas_cols = ["ISO3", "p_intervention", "n_intervention", "p_legality", "n_legality",\
           "p_util", "n_util", "p_gender", "n_gender", \
           "p_social", "n_social", "p_age", "n_age"]
factors = ["Utilitarian", "Gender", "Social Status", "Age"]
country_probs = pd.DataFrame(columns=pandas_cols)

In [76]:
country = "FRA"
country_responses = responses.filter(f"UserCountry3 = '{country}'").limit(10000)

In [74]:
intervention_positive = country_responses\
                        .filter("Saved = Intervention")
intervention_positive.filter("Saved=1").show(20)

+------+------------+-----+------------+--------------+------+------------+--------------+----------+----------------+-------------+-------------+----------------+----------------+
|UserID|UserCountry3|Saved|Intervention|CrossingSignal|PedPed|ScenarioType|AttributeLevel|Review_age|Review_education|Review_gender|Review_income|Review_political|Review_religious|
+------+------------+-----+------------+--------------+------+------------+--------------+----------+----------------+-------------+-------------+----------------+----------------+
+------+------------+-----+------------+--------------+------+------------+--------------+----------+----------------+-------------+-------------+----------------+----------------+



In [78]:
leg = country_responses.filter("CrossingSignal != 0 AND PedPed = 1")
leg_positive = country_responses.filter("Saved = 1 AND CrossingSignal = 1" or "Saved = 0 AND CrossingSignal = 2")
leg_positive.show(20)                                        

+----------------+------------+-----+------------+--------------+------+-------------+--------------+----------+----------------+-------------+-------------+----------------+----------------+
|          UserID|UserCountry3|Saved|Intervention|CrossingSignal|PedPed| ScenarioType|AttributeLevel|Review_age|Review_education|Review_gender|Review_income|Review_political|Review_religious|
+----------------+------------+-----+------------+--------------+------+-------------+--------------+----------+----------------+-------------+-------------+----------------+----------------+
|4085044494843110|         FRA|    1|           0|             1|     1|  Utilitarian|          More|        22|        graduate|         male|    under5000|             0.5|             0.5|
|7358818517052320|         FRA|    1|           0|             1|     0|       Random|          Rand|      null|         default|      default|      default|             0.5|             0.5|
|7358818517052320|         FRA|    1|   

In [82]:
def p_factor2(dataf, attribute):
    '''
    '''
    attr = {"Intervention" : ["Saved = 0 AND Intervention = 0", "Saved = 1 AND Intervention = 1"],\
            "Legality" : ["Saved = 1 AND CrossingSignal = 1", "Saved = 0 AND CrossingSignal = 2"],\
            "Utilitarian" : ['More', 'Less'],\
            "Gender" : ['Male', 'Female'],\
            "Social Status" : ['High', 'Low'],\
            "Age" : ['Young', 'Old']}
    
    ##
    try:
        default, nondefault = attr[attribute]
    except KeyError:
        print("p_factor received an invalid attribute.")
        return None  

    ##
    if attribute == "Legality":
        dataf = dataf.filter("CrossingSignal != 0 AND PedPed = 1")
        ## above line credit Edmond Awad, MMFunctionsShared.R
        ## found at: https://osf.io/3hvt2/files/
        positive = dataf.filter(default or nondefault)
    elif attribute == "Intervention":
        positive = dataf.filter("Saved = Intervention")
    else:
        default = f"Saved = 1 AND AttributeLevel = '{default}'"
        nonnondefault = f"Saved = 0 AND AttributeLevel = '{nondefault}'"

        dataf = dataf.filter(f"ScenarioType = '{attribute}' ")
        positive = dataf.filter(default or nonnondefault)

    n = dataf.count()
    try:
        p = positive.count() / n
    except ZeroDivisionError:
        p = -1
    
    return (p, n)