In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import os

In [20]:
df = pd.read_excel(os.path.join("..", "..","Datasets","smc","SMC_DATA_clean.xlsx"))

### Generate platform indicators

In [48]:
# platforms and alternative spellings
platform_map = {
    "reddit": ["redit"],
    "snapchat": [],
    "twitter": [],
    "instagram": [],
    "facebook": [],
    "youtube": [],
    "tiktok": ["tik tok", "tik-tok"],
    #"imessage": [],
    "discord": [],
    #"facetime": ["facetiming", "face timing"],
    "pinterest": [],
}

for platform in platform_map.keys():
    platform_map[platform].append(platform)
    
# ignoring because out of scope: Slack, Spotify, Apple Music, Zoom, Netflix, Hulu, Disney+, HBO max, viki, Safari, email, Gmail
# excluding due to low prevalence ( <5% of participants, n=23):   
    # messenger     22.0
    # tinder         6.0
    # hinge          2.0
    # linkedin      21.0
    # bereal        18.0
    # vsco          15.0
    # whatsapp      15.0
    # groupme       19.0
    # weibo          1.0
    # wechat         5.0
    # yikyak         6.0
    # twitch         9.0

In [49]:
# combine behaviors and goals into single field called STATE_behaviors_and_goals
# make STATE_behaviors_and_goals lower case
df['STATE_behaviors_and_goals'] = df['STATE_behaviors'] + ' ' + df['STATE_goals']
df['STATE_behaviors_and_goals'] = df['STATE_behaviors_and_goals'].apply(lambda x: x.lower() if type(x) == str else x)

In [50]:
# make indicator variable for each platform of interest
for platform in platform_map.keys():
    platform_spellings = platform_map[platform]
    df[platform] = df['STATE_behaviors_and_goals'].apply(lambda x: int(any(spelling in x for spelling in platform_spellings)) if type(x) == str else x)

platforms = [p for p in platform_map.keys()]

In [51]:
# peak at generated data
df[df['SURVEY'] == 0][platforms].head()

Unnamed: 0,reddit,snapchat,twitter,instagram,facebook,youtube,tiktok,discord,pinterest
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
5,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
10,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
15,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


### Platform Prevalence

In [52]:
(df.groupby('SEMESTER')[platforms].sum()/92).loc[
    ['Fall 2020',
     'Spring 2021',
     'Fall 2021',
     'Spring 2022',
     'Fall 2022',
     'Spring 2023',
    ]
]

Unnamed: 0_level_0,reddit,snapchat,twitter,instagram,facebook,youtube,tiktok,discord,pinterest
SEMESTER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Fall 2020,0.097826,0.706522,0.358696,0.75,0.25,0.26087,0.445652,0.054348,0.043478
Spring 2021,0.130435,0.804348,0.413043,0.945652,0.282609,0.391304,0.543478,0.26087,0.043478
Fall 2021,0.076087,0.728261,0.271739,0.793478,0.206522,0.293478,0.576087,0.086957,0.097826
Spring 2022,0.184783,0.782609,0.326087,0.945652,0.195652,0.369565,0.717391,0.282609,0.141304
Fall 2022,0.141304,0.684783,0.195652,0.869565,0.173913,0.336957,0.652174,0.141304,0.130435
Spring 2023,0.065217,0.423913,0.173913,0.543478,0.086957,0.195652,0.467391,0.054348,0.043478


In [53]:
df[platforms].describe().loc[['mean']]

Unnamed: 0,reddit,snapchat,twitter,instagram,facebook,youtube,tiktok,discord,pinterest
mean,0.126482,0.750988,0.316206,0.881423,0.217391,0.335968,0.618577,0.160079,0.090909


### Correlations

In [54]:
interest_cols = [col for col in df.columns if col.endswith('_total') or col.endswith('_score')]

In [55]:
interest_cols

['BSMAS_total',
 'PSS_total',
 'ADTS_ANX_total',
 'ADTS_P_total',
 'ADTS_N_total',
 'RSES_total',
 'SWLS_total',
 'PSOC_total',
 'LONE_total',
 'SoPA_total',
 'SoNA_total',
 'ChQ_total',
 'WEEKLY_self_assessment_score']

In [56]:
df_is = df[df['SURVEY'] == 0]

In [57]:
df_mean_scores = df[['ParticipantID'] + interest_cols].groupby('ParticipantID').mean().reset_index()

In [58]:
df_is[['ParticipantID'] + platforms].merge(df_mean_scores, on='ParticipantID').corr().loc[interest_cols][platforms].round(3)

Unnamed: 0,reddit,snapchat,twitter,instagram,facebook,youtube,tiktok,discord,pinterest
BSMAS_total,-0.071,0.089,0.097,0.076,0.063,-0.093,0.273,-0.105,0.043
PSS_total,0.008,-0.093,0.111,-0.103,-0.015,-0.039,0.064,0.015,0.048
ADTS_ANX_total,0.031,0.03,0.064,-0.067,-0.027,0.107,0.187,0.075,0.044
ADTS_P_total,0.116,-0.034,0.026,-0.03,-0.02,0.112,-0.008,0.181,-0.029
ADTS_N_total,-0.017,0.048,-0.029,0.095,-0.031,-0.008,0.02,-0.224,-0.03
RSES_total,-0.028,0.1,0.115,0.049,0.113,0.009,-0.138,0.002,-0.103
SWLS_total,-0.09,0.265,-0.144,0.215,-0.002,-0.092,0.084,-0.04,0.066
PSOC_total,-0.23,0.206,-0.055,0.248,0.023,-0.149,0.196,-0.138,0.127
LONE_total,0.099,0.0,-0.073,-0.092,0.074,-0.006,0.065,-0.038,-0.17
SoPA_total,-0.043,0.087,-0.024,0.013,-0.144,0.082,-0.271,-0.058,-0.18


In [59]:
df_is[platforms].corr()

Unnamed: 0,reddit,snapchat,twitter,instagram,facebook,youtube,tiktok,discord,pinterest
reddit,1.0,-0.165858,0.086473,-0.136297,0.001253,0.106973,-0.080652,0.223034,-0.016922
snapchat,-0.165858,1.0,0.018103,0.283561,0.070814,-0.103219,0.112345,-0.184833,0.023123
twitter,0.086473,0.018103,1.0,0.025929,0.156801,0.056195,0.008992,0.062444,-0.008064
instagram,-0.136297,0.283561,0.025929,1.0,0.059926,-0.217974,0.165049,-0.173303,0.052194
facebook,0.001253,0.070814,0.156801,0.059926,1.0,-0.080712,0.06862,-0.007954,-0.016667
youtube,0.106973,-0.103219,0.056195,-0.217974,-0.080712,1.0,-0.139186,0.305658,0.007939
tiktok,-0.080652,0.112345,0.008992,0.165049,0.06862,-0.139186,1.0,-0.211984,0.106789
discord,0.223034,-0.184833,0.062444,-0.173303,-0.007954,0.305658,-0.211984,1.0,-0.044314
pinterest,-0.016922,0.023123,-0.008064,0.052194,-0.016667,0.007939,0.106789,-0.044314,1.0
