### **Cleaning**

In [4]:
def remove_columns(DATA, missing_threshold):
    """Drop columns whose share of missing values exceeds a threshold.

    ``DATA`` is modified in place: the offending columns are dropped and,
    whenever at least one column contains missing values, the row index is
    reset to a fresh 0..n-1 range. The same object is returned.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Frame to clean. Mutated in place.
    missing_threshold : float
        Percentage (0-100). Columns with strictly more than this percentage
        of null entries are removed.

    Returns
    -------
    pandas.DataFrame
        The very same ``DATA`` object that was passed in.
    """
    # Percentage of null entries per column, restricted to affected columns.
    pct_null = DATA.isnull().sum() / DATA.shape[0] * 100
    pct_null = pct_null[pct_null > 0].sort_values(ascending=False)

    if pct_null.empty:
        print("No missing values!")
        return DATA

    over_threshold = pct_null[pct_null > missing_threshold].index

    # The reported share is relative to the columns that have *any* missing
    # data, not to every column in the frame.
    share_dropped = len(over_threshold) / len(pct_null) * 100

    DATA.drop(columns=over_threshold, inplace=True)
    DATA.reset_index(drop=True, inplace=True)
    print(round(share_dropped, 1), "percent of cols with >", missing_threshold, "% missing data")
    return DATA

In [1]:
def drop_text_and_unused_cols(data):
    """Return a copy of ``data`` without the text / unused columns listed below.

    The input frame is left untouched; the returned frame has a fresh
    0..n-1 row index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains every column named in this function.

    Returns
    -------
    pandas.DataFrame
        New frame with the listed columns removed and the index reset.

    Raises
    ------
    KeyError
        If any of the listed columns is absent from ``data`` (pandas'
        default behaviour for ``DataFrame.drop``).
    """
    # Columns are grouped by their name prefix purely for readability; the
    # order has no effect on the result.
    c_cols = [
        'C1_School closing',
        'C2_Workplace closing',
        'C3_Cancel public events',
        'C4_Restrictions on gatherings',
        'C5_Close public transport',
        'C6_Stay at home requirements',
        'C7_Restrictions on internal movement',
        'C8_International travel controls',
    ]
    e_cols = [
        'E1_Income support',
        'E2_Debt/contract relief',
        'E3_Fiscal measures',
        'E4_International support',
    ]
    h_cols = [
        'H1_Public information campaigns',
        'H2_Testing policy',
        'H3_Contact tracing',
        'H4_Emergency investment in healthcare',
        'H5_Investment in vaccines',
    ]
    index_cols = [
        'ContainmentHealthIndex',
        'ContainmentHealthIndexForDisplay',
        'EconomicSupportIndex',
        'EconomicSupportIndexForDisplay',
        'GovernmentResponseIndex',
        'GovernmentResponseIndexForDisplay',
        'StringencyIndex',
        'StringencyIndexForDisplay',
        'StringencyLegacyIndexForDisplay',
    ]
    demographics_cols = [
        'Demographics_COVID_Current State',  # only NYC
        'Demographics_COVID_Profession',  # free text
        # The next two are combined in the 'Demographics_General_Education column'.
        'Demographics_General_Years of Post-School',
        'Demographics_General_Years of School',
    ]
    survey_info_cols = [
        'Survey Info_Duration',
        'Survey Info_EndDate',
        'Survey Info_Group',
        'Survey Info_Progress',
        'Survey Info_RecordedDate',
        'Survey Info_Source',
    ]
    remaining_cols = [
        'Unnamed: 0',
        'ConfirmedCases',
        'ConfirmedDeaths',
        'CountryName',
        'Date',
        'ESM_ID',
        'ESM_Self-Generated Code',
        'ResponseId',
        'StartDate',
        'date',
        'to_cut_actvities',
    ]

    unwanted = (
        remaining_cols
        + c_cols
        + e_cols
        + h_cols
        + index_cols
        + demographics_cols
        + survey_info_cols
    )
    trimmed = data.drop(columns=unwanted)
    return trimmed.reset_index(drop=True)

### **Describing High and Low Scorers on the 5 Factors**

In [1]:
def _high_low_stat(DATA, colList, stat, suffix):
    """Tabulate one statistic of ``colList`` for the 'high' and 'low' rows of ``DATA``.

    ``stat`` is the name of a pandas Series reduction (e.g. ``'mean'``);
    ``suffix`` is appended to the three output column names.
    """
    # Row masks are loop-invariant, so build them once.
    is_high = DATA['type'] == 'high'
    is_low = DATA['type'] == 'low'

    # Repeated names collapse to a single row, in first-seen order.
    cols = list(dict.fromkeys(colList))

    # Build the whole table in one constructor call instead of enlarging a
    # frame one cell at a time.
    table = pd.DataFrame(
        {
            'High': [getattr(DATA.loc[is_high, col], stat)() for col in cols],
            'Low': [getattr(DATA.loc[is_low, col], stat)() for col in cols],
        },
        index=cols,
    )
    table['Difference'] = table['High'] - table['Low']
    return table.add_suffix(suffix)


def high_low(DATA, colList):
    """Summarise columns separately for rows typed 'high' and 'low'.

    For each column in ``colList`` the mean and standard deviation
    (``Series.std``) are computed over the rows where ``DATA['type']`` equals
    ``'high'`` and over the rows where it equals ``'low'``, together with the
    High minus Low difference of each statistic.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Must contain a ``type`` column and every column in ``colList``.
    colList : iterable of column labels
        Columns to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column in ``colList``; columns, in sorted order, are
        ``Difference_Mean``, ``Difference_SD``, ``High_Mean``, ``High_SD``,
        ``Low_Mean``, ``Low_SD``.
    """
    means_df = _high_low_stat(DATA, colList, 'mean', '_Mean')
    sds_df = _high_low_stat(DATA, colList, 'std', '_SD')

    summary_df = means_df.merge(sds_df, left_index=True, right_index=True)
    return summary_df.reindex(sorted(summary_df.columns), axis=1)

In [2]:
def split_into_high_low(df, col, low_cutoff, high_cutoff):
    """Keep only the extreme scorers on ``col`` and label them 'high' or 'low'.

    Rows whose ``col`` value is strictly above the ``high_cutoff`` percentile
    are labelled ``'high'``; rows strictly below the ``low_cutoff`` percentile
    are labelled ``'low'``. Rows in between are discarded. ``df`` itself is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    col : column label
        Column used for the split. Percentiles come from ``np.percentile``,
        which does not skip NaNs.
    low_cutoff, high_cutoff : float
        Percentiles in the range 0-100.

    Returns
    -------
    pandas.DataFrame
        The 'high' rows followed by the 'low' rows, with a new 0..n-1 index
        and a ``type`` column holding the label.
    """
    scores = df[col]
    high_threshold = np.percentile(scores, high_cutoff)
    low_threshold = np.percentile(scores, low_cutoff)

    # .assign builds new frames, so nothing is written onto a slice of df
    # (no SettingWithCopyWarning) and the caller's frame stays untouched.
    high = df[scores > high_threshold].assign(type='high')
    low = df[scores < low_threshold].assign(type='low')

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and gives the same row order.
    return pd.concat([high, low], ignore_index=True)

In [3]:
def factor_summary(DATA):
    """Stack the high/low summaries of the five prefixed column groups.

    Columns of ``DATA`` are grouped by name prefix (``Activities_``,
    ``Situations_``, ``Functions_``, ``Hypotheses_``, ``Music Engagement_``),
    each group is summarised with ``high_low``, and the five resulting tables
    are stacked vertically in that order.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Frame accepted by ``high_low`` (it must carry the ``type`` column that
        function reads). Column labels must be strings.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the five ``high_low`` tables, keeping each
        table's own index.
    """
    prefixes = (
        'Activities_',
        'Situations_',
        'Functions_',
        'Hypotheses_',
        'Music Engagement_',
    )

    # Column groups are taken from the DATA argument itself rather than from
    # whatever frame happens to be bound to a global name in the kernel.
    group_tables = [
        high_low(DATA, [col for col in DATA.columns if col.startswith(prefix)])
        for prefix in prefixes
    ]

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the tables
    # in order and keeps their indexes, matching append(ignore_index=False).
    return pd.concat(group_tables)

### **LGBM**

In [1]:
def _pct_high_correlations(X, threshold=0.90):
    """Percentage of distinct column pairs of ``X`` with |correlation| above ``threshold``."""
    corr = X.corr().abs().to_numpy()
    # Strict upper triangle: every pair is counted once and the diagonal
    # (each column against itself) is left out entirely.
    pair_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    pair_corr = corr[pair_mask]
    if pair_corr.size == 0:
        return 0.0
    return float((pair_corr > threshold).mean() * 100)


def lgbmr_model(dv, data):
    """Grid-search a LightGBM regressor and report its hold-out performance.

    Steps: an 80/20 train/test split (``random_state=123``), a printed
    diagnostic of how many feature pairs in the training split are highly
    correlated, a 3-fold ``GridSearchCV`` over the parameter grid below, and
    printed R2 / MSE / MAE of the best estimator on the test split.

    Parameters
    ----------
    dv : array-like
        Target values, one per row of ``data``
        (presumably a pandas Series - confirm against the caller).
    data : pandas.DataFrame
        Feature matrix; ``.corr()`` is called on its training split.

    Returns
    -------
    lightgbm.LGBMRegressor
        The best estimator found by the grid search, fitted on the training
        split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, dv, test_size=0.2, random_state=123
    )

    # Diagnostic only; nothing is filtered on the basis of this number.
    print(_pct_high_correlations(X_train), "% of correlations > 0.90\n")

    parameter_grid = {
        'learning_rate': [0.05, 0.1, 0.16],
        'n_estimators': [50, 100, 200, 300, 500],
        'num_leaves': [40, 50],
        'max_depth': [10, 30, 50],
        'subsample_for_bin': [500, 1000],
        'min_child_samples': [10],
        'random_state': [123],
    }
    search = GridSearchCV(lgb.LGBMRegressor(), parameter_grid, cv=3)
    search.fit(X_train, y_train)

    # GridSearchCV defaults to refit=True, so best_estimator_ has already
    # been refit on the full training split; fitting it again is redundant.
    lgbmr = search.best_estimator_
    print('Best parameters found by grid search are:', lgbmr)
    print("\n=================================")

    y_pred = lgbmr.predict(X_test)

    # Built-in round() handles both NumPy scalars and plain Python floats;
    # calling .round() on the metric fails when it is a built-in float.
    print(f'LightGBM Regression_r2: {round(r2_score(y_test, y_pred), 4)}')
    print(f'LightGBM Regression_Mean_squared_error: {round(mean_squared_error(y_test, y_pred), 4)}')
    print(f'LightGBM Regression_Mean_absolute_error: {round(mean_absolute_error(y_test, y_pred), 4)}')

    return lgbmr