In [None]:
# Execute the data-loading notebook before anything else.
# NOTE(review): the sample_tweets_* / unscored_sample_tweets_extended frames
# (and possibly sns / plt) used below are presumably defined there — confirm.
%run ./load_data.ipynb

In [None]:
import pandas as pd
import copy
import glob
from datetime import datetime
import time
import numpy as np
import statsmodels.api as sm
import patsy

In [None]:
# Column names of the extended unscored sample (not referenced again in the
# cells visible here).
cols = list(unscored_sample_tweets_extended.columns)
# Stack the two "_f3w" samples row-wise into one frame for the pooled
# regressions further down. The original row indices are kept as-is.
niloo_farnaz = pd.concat([sample_tweets_farnaz_f3w, sample_tweets_niloo_f3w])

In [None]:
# Full set of predictor column names; passed as `indep_variable_list` to
# regression() in the cells below.
COLS = [ 'retweet_count', 'reply_count',
       'like_count', 'quote_count', 'hashtag_count', 'uppercase_count', 'uppercase_pct',
       'exclamation_mark_count', 'question_mark_count', 'url_count',
       'mention_count', 'emojie_count', 'engagement_score',
       'sentiment_score', 'avg_replies_sentiment',
       'avg_quotes_sentiment', 'avg_conversation_sentiment',
       'var_replies_sentiment', 'var_quotes_sentiment',
       'var_conversation_sentiment', 'pct_similar_sentiment_replies',
       'pct_similar_sentiment_quotes', 'var_replies_semantic',
       'var_quotes_semantic', 'var_conversation_semantic']

In [None]:
def regression(df_input, indep_variable_list, X_1=None):
    """Fit an ordinary least squares model of ``X_1`` on the given predictors.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame containing the outcome column and every predictor column.
    indep_variable_list : sequence of str
        Column names used as the right-hand-side terms of the patsy formula.
    X_1 : str
        Name of the outcome (left-hand-side) column. It defaults to ``None``
        only for signature compatibility; a value is required.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(y, X).fit()``.

    Raises
    ------
    ValueError
        If ``X_1`` is ``None`` or empty.
    """
    # Validate before touching the frame: selecting a ``None`` column would
    # otherwise fail with an unrelated-looking KeyError.
    if not X_1:
        raise ValueError('Dependent variable X_1 is missing')

    predictors = list(indep_variable_list)

    # Keep only the model columns and drop every row that has a missing value
    # in any of them (dropna returns a new frame, so the input is untouched).
    model_data = df_input[predictors + [X_1]].dropna()

    # patsy formula of the form "outcome ~ a + b + c".
    formula = f'{X_1} ~ ' + ' + '.join(predictors)

    y, X = patsy.dmatrices(formula, data=model_data, return_type='dataframe')

    return sm.OLS(y, X).fit()


def cat_regression(df_input, indep_variable_list, X_1=None):
    """Fit a logit model of ``X_1`` on the given predictors.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame containing the outcome column and every predictor column.
    indep_variable_list : sequence of str
        Column names used as the right-hand-side terms of the patsy formula.
    X_1 : str
        Name of the outcome (left-hand-side) column. It defaults to ``None``
        only for signature compatibility; a value is required.

    Returns
    -------
    The fitted results object returned by ``sm.Logit(y, X).fit()``.

    Raises
    ------
    ValueError
        If ``X_1`` is ``None`` or empty.
    """
    # Validate before touching the frame: selecting a ``None`` column would
    # otherwise fail with an unrelated-looking KeyError.
    if not X_1:
        raise ValueError('Dependent variable X_1 is missing')

    predictors = list(indep_variable_list)

    # Keep only the model columns and drop every row that has a missing value
    # in any of them (dropna returns a new frame, so the input is untouched).
    model_data = df_input[predictors + [X_1]].dropna()

    # patsy formula of the form "outcome ~ a + b + c".
    formula = f'{X_1} ~ ' + ' + '.join(predictors)

    y, X = patsy.dmatrices(formula, data=model_data, return_type='dataframe')

    return sm.Logit(y, X).fit()

In [None]:
# Define function to output plot of the model coefficients

def coefplot(results):
    '''
    Plot the coefficients of a fitted model with their 95% confidence intervals.

    The coefficient table is read from ``results.summary().tables[1].data``:
    row 0 is used as the header, the first column as the term names, and the
    remaining cells are cast to float. The error bar for each term is
    ``coef - [0.025`` (distance from the estimate to the lower bound).

    Every row of that table is plotted, including the intercept if the model
    has one (the line that would drop it is commented out below).

    Side effect: calls ``sns.set_context("poster")``, which changes the
    seaborn plotting context for subsequent figures as well.

    NOTE(review): ``sns`` and ``plt`` are not imported in this notebook's
    import cell; presumably they come from the notebook run via ``%run`` —
    confirm.

    Returns whatever ``plt.show()`` returns.
    '''
    # Create dataframe of results summary
    coef_df = pd.DataFrame(results.summary().tables[1].data)

    # Add column names
    coef_df.columns = coef_df.iloc[0]

    # Drop the extra row with column labels
    coef_df = coef_df.drop(0)

    # Set index to variable names
    coef_df = coef_df.set_index(coef_df.columns[0])

    # Change datatype from object to float
    coef_df = coef_df.astype(float)

    # Get errors; (coef - lower bound of conf interval)
    coef_df['errors'] = coef_df['coef'] - coef_df['[0.025']

    # Drop the constant for plotting
    # coef_df = coef_df.drop(['const'])

    # Sort values by coef ascending
    coef_df = coef_df.sort_values(by=['coef'])

    ### Plot Coefficients ###

    # x-labels: the term names, copied from the index into a regular column
    coef_df['variables'] = list(coef_df.index.values)

    # Set sns plot style to 'poster'
    # This will make bars wide on plot
    sns.set_context("poster")

    # Define figure, axes, and plot
    fig, ax = plt.subplots(figsize=(15, 10))

    # Error bars for 95% confidence interval; the bars themselves are
    # invisible (color='none'), only the error lines are drawn.
    # Can increase capsize to add whiskers
    coef_df.plot(x='variables', y='coef', kind='bar',
                 ax=ax, color='none', fontsize=22,
                 ecolor='steelblue', capsize=0,
                 yerr='errors', legend=False)

    # Set title & labels
    ax.set_title('Coefficients of Features w/ 95% Confidence Intervals', fontsize=30)
    ax.set_ylabel('Coefficients', fontsize=22)
    ax.set_xlabel('', fontsize=22)

    # Coefficients. Use numpy directly: the `pd.np` alias was deprecated in
    # pandas 1.0 and is gone from current pandas releases.
    ax.scatter(x=np.arange(coef_df.shape[0]),
               marker='o', s=80,
               y=coef_df['coef'], color='steelblue')

    # Line to define zero on the y-axis
    ax.axhline(y=0, linestyle='--', color='red', linewidth=1)

    return plt.show()

In [None]:
def cat_csore(df):
    """Return a deep copy of ``df`` with 'GS_controversiality_score' binarised.

    Scores equal to 0 or 1 become 0 and scores equal to 2 or 3 become 1;
    any other value is left as it is. The input frame is not modified.
    """
    score_col = 'GS_controversiality_score'
    binarised = df.copy(deep=True)

    # Work out both groups from the untouched values first, so the two
    # assignments below cannot interfere with each other.
    original_scores = binarised[score_col]
    is_low = (original_scores == 0) | (original_scores == 1)
    is_high = (original_scores == 2) | (original_scores == 3)

    binarised.loc[is_low, score_col] = 0
    binarised.loc[is_high, score_col] = 1
    return binarised
        

In [None]:
# Show the distinct raw score values present before they are binarised.
sample_tweets_farnaz['GS_controversiality_score'].unique()

In [None]:
# Binarised copy of the sample (cat_csore maps 0/1 -> 0 and 2/3 -> 1), then
# display the distinct values that remain as a sanity check.
cat_sample_tweets_farnaz = cat_csore(sample_tweets_farnaz)
cat_sample_tweets_farnaz['GS_controversiality_score'].unique()

In [None]:
# OLS of GS_controversiality_score on every column in COLS, fitted on
# sample_tweets_farnaz. Rebinds the global `reg`.
reg = regression(sample_tweets_farnaz, COLS, 'GS_controversiality_score')
print(reg.summary())

In [None]:
# reg = cat_regression(cat_sample_tweets_farnaz, COLS, 'GS_controversiality_score')
# print(reg.summary())

In [None]:
# Same OLS specification, fitted on sample_tweets_farnaz_f3w.
# Rebinds the global `reg`.
reg = regression(sample_tweets_farnaz_f3w, COLS, 'GS_controversiality_score')
print(reg.summary())

In [None]:
# Same OLS specification, fitted on sample_tweets_niloo.
# Rebinds the global `reg`.
reg = regression(sample_tweets_niloo, COLS,'GS_controversiality_score')
print(reg.summary())

In [None]:
# Same OLS specification, fitted on sample_tweets_niloo_f3w.
# Rebinds the global `reg`.
reg = regression(sample_tweets_niloo_f3w, COLS, 'GS_controversiality_score')
print(reg.summary())

In [None]:
# Same OLS specification, fitted on the concatenated niloo_farnaz frame.
# Rebinds the global `reg`.
reg = regression(niloo_farnaz, COLS, 'GS_controversiality_score')
print(reg.summary())

In [None]:

# print(niloo_farnaz['conversation_id'].nunique())
# Reduced specification: a hand-picked subset of predictors, fitted on the
# concatenated frame. The resulting `reg` is the fit that the coefficient
# table and plots in the following cells read from.
indep = [
    'like_count',
    'var_replies_sentiment',
    'var_quotes_sentiment',
    'var_conversation_sentiment',
    'var_replies_semantic',
    'var_quotes_semantic',
    'var_conversation_semantic',
    'engagement_score',
]
reg = regression(niloo_farnaz, indep, 'GS_controversiality_score')
print(reg.summary())


In [None]:

# Build a coefficient table from the fit currently stored in `reg`.
# tables[1] of the summary is read as rows of cells, with the column labels
# in row 0.
coef_df = pd.DataFrame(reg.summary().tables[1].data)

# Promote the header row to column names, then drop it from the body.
coef_df.columns = coef_df.iloc[0]
coef_df = coef_df.drop(0)

# The summary cells are formatted text (the plotting cell below casts them
# with astype(float)). Convert the statistic columns to float here so that
# the sort is numeric; sorting the text would order rows lexicographically.
numeric_cols = ['coef', 'std err', 't', 'P>|t|', '[0.025', '0.975]']
coef_df = coef_df.astype({col: float for col in numeric_cols})

coef_df = coef_df.sort_values(by=['coef'])

# The first (unnamed) column holds the term names; expose it as 'stats'.
coef_df['stats'] = coef_df.iloc[:, 0]
plot_df = coef_df[['stats'] + numeric_cols]
plot_df

In [None]:
# Scratch calculation: arithmetic mean of 0.578 and 0.466.
# NOTE(review): the origin of these two values is not shown in this
# notebook — confirm what they are and consider deriving them in code.
(0.578 + 0.466) /2

In [None]:
import plotly.graph_objects as go

# Coefficient estimates and their 95% interval bounds as floats (astype is a
# no-op if plot_df already holds numeric values).
coef_values = plot_df['coef'].astype(float)
ci_lower = plot_df['[0.025'].astype(float)
ci_upper = plot_df['0.975]'].astype(float)

fig = go.Figure(data=go.Scatter(
        x=plot_df['stats'],
        y=coef_values,
        error_y=dict(
            type='data',
            symmetric=False,
            # `array` is the length of the bar above the point and
            # `arrayminus` the length below it, so the upper bound belongs
            # to `array` and the lower bound to `arrayminus`.
            array=ci_upper - coef_values,
            arrayminus=coef_values - ci_lower)
        ))

# Reference line at zero: intervals crossing it include a zero effect.
fig.add_hline(y=0, line_width=3, line_dash="dash", line_color="red")

fig.update_layout(
    autosize=False,
    width=600,
    height=600,
    yaxis=dict(
        # Nested title.font replaces the deprecated `titlefont` attribute.
        title=dict(text="Coefficient", font=dict(size=11)),
        tickmode="array",
    )
)

fig.show()

In [None]:
import plotly.express as px

# plotly express expects error *magnitudes* (distance from the point), not
# the absolute interval bounds, and numeric y values. Derive both from
# plot_df without modifying it.
px_coef = plot_df['coef'].astype(float)
px_plot_df = plot_df.assign(
    coef=px_coef,
    err_plus=plot_df['0.975]'].astype(float) - px_coef,
    err_minus=px_coef - plot_df['[0.025'].astype(float),
)

fig = px.scatter(px_plot_df, x="stats", y="coef", color="stats",
                 error_y="err_plus", error_y_minus="err_minus",
                 width=800, height=400)
fig.show()
