In [None]:
%run ./load_data.ipynb

In [None]:
import pandas as pd
import copy
import glob
from datetime import datetime
import time
import numpy as np
import statsmodels.api as sm
import patsy
import plotly.graph_objects as go

In [None]:
def regression(df_input, indep_variable_list, X_1=None):
    """Fit an ordinary-least-squares model of ``X_1`` on the given columns.

    Despite its name, ``X_1`` is the *response* (the left-hand side of the
    patsy formula); ``indep_variable_list`` holds the right-hand-side terms.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame containing the response column and every explanatory column.
    indep_variable_list : list of str
        Column names used as right-hand-side terms of the formula.
    X_1 : str
        Column name of the response variable. Required, even though it has a
        default for backward compatibility of the signature.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(y, X).fit()``.

    Raises
    ------
    ValueError
        If ``X_1`` is ``None`` or empty.
    """
    # Validate before indexing the frame: previously a missing response hit
    # `df_input[...]` first and surfaced as an unrelated KeyError, so the
    # (print-only) check below it was never reached.
    if not X_1:
        raise ValueError('Dependent variable (X_1) is missing')

    columns = list(indep_variable_list) + [X_1]

    # Keep only rows that are complete for the columns in the model.
    # `dropna` already returns a new frame, so no explicit copy is needed.
    df_complete = df_input[columns].dropna()

    formula = f'{X_1} ~ ' + ' + '.join(indep_variable_list)

    y, X = patsy.dmatrices(formula, data=df_complete, return_type='dataframe')

    return sm.OLS(y, X).fit()


def df_remove_sample(df_all, df_sample):
    """Return the rows of ``df_all`` whose conversation is not in ``df_sample``.

    Both frames must have a ``conversation_id`` column. Rows are matched on
    that column only; the original index and row order of ``df_all`` are kept.
    """
    id_column = 'conversation_id'

    ids_in_all = set(df_all[id_column].tolist())
    ids_in_sample = set(df_sample[id_column].tolist())

    # Ids present in the full frame but absent from the sample.
    remaining_ids = list(ids_in_all - ids_in_sample)

    keep_mask = df_all[id_column].isin(remaining_ids)
    return df_all.loc[keep_mask]


In [None]:
def plot_regression(reg_result):
    """Show a coefficient plot (estimate with its interval) for a fitted model.

    Parameters
    ----------
    reg_result
        Fitted results object exposing ``summary().tables[1].data``, i.e. the
        coefficient table as rows of strings with a header row containing
        ``coef``, ``[0.025`` and ``0.975]``.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # Read the table from the argument. The previous version read the
    # notebook-global `reg`, silently ignoring whatever was passed in.
    raw_table = reg_result.summary().tables[1].data
    coef_df = pd.DataFrame(raw_table[1:], columns=raw_table[0])

    # The first (unnamed) column holds the term names.
    coef_df['stats'] = coef_df.iloc[:, 0]

    # Summary cells are strings; convert before sorting so the order is
    # numeric rather than lexicographic ('10.2' < '2.3' as text).
    numeric_cols = ['coef', '[0.025', '0.975]']
    coef_df[numeric_cols] = coef_df[numeric_cols].astype(float)
    plot_df = coef_df.sort_values(by='coef')

    fig = go.Figure(data=go.Scatter(
        x=plot_df['coef'],
        y=plot_df['stats'],
        mode='markers',
        error_x=dict(
            type='data',
            symmetric=False,
            thickness=1.5,
            # `array` is the bar length in the plus direction, `arrayminus`
            # in the minus direction (these two were previously swapped).
            array=plot_df['0.975]'] - plot_df['coef'],
            arrayminus=plot_df['coef'] - plot_df['[0.025'],
        ),
    ))
    # Reference line at zero: intervals crossing it include a null effect.
    fig.add_vline(x=0, line_width=3, line_dash="dash", line_color="red")

    fig.update_layout(
        xaxis_mirror=True,
        yaxis_mirror=True,
        template='simple_white',
        autosize=False,
        width=800,
        height=600,
        yaxis=dict(
            # `title=dict(text=..., font=...)` replaces the deprecated
            # `titlefont` axis attribute.
            title=dict(text="Explanatory variables", font=dict(size=11)),
            tickmode="array",
        ),
        xaxis=dict(
            title=dict(
                text="Coefficients of the explanatory variables",
                font=dict(size=11),
            ),
            tickmode="array",
        ),
    )

    fig.show()

In [None]:
# Candidate explanatory columns for the backward elimination below.
# NOTE: the loop removes entries from this list in place, so re-running the
# cell without re-creating the list continues from the reduced set.
Vars = ['retweet_count', 'reply_count', 'like_count', 'quote_count', 'reply_settings','hashtag_count', 'uppercase_count', 'uppercase_pct', 
        'exclamation_mark_count', 'question_mark_count', 'url_count', 'mention_count', 'emojie_count', 'followers_count', 'engagement_score', 
        'sentiment_score', 'sentiment_category', 'avg_replies_sentiment', 'avg_quotes_sentiment', 'avg_conversation_sentiment', 'var_replies_sentiment', 
        'var_quotes_sentiment', 'var_conversation_sentiment','pct_similar_sentiment_replies', 'pct_similar_sentiment_quotes',
        'var_replies_semantic', 'var_quotes_semantic', 'var_conversation_semantic', 'engagement_score_log_normal', 'Segment', 'WC', 'Analytic', 
        'Clout', 'Authentic', 'Tone', 'WPS', 'BigWords', 'Dic', 'Linguistic', 'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 
        'ipron', 'article', 'number', 'prep', 'auxverb', 'adverb', 'conj', 'negate', 'verb', 'adj', 'quantity', 'Drives', 'affiliation', 
        'achieve', 'power', 'Cognition', 'allnone', 'cogproc', 'insight', 'cause', 'tentat', 'certitude', 'differ', 'memory','Affect',
        'tone_pos', 'tone_neg', 'emotion', 'emo_pos', 'emo_neg', 'emo_anx', 'emo_anger', 'emo_sad', 'swear', 'Social', 'socbehav', 'prosocial', 
        'polite', 'conflict', 'moral', 'comm', 'socrefs', 'family', 'friend', 'female', 'male', 'Culture', 'politic', 'ethnicity', 'tech', 'Lifestyle', 
        'leisure', 'home', 'work', 'money', 'relig', 'Physical', 'health', 'illness', 'wellness', 'mental', 'substances', 'sexual', 'food', 'death', 
        'need', 'want', 'acquire', 'lack', 'fulfill', 'fatigue', 'reward', 'risk', 'curiosity', 'allure', 'Perception', 'attention', 'motion', 'space', 
        'visual', 'auditory', 'feeling', 'time', 'focuspast', 'focuspresent', 'focusfuture', 'Conversation', 'netspeak', 'assent', 'nonflu', 'filler', 
        ]


# Backward elimination: refit, drop the column behind the least significant
# term, repeat. 123 rounds are run; full summaries are printed only for the
# rounds after index 110.
for i in range(123):
    if not Vars:
        # Nothing left to eliminate.
        break

    reg = regression(EXTENDED_TWEETS, Vars, 'controversiality')

    # Rank terms on the exact p-values of the fit instead of re-parsing the
    # rounded text table (twice) and relying on the header row sorting last.
    # The intercept is not a removable column, so it is excluded up front;
    # previously an intercept with the largest p-value crashed the loop.
    pvalues = reg.pvalues.drop('Intercept', errors='ignore')
    var = pvalues.idxmax()
    pvalue = pvalues[var]
    print(f"{i:0d}, Len(Vars) = {len(Vars):3d}: about to remove: {var:10s}, pvalue = {pvalue:.3f}")

    # Dummy-coded terms are named 'column[T.level]'; strip the level suffix
    # to get back to the column name held in Vars.
    column = var.split('[', 1)[0]
    if column in Vars:
        Vars.remove(column)
    else:
        print("Oopsi")
    if i > 110:        
        print(reg.summary())
    print('********************************************************************************************************************')

# Regression - Sample

In [None]:
# Explanatory columns for the regression on `niloo_farnaz`
# (entries are wrapped for readability; their order is unchanged).
indep_var = [
    'retweet_count', 'reply_count', 'like_count', 'quote_count',
    'reply_settings',
    'hashtag_count', 'uppercase_count', 'uppercase_pct',
    'exclamation_mark_count', 'question_mark_count', 'url_count',
    'mention_count', 'emojie_count',
    'followers_count', 'engagement_score',
    'sentiment_score', 'sentiment_category',
    'avg_replies_sentiment', 'avg_quotes_sentiment', 'avg_conversation_sentiment',
    'var_replies_sentiment', 'var_quotes_sentiment', 'var_conversation_sentiment',
    'pct_similar_sentiment_replies', 'pct_similar_sentiment_quotes',
    'var_replies_semantic', 'var_quotes_semantic', 'var_conversation_semantic',
]

# Fit on all listed columns, then show the coefficient plot and the text summary.
reg = regression(niloo_farnaz, indep_var, 'GS_controversiality_score')
plot_regression(reg)
print(reg.summary())

In [None]:
# Starting set of explanatory columns for the backward elimination below.
# NOTE: the loop removes entries from this list in place.
indep_var = [
    'retweet_count',
    'reply_count',
    'like_count',
    'quote_count',
    'reply_settings',
    'hashtag_count',
    'uppercase_count',
    'uppercase_pct',
    'exclamation_mark_count',
    'question_mark_count',
    'url_count',
    'mention_count',
    'emojie_count',
    'followers_count',
    'engagement_score',
    'sentiment_score',
    'sentiment_category',
    'avg_replies_sentiment',
    'avg_quotes_sentiment',
    'avg_conversation_sentiment',
    'var_replies_sentiment',
    'var_quotes_sentiment',
    'var_conversation_sentiment',
    'pct_similar_sentiment_replies',
    'pct_similar_sentiment_quotes',
    'var_replies_semantic',
    'var_quotes_semantic',
    'var_conversation_semantic',
]
print(len(indep_var))

# Backward elimination on `niloo_farnaz_rest`: refit, drop the column behind
# the least significant term, repeat for 23 rounds. The summary printed in
# each round belongs to the fit made *before* that round's removal.
for i in range(23):
    if not indep_var:
        # Nothing left to eliminate.
        break

    reg = regression(niloo_farnaz_rest, indep_var, 'GS_controversiality_score')

    # Rank terms on the exact p-values of the fit instead of re-parsing the
    # rounded text table (twice) and relying on the header row sorting last.
    # The intercept is not a removable column, so it is excluded up front;
    # previously an intercept with the largest p-value crashed the loop.
    pvalues = reg.pvalues.drop('Intercept', errors='ignore')
    var = pvalues.idxmax()
    pvalue = pvalues[var]
    print(f"{i:0d}, Len(indep_var) = {len(indep_var):3d}: about to remove: {var:10s}, pvalue = {pvalue:.3f}")

    # Dummy-coded terms are named 'column[T.level]'; strip the level suffix
    # to get back to the column name held in indep_var.
    column = var.split('[', 1)[0]
    if column in indep_var:
        indep_var.remove(column)
    else:
        print("Oopsi")
            
    print(reg.summary())
    print('********************************************************************************************************************')

In [None]:
# Hard-coded linear-model weights. Apart from `Intercept`, each name matches
# an entry of `final_var` and is used as the multiplier for the column of the
# same name in the manual prediction cell further down.
# NOTE(review): the values look transcribed from a printed regression
# summary — confirm which fit they came from; they will not update if the
# data or the model changes.
Intercept = 0.0363
var_quotes_semantic = 7.2802
var_replies_semantic = 6.9532
var_conversation_sentiment = 1.6245
var_replies_sentiment = 1.5732
var_quotes_sentiment = 0.5565
engagement_score = 0.1974

In [None]:
# Explanatory columns of the final model.
final_var = [
    'engagement_score',
    'var_replies_sentiment',
    'var_quotes_sentiment',
    'var_conversation_sentiment',
    'var_replies_semantic',
    'var_quotes_semantic',
]

# Fit on `akhari`, emit the summary as LaTeX, then show the coefficient plot.
reg = regression(akhari, final_var, 'GS_controversiality_score')
print(reg.summary().as_latex())
plot_regression(reg)

In [None]:
# Apply the hard-coded linear model to every row of `akhari` in one
# vectorised expression. The terms are added in the same order as the former
# row-by-row loop, so the floating-point results are unchanged.
# Row iteration with `iterrows` was dropped because it does not preserve
# dtypes across a row (an integer id can be upcast to float and lose
# precision when every column is numeric) and is slow on large frames.
predicted_score = (
    Intercept
    + (akhari['var_quotes_semantic'] * var_quotes_semantic)
    + (akhari['var_replies_semantic'] * var_replies_semantic)
    + (akhari['var_conversation_sentiment'] * var_conversation_sentiment)
    + (akhari['var_replies_sentiment'] * var_replies_sentiment)
    + (akhari['var_quotes_sentiment'] * var_quotes_sentiment)
    + (akhari['engagement_score'] * engagement_score)
)

# conversation_id -> model prediction / reference score. As before, a
# repeated conversation_id keeps the value of its last row.
result = dict(zip(akhari['conversation_id'], predicted_score))
score = dict(zip(akhari['conversation_id'], akhari['GS_controversiality_score']))


In [None]:
# Turn each {conversation_id: value} dict into a two-column frame: after
# `reset_index()` the dict keys sit in a column named 'index' and the values
# in a column labelled 0. Later cells merge on 'index' and rely on that 0 label.
result1 = pd.DataFrame.from_dict(result, orient='index').reset_index()
score1 = pd.DataFrame.from_dict(score, orient='index').reset_index()

In [None]:
# Join predictions and reference scores per conversation id. Both inputs have
# a value column labelled 0, so the merged frame names them '0_x' (from
# result1) and '0_y' (from score1).
saeed = pd.merge(result1, score1, on='index')

In [None]:
# Boolean flag per conversation: True when the value from result1 ('0_x') is
# within 0.1 (absolute difference) of the value from score1 ('0_y').
saeed['diff'] = abs(saeed['0_x'] - saeed['0_y']) < 0.1

In [None]:
# Share of rows whose 'diff' flag is True. Only the flag column is
# aggregated; `saeed.sum()` also summed the id and score columns just to
# read one entry.
saeed['diff'].mean()

# Extended tweets

In [None]:
# Refit the final column set on EXTENDED_TWEETS and print the text summary.
extended_fit = regression(EXTENDED_TWEETS, final_var, 'controversiality')
print(extended_fit.summary())

In [None]:
# Row count and percentage of EXTENDED_TWEETS for each `controversiality`
# value 0..3. One loop replaces four copy-pasted lines that each filtered the
# frame twice; the printed text is unchanged.
total_rows = len(EXTENDED_TWEETS)
for level in range(4):
    level_count = int((EXTENDED_TWEETS['controversiality'] == level).sum())
    print(f'count  {level}:', level_count, 'pct:', level_count / total_rows * 100)

In [None]:
# Row count and percentage of niloo_farnaz for each
# `GS_controversiality_score` value 0..3. One loop replaces four copy-pasted
# lines that each filtered the frame twice; the printed text is unchanged.
total_rows = len(niloo_farnaz)
for level in range(4):
    level_count = int((niloo_farnaz['GS_controversiality_score'] == level).sum())
    print(f'count  {level}:', level_count, 'pct:', level_count / total_rows * 100)

In [None]:
# Number of rows in niloo_farnaz (shown as the cell's output).
len(niloo_farnaz)