In [1]:
# Code adapted from https://github.com/manmeetkaurbaxi/2020-US-Elections
import pandas as pd
import numpy as np
import plotly.express as px
import scipy.stats as stats
from scipy.signal import savgol_filter
import warnings
warnings.filterwarnings("ignore")
import glob

# Span handed to `ewm(span=SPAN, adjust=False)` when computing the exponential
# moving average of the per-day average engagement in the group cells below.
SPAN = 500
# Window length and polynomial order handed to `savgol_filter` when smoothing
# the EMA curve in the group cells below.
WINDOW_LENGTH = 151
POLY_ORDER = 8

In [2]:
def calculateAverageEngagementsPerDay(dataframe, fromDate, toDate):
    """Summarise tweet volume and engagement per `created_at` value in a window.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Tweets of one account. Must provide the columns `created_at`, `tweet`,
        `like_count`, `reply_count`, `retweet_count` and `quote_count`; the four
        count columns must be castable with `astype(int)` (so no missing values).
    fromDate, toDate :
        Inclusive lower / upper bound for `created_at`. They are compared with
        `>=` / `<=` exactly as given; no date parsing happens here.

    Returns
    -------
    pandas.DataFrame
        One row per distinct `created_at` value inside the window, containing
        `created_at`, `tweets_per_day` (number of tweets), `engagement_rate`
        (sum of likes + replies + retweets + quotes) and
        `average_engagement_per_day`
        (`engagement_rate / (4 * tweets_per_day)`, rounded to 2 decimals).

    The caller's `dataframe` is not modified.
    """
    in_window = (dataframe['created_at'] >= fromDate) & (dataframe['created_at'] <= toDate)
    # Work on an explicit copy: adding a column to a filtered slice is a chained
    # assignment (SettingWithCopyWarning), which the notebook-wide warnings filter
    # would otherwise hide.
    dataframe = dataframe[in_window].copy()

    # Per-tweet engagement = sum of the four interaction counters.
    dataframe['engagement_rate'] = (
        dataframe['like_count'].astype(int)
        + dataframe['reply_count'].astype(int)
        + dataframe['retweet_count'].astype(int)
        + dataframe['quote_count'].astype(int)
    )

    engagements_per_day = dataframe.groupby(['created_at']).agg({'engagement_rate':'sum'}).reset_index()
    tweets_per_day = (dataframe.groupby(['created_at'])['tweet'].count()).to_frame('tweets_per_day')

    # `tweets_per_day` carries `created_at` as its index while `engagements_per_day`
    # carries it as a column; merging on the shared name lines the two up.
    average_engagements_per_day = tweets_per_day.merge(engagements_per_day, how='inner', on='created_at')
    # Divide by 4 * tweets so the value is the mean over the four counters per tweet.
    average_engagements_per_day['average_engagement_per_day'] = np.round(
        average_engagements_per_day['engagement_rate'] / (4 * average_engagements_per_day['tweets_per_day']),
        2,
    )

    return average_engagements_per_day

In [3]:
# Load the per-account metadata table; the group cells below look up each
# account's impact score in it by `username`.
user_info_df = pd.read_csv('../../data/twitter/user_info_reframe.csv')

In [4]:
# Inspect which metadata columns are available.
user_info_df.columns

Index(['created_at', 'id', 'name', 'username', 'followers_count',
       'following_count', 'tweet_count', 'listed_count', 'description',
       'location', 'verified', 'user_impact', 'user_impact_scaled', 'group',
       'group_category', 'user_impact_scaled_by_group'],
      dtype='object')

In [5]:
# Show each account's username next to its tweet_count.
user_info_df[['username','tweet_count']]

Unnamed: 0,username,tweet_count
0,CDCgov,32478
1,CDC_eHealth,7423
2,GCIndigenous,11888
3,GovCanHealth,26574
4,HHSGov,26895
5,IHSgov,2510
6,InspectionCan,10787
7,NIH,15142
8,NIHB1,3471
9,US_FDA,16597


In [6]:
# Base folder of the Twitter data; the per-group CSV glob patterns below are
# built by appending a sub-path to it.
user_folder_path = '../../data/twitter/'

In [7]:
# Boundaries of the pre-COVID and during-COVID analysis periods.
# NOTE(review): two bounds use a 'T' date/time separator and two use a space.
# calculateAverageEngagementsPerDay compares these values to `created_at` with
# >= / <= without parsing them, so if `created_at` holds strings the comparison
# is lexicographic and the separator affects rows on the boundary days —
# confirm the format matches the data.
pre_covid_from = '2017-01-01T00:00:00'
pre_covid_to = '2020-02-26 23:59:59'
during_covid_from = '2020-02-27 00:00:00'
during_covid_to =  '2021-12-31T23:59:59'

### Public Health Agencies

In [None]:
# Public Health Agencies
# For every account CSV in the group: compute the average engagement per day over
# pre_covid_from..during_covid_to, smooth it (EMA, then Savitzky-Golay), weight it
# by the account's impact score, and plot one line per account.

# One processed frame per account; concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call).
public_health_frames = []

# sorted(): glob order is OS-dependent; a fixed order keeps legend/colour assignment reproducible.
for file in sorted(glob.glob(user_folder_path+'combined data/public health agencies/*.csv')):
    user_df = pd.read_csv(file)
    username = user_df['username'].unique()[0]
    # Raises IndexError if the account is missing from user_info_df.
    user_impact = user_info_df[user_info_df['username'] == username]['user_impact_scaled_by_group'].unique()[0]

    # Calculate average engagement per day & its Exponential Moving Average
    user_avg_engagements_per_day = calculateAverageEngagementsPerDay(user_df, pre_covid_from, during_covid_to)
    # Select the metric by name rather than by column position.
    user_avg_engagements_per_day['EMA'] = (
        user_avg_engagements_per_day['average_engagement_per_day'].ewm(span=SPAN, adjust=False).mean()
    )
    user_avg_engagements_per_day['user'] = username
    user_avg_engagements_per_day['user_impact'] = user_impact

    # Calculate z-score & remove outliers (keep |z| <= 3)
    user_avg_engagements_per_day['zscore'] = stats.zscore(user_avg_engagements_per_day['EMA'])
    within_3_sigma = (user_avg_engagements_per_day.zscore >= -3) & (user_avg_engagements_per_day.zscore <= 3)
    # .copy() so the columns added below are not written onto a filtered slice.
    user_avg_engagements_per_day = user_avg_engagements_per_day[within_3_sigma].copy()

    # Curve smoothing. With savgol_filter's default mode the series must contain
    # at least WINDOW_LENGTH points, otherwise scipy raises ValueError.
    user_avg_engagements_per_day['EMA:Degree8'] = savgol_filter(user_avg_engagements_per_day['EMA'], WINDOW_LENGTH, POLY_ORDER)

    # Weight the smoothed EMA by the account's impact score
    user_avg_engagements_per_day['EMA*user_impact'] = user_avg_engagements_per_day['EMA:Degree8'].mul(user_avg_engagements_per_day['user_impact'])

    public_health_frames.append(user_avg_engagements_per_day)

# Combine all accounts (falls back to an empty frame when no CSV matched, as before)
pre_covid_avg_engagements_per_day_df = (
    pd.concat(public_health_frames, ignore_index=True, sort=False)
    if public_health_frames
    else pd.DataFrame()
)

# Plot Graph
fig = px.line(pre_covid_avg_engagements_per_day_df, x='created_at', y='EMA*user_impact', color='user', width=1200, height=800, template='plotly')
fig.update_layout(yaxis_title = 'Average Engagement per day with User Impact', xaxis_title = 'Date',
    font = dict(
        size = 22,
        color = '#000000'
    ),
    # autosize = True,
    legend=dict(
        x=0.01,
        y=0.99,
        traceorder='normal',
        font=dict(
            size=22,),
    ),
    margin = dict(
        l = 10,
        b = 10,
        r = 10,
        t = 10
    ), 
    legend_title_text = 'Name of Organization'
)
fig.show()

### Pharma Companies

In [None]:
# jnj_df = pd.read_csv('../../data/twitter/pharma companies/JNJNews.csv')
# jnj_df.isnull().sum()
# jnj_df['retweet_count'].fillna(int(jnj_df['retweet_count'].mean()), inplace=True)
# jnj_df['like_count'].fillna(int(jnj_df['like_count'].mean()), inplace=True)
# jnj_df['reply_count'].fillna(0, inplace=True)
# jnj_df['quote_count'].fillna(0, inplace=True)
# jnj_df[jnj_df['reply_count'] == 'en']
# jnj_df = jnj_df.drop(jnj_df.index[[4555]])
# jnj_df.to_csv('../../data/twitter/pharma companies/JNJNews.csv', index=False)

In [None]:
# Pharma Companies
# For every account CSV in the group: compute the average engagement per day over
# pre_covid_from..during_covid_to, smooth it (EMA, then Savitzky-Golay), weight it
# by the account's impact score, and plot one line per account.

# One processed frame per account; concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call).
pharma_frames = []

# sorted(): glob order is OS-dependent; a fixed order keeps legend/colour assignment reproducible.
for file in sorted(glob.glob(user_folder_path+'combined data/pharma companies/*.csv')):
    user_df = pd.read_csv(file)
    username = user_df['username'].unique()[0]
    # Raises IndexError if the account is missing from user_info_df.
    user_impact = user_info_df[user_info_df['username'] == username]['user_impact_scaled_by_group'].unique()[0]

    # Calculate average engagement per day & its Exponential Moving Average
    user_avg_engagements_per_day = calculateAverageEngagementsPerDay(user_df, pre_covid_from, during_covid_to)
    # Select the metric by name rather than by column position.
    user_avg_engagements_per_day['EMA'] = (
        user_avg_engagements_per_day['average_engagement_per_day'].ewm(span=SPAN, adjust=False).mean()
    )
    user_avg_engagements_per_day['user'] = username
    user_avg_engagements_per_day['user_impact'] = user_impact

    # Calculate z-score & remove outliers (keep |z| <= 3)
    user_avg_engagements_per_day['zscore'] = stats.zscore(user_avg_engagements_per_day['EMA'])
    within_3_sigma = (user_avg_engagements_per_day.zscore >= -3) & (user_avg_engagements_per_day.zscore <= 3)
    # .copy() so the columns added below are not written onto a filtered slice.
    user_avg_engagements_per_day = user_avg_engagements_per_day[within_3_sigma].copy()

    # Curve smoothing. With savgol_filter's default mode the series must contain
    # at least WINDOW_LENGTH points, otherwise scipy raises ValueError.
    user_avg_engagements_per_day['EMA:Degree8'] = savgol_filter(user_avg_engagements_per_day['EMA'], WINDOW_LENGTH, POLY_ORDER)

    # Weight the smoothed EMA by the account's impact score
    user_avg_engagements_per_day['EMA*user_impact'] = user_avg_engagements_per_day['EMA:Degree8'].mul(user_avg_engagements_per_day['user_impact'])

    pharma_frames.append(user_avg_engagements_per_day)

# Combine all accounts (falls back to an empty frame when no CSV matched, as before)
pre_pharma_companies_avg_engagements_per_day_df = (
    pd.concat(pharma_frames, ignore_index=True, sort=False)
    if pharma_frames
    else pd.DataFrame()
)

# Plot Graph
fig = px.line(pre_pharma_companies_avg_engagements_per_day_df, x='created_at', y='EMA*user_impact', color='user', width=1200, height=800, template='plotly')
fig.update_layout(yaxis_title = 'Average Engagement per day with User Impact', xaxis_title = 'Date',
    font = dict(
        size = 20,
        color = '#000000'
    ),
    # autosize = True,
    legend=dict(
        x=0.01,
        y=0.99,
        traceorder='normal',
        font=dict(
            size=16,),
    ),
    margin = dict(
        l = 10,
        b = 10,
        r = 10,
        t = 10
    ), 
    legend_title_text = 'Name of Organization'
)
fig.show()

### WHO

In [None]:
# NGO
# For every account CSV in the group: compute the average engagement per day over
# pre_covid_from..during_covid_to, smooth it (EMA, then Savitzky-Golay), weight it
# by the account's impact score, and plot one line per account.

# One processed frame per account; concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call).
ngo_frames = []

# sorted(): glob order is OS-dependent; a fixed order keeps legend/colour assignment reproducible.
for file in sorted(glob.glob(user_folder_path+'combined data/ngo/*.csv')):
    user_df = pd.read_csv(file)
    username = user_df['username'].unique()[0]
    # NOTE(review): this cell reads the raw 'user_impact' column, whereas the
    # public-health and pharma cells read 'user_impact_scaled_by_group' —
    # confirm the difference is intentional.
    # Raises IndexError if the account is missing from user_info_df.
    user_impact = user_info_df[user_info_df['username'] == username]['user_impact'].unique()[0]

    # Calculate average engagement per day & its Exponential Moving Average
    user_avg_engagements_per_day = calculateAverageEngagementsPerDay(user_df, pre_covid_from, during_covid_to)
    # Select the metric by name rather than by column position.
    user_avg_engagements_per_day['EMA'] = (
        user_avg_engagements_per_day['average_engagement_per_day'].ewm(span=SPAN, adjust=False).mean()
    )
    user_avg_engagements_per_day['user'] = username
    user_avg_engagements_per_day['user_impact'] = user_impact

    # Calculate z-score & remove outliers (keep |z| <= 3)
    user_avg_engagements_per_day['zscore'] = stats.zscore(user_avg_engagements_per_day['EMA'])
    within_3_sigma = (user_avg_engagements_per_day.zscore >= -3) & (user_avg_engagements_per_day.zscore <= 3)
    # .copy() so the columns added below are not written onto a filtered slice.
    user_avg_engagements_per_day = user_avg_engagements_per_day[within_3_sigma].copy()

    # Curve smoothing. With savgol_filter's default mode the series must contain
    # at least WINDOW_LENGTH points, otherwise scipy raises ValueError.
    user_avg_engagements_per_day['EMA:Degree8'] = savgol_filter(user_avg_engagements_per_day['EMA'], WINDOW_LENGTH, POLY_ORDER)

    # Weight the smoothed EMA by the account's impact score
    user_avg_engagements_per_day['EMA*user_impact'] = user_avg_engagements_per_day['EMA:Degree8'].mul(user_avg_engagements_per_day['user_impact'])

    ngo_frames.append(user_avg_engagements_per_day)

# Combine all accounts (falls back to an empty frame when no CSV matched, as before)
pre_ngo_avg_engagements_per_day_df = (
    pd.concat(ngo_frames, ignore_index=True, sort=False)
    if ngo_frames
    else pd.DataFrame()
)

# Plot Graph
fig = px.line(pre_ngo_avg_engagements_per_day_df, x='created_at', y='EMA*user_impact', color='user', width=1200, height=800, template='plotly')
fig.update_layout(yaxis_title = 'Average Engagement per day with User Impact', xaxis_title = 'Date',
    font = dict(
        size = 20,
        color = '#000000'
    ),
    # autosize = True,
    legend=dict(
        x=0.01,
        y=0.99,
        traceorder='normal',
        font=dict(
            size=16,),
    ),
    margin = dict(
        l = 10,
        b = 10,
        r = 10,
        t = 10
    ), 
    legend_title_text = 'Name of Organization'
)
fig.show()