In [1]:
# Code adapted from https://github.com/manmeetkaurbaxi/2020-US-Elections
import pandas as pd
import tweepy
import math
import datetime
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, minmax_scale
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the per-account Twitter profile table that every later cell works on.
user_info_df = pd.read_csv(
    filepath_or_buffer='../../data/twitter/user_info_reframe.csv',
)

In [3]:
# End of the data-collection window; profile age is measured up to this instant.
data_collection_end_time = '2021-12-31 23:59:59'
def user_impact(tweet_count, created_at, followers_count, listed_count, following_count):
    """Compute the impact score of one account.

    The score is

        followers * listed * log10(followers / following + 1)
        -----------------------------------------------------
                    tweets * profile_age_in_days

    rounded to 7 decimal places, where the profile age is the whole number of
    days between ``created_at`` and ``data_collection_end_time``.

    Parameters
    ----------
    tweet_count : number of tweets posted by the account.
    created_at : account creation time; its string form must look like
        ``'YYYY-MM-DD HH:MM:SS+00:00'`` (the trailing 6 characters are dropped
        before parsing).
    followers_count : number of followers.
    listed_count : number of lists the account appears on.
    following_count : number of accounts followed.

    Returns
    -------
    numpy.float64
        The rounded score, or NaN when the score is undefined because
        ``following_count``, ``tweet_count`` or the profile age is zero.

    Raises
    ------
    ValueError
        If ``created_at`` does not match the expected timestamp layout.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    # Drop the 6-character UTC offset suffix (e.g. '+00:00') so strptime can parse it.
    created_at = str(created_at)[:-6]
    collection_end = datetime.datetime.strptime(data_collection_end_time, timestamp_format)
    account_created = datetime.datetime.strptime(created_at, timestamp_format)
    profile_age = (collection_end - account_created).days

    # Any of these being zero puts a zero in a denominator. Return NaN rather than
    # raising ZeroDivisionError (Python ints) or leaking inf (NumPy ints) into the
    # scaling steps that consume this column.
    if following_count == 0 or tweet_count == 0 or profile_age == 0:
        return np.float64('nan')

    follower_ratio_term = math.log10((followers_count / following_count) + 1)
    impact = np.round((followers_count * listed_count * follower_ratio_term) / (tweet_count * profile_age), 7)

    return impact

In [4]:
# Score every account row-wise; the column order below must match the
# positional parameter order of user_impact().
impact_inputs = user_info_df[
    ['tweet_count', 'created_at', 'followers_count', 'listed_count', 'following_count']
]
user_info_df['user_impact'] = impact_inputs.apply(
    lambda account: user_impact(*account),
    axis=1,
)

In [5]:
# Display the frame to inspect the freshly computed `user_impact` column.
user_info_df

Unnamed: 0,created_at,id,name,username,followers_count,following_count,tweet_count,listed_count,description,location,verified,user_impact,user_impact_scaled,group,group_category,user_impact_scaled_by_group
0,2010-05-21 19:40:40+00:00,146569971,CDC,CDCgov,4733986,256,32478,19751,CDC's official Twitter source for daily credib...,"Atlanta, GA",True,2895.875102,0.694244,Public Health Organizations,1,1.0
1,2008-07-24 19:35:55+00:00,15587500,CDC_eHealth,CDC_eHealth,696810,153,7423,7788,Digital & social media updates from @CDCgov. #...,"Atlanta, GA",True,544.956119,0.130635,Public Health Organizations,1,0.188169
2,2010-05-06 18:27:46+00:00,140924167,GCIndigenous,GCIndigenous,38479,277,11888,569,GCIndigenous is a Government of Canada account...,Canada,True,0.928378,0.00021,Public Health Organizations,1,0.000302
3,2009-04-29 14:53:41+00:00,36375825,Health Canada and PHAC,GovCanHealth,427566,70,26574,3054,HC and PHAC - #COVID19 updates - Promoting and...,Canada,True,40.188886,0.009622,Public Health Organizations,1,0.01386
4,2009-06-05 01:14:31+00:00,44783853,HHS.gov,HHSGov,1240591,357,26895,9790,News and information from the U.S. Department ...,"Washington, D.C.",True,348.236498,0.083473,Public Health Organizations,1,0.120236
5,2017-09-22 19:09:34+00:00,911306494536224768,IndianHealthService,IHSgov,6088,278,2510,102,"IHS, part of @HHSGov, provides healthcare for ...","Rockville, MD",True,0.215516,3.9e-05,Public Health Organizations,1,5.6e-05
6,2009-06-11 17:53:31+00:00,46444445,Canadian Food Inspection Agency,InspectionCan,63546,200,10787,689,We’re making it easier for you to find all the...,Canada,True,2.215679,0.000518,Public Health Organizations,1,0.000747
7,2008-06-16 13:57:45+00:00,15134240,NIH,NIH,1502497,319,15142,12092,Official Twitter account of the National Insti...,"Bethesda, Maryland, USA",True,891.064084,0.213611,Public Health Organizations,1,0.307688
8,2012-10-24 22:58:51+00:00,902769558,NIHB,NIHB1,8084,1828,3471,105,The National Indian Health Board advocates on ...,"Washington, D.C.",False,0.053515,0.0,Public Health Organizations,1,0.0
9,2010-10-26 17:31:41+00:00,208120290,U.S. FDA,US_FDA,529000,161,16597,4986,Our tweets are FDA Approved! Pri...,"Silver Spring, MD",True,136.847098,0.032795,Public Health Organizations,1,0.047238


In [6]:
# Rescale the raw impact score to [0, 1] across all accounts.
minMaxScaler = MinMaxScaler()
raw_impact = user_info_df[['user_impact']]
scaled_impact = minMaxScaler.fit(raw_impact).transform(raw_impact)
# The scaler returns a single-column 2-D result; flatten it for column assignment.
user_info_df['user_impact_scaled'] = np.ravel(scaled_impact)

In [7]:
# Label each account with its organisation type, based on row position in the file:
# rows 0-9, rows 10-14, and everything from row 15 onwards.
user_info_df['group'] = ''

# Use a single .iloc[rows, column] setter. The chained form
# `df.iloc[rows]['group'] = ...` assigns into a temporary object and is not
# guaranteed to modify `user_info_df` (it never does under pandas Copy-on-Write).
group_col = user_info_df.columns.get_loc('group')
user_info_df.iloc[:10, group_col] = 'Public Health Organizations'
user_info_df.iloc[10:15, group_col] = 'Pharmaceutical Companies'
user_info_df.iloc[15:, group_col] = 'WHO'

In [8]:
# Encode the group label as an integer category.
labelEncoder = LabelEncoder()
user_info_df['group_category'] = labelEncoder.fit_transform(user_info_df['group'])

# Min-max scale the raw impact score separately inside each group.
impact_by_group = user_info_df.groupby('group_category')['user_impact']
user_info_df['user_impact_scaled_by_group'] = impact_by_group.transform(
    lambda group_scores: minmax_scale(group_scores.astype(float))
)

### Plots

In [9]:
# Bar chart of the raw impact score per account, coloured by organisation group.
fig = px.bar(
    data_frame=user_info_df,
    x='username',
    y='user_impact',
    color='group',
    width=1000,
    height=500,
)
# NOTE(review): the legend title is set twice below ('' inside `legend`, then
# 'Type of Organization' via `legend_title_text`) - confirm which one is intended.
fig.update_layout(
    yaxis_title='User Impact',
    xaxis_title='Username',
    font={'size': 20, 'color': '#000000'},
    # autosize = True,
    legend={
        'x': 0.01,
        'y': 0.98,
        'title_text': '',
        'traceorder': 'normal',
        'font': {'size': 16},
    },
    margin={'l': 10, 'b': 10, 'r': 10, 't': 10},
    legend_title_text='Type of Organization',
)
fig.show()
# fig.write_image('../../results/engagement-analysis/user-impact/user-impact.pdf', engine='kaleido')

In [13]:
# Bar chart of the globally min-max scaled impact score per account,
# coloured by organisation group.
fig = px.bar(user_info_df, x='username', y='user_impact_scaled', color='group', width=1000, height=500)
fig.update_layout(yaxis_title = 'User Impact', xaxis_title = 'Username',
    font = dict(
        size = 20,
        color = '#000000'
    ),
    # autosize = True,
    # Place the legend inside the plot area, near the top-left corner, with no title.
    legend=dict(
        x=0.01,
        y=0.98,
        title_text='',
        traceorder='normal',
        font=dict(
            size=20,),
    ),
    margin = dict(
        l = 10,
        b = 10,
        r = 10,
        t = 10
    ),
    # legend_title_text = 'Type of Organization'
)  # closing parenthesis was missing, which made the whole cell a SyntaxError
fig.show()
# fig.write_image('../../results/engagement-analysis/user-impact/user-impact-scaled.pdf', engine='kaleido')

In [11]:
# Persist the enriched table. This overwrites the same CSV the notebook loaded
# at the top; the DataFrame index is not written.
user_info_df.to_csv(
    path_or_buf='../../data/twitter/user_info_reframe.csv',
    index=False,
)