## Pre-process the fetched tweets

In [148]:
import json
import pandas as pd
import networkx as nx
from sklearn.preprocessing import StandardScaler

In [149]:
# Load the raw tweets of every airline handle into `tweets`:
#   {'@Handle': [tweet_dict, ...], ...}
# Each data/<handle>.json file is read line by line, one JSON object per line.
tweets = {}
airline_twitter = ['@AlaskaAir','@AmericanAir','@JetBlue','@Delta','@united','@FlyFrontier','@SpiritAirlines','@SouthwestAir','@HawaiianAir','@VirginAmerica']
for a in airline_twitter:
    # `with` closes the handle even when a line fails to parse.
    with open('data/{}.json'.format(a)) as f:
        # Blank lines (e.g. a trailing newline) are skipped instead of
        # crashing json.loads.
        tweets[a] = [json.loads(l) for l in f if l.strip()]

In [163]:
# Persist the combined {handle: [tweets]} mapping as a single JSON document.
with open('tweets.json', 'w') as out_file:
    payload = json.dumps(tweets)
    out_file.write(payload)

In [3]:
# Sanity check: show which airline handles are present as keys of `tweets`.
tweets.keys()

['@united',
 '@AlaskaAir',
 '@HawaiianAir',
 '@VirginAmerica',
 '@FlyFrontier',
 '@JetBlue',
 '@AmericanAir',
 '@Delta',
 '@SpiritAirlines',
 '@SouthwestAir']

## Get top 50 influencers (Twitter accounts)

In [150]:
aa = tweets['@VirginAmerica'] # put the airline's Twitter handle here to choose whose influencers are ranked

In [151]:
# Turn every tweet in `aa` into one or more directed links (user1 -> user2).
# The links are accumulated in four parallel lists, one entry per link.
n = len(aa)
user1 = []
user2 = []
tweet_type = []
tweet_text = []


def _add_link(source, target, kind, text):
    """Append one link record to the four parallel lists."""
    user1.append(source)
    user2.append(target)
    tweet_type.append(kind)
    tweet_text.append(text)


for status in aa:
    author = status['user']['screen_name']
    body = status['text']
    mentions = status['entities']['user_mentions']

    if not mentions:
        # No @ in the tweet: record it as a self-link of type 'tweet'.
        _add_link(author, author, 'tweet', body)
        continue

    mentioned_names = [m['screen_name'] for m in mentions]  # every @ screen name
    rt_author = ''
    reply_target = ''
    if 'retweeted_status' in status:
        original = status['retweeted_status']
        rt_author = original['user']['screen_name']
        _add_link(author, rt_author, 'RT', body)
        # Also record the original tweet as a self-link of its author;
        # repeated copies are dropped later when duplicates are removed.
        _add_link(rt_author, rt_author, 'tweet', original['text'])
    elif status['in_reply_to_screen_name']:
        reply_target = status['in_reply_to_screen_name']
        _add_link(author, reply_target, 'reply', body)

    # Remaining @-mentions, excluding the account already linked as RT/reply.
    for name in mentioned_names:
        if name == rt_author or name == reply_target:
            continue
        _add_link(author, name, 'mention', body)

In [152]:
# Assemble the link records into a frame. Duplicates are dropped while the
# tweet text is still present (so identical links from the same text collapse),
# and only then is the text column discarded.
link_columns = {
    'user1': user1,
    'user2': user2,
    'type': tweet_type,
    'text': tweet_text,
}
links_with_text = pd.DataFrame(link_columns)
tweet_df = links_with_text.drop_duplicates().loc[:, ['user1', 'user2', 'type']]

In [153]:
# Collapse the link table to one row per (user1, user2) pair. The number of
# remaining 'type' entries for the pair is used as the edge weight.
pair_counts = tweet_df.groupby(['user1', 'user2']).count()
tweet_link = pair_counts.reset_index()
user_from, user_to, weight = (
    tweet_link[column].tolist() for column in ('user1', 'user2', 'type')
)

In [154]:
# Directed interaction graph: one edge per (user_from, user_to) pair, with the
# pair's count stored in the edge's 'weight' attribute.
G = nx.DiGraph()
G.add_weighted_edges_from(zip(user_from, user_to, weight))

In [155]:
# Node collection of the interaction graph (attribute access, not a call).
users = G.nodes
# Three centrality measures over the same graph. The next cell reads each
# result through `.values()`, i.e. they are treated as per-node mappings.
# NOTE(review): no weight/distance argument is passed, so the 'weight' stored
# on the edges is presumably ignored here -- confirm against networkx defaults.
degree = nx.degree_centrality(G)
between = nx.betweenness_centrality(G)
close = nx.closeness_centrality(G)

In [156]:
# One row per graph node with its three centrality scores.
# Each score is looked up by node key instead of zipping the `.values()` of
# three independent dicts, so a row can never pair a user with another user's
# score if the dicts iterate in a different order than `users`.
user_stats_central = pd.DataFrame(
    [(u, degree[u], between[u], close[u]) for u in users],
    columns = ['user_name','degree','between','close'])

In [157]:
# Collect account-level counters for every user seen in `aa`: the author of
# each tweet and, for retweets, the author of the retweeted status.
# The first occurrence of a screen name wins; later ones are ignored.
_STAT_KEYS = ('listed_count', 'followers_count', 'statuses_count')
user_stats = {}


def _remember_user(user_obj):
    """Store the counters of `user_obj` unless this screen name is already known."""
    name = user_obj['screen_name']
    if name not in user_stats:
        user_stats[name] = {key: user_obj[key] for key in _STAT_KEYS}


for idx in range(n):
    status = aa[idx]
    if 'retweeted_status' in status:
        # Retweeted author is recorded before the retweeting user.
        _remember_user(status['retweeted_status']['user'])
    _remember_user(status['user'])

In [158]:
# Build one row per user from `user_stats` ({screen_name: {counter: value}}).
#
# No drop_duplicates() here: the dict keys are already unique, and
# drop_duplicates() compares only the column values (not the index), so it
# would silently discard distinct users that share identical counters.
#
# Columns are selected by name rather than renamed by position, so the labels
# cannot end up on the wrong data if the inner dict order differs.
stat_columns = ['listed_count','followers_count','statuses_count']
user_stats_api = pd.DataFrame.from_dict(user_stats, orient='index').reindex(columns=stat_columns)
user_stats_api.index.name = 'user_name'
user_stats_api = user_stats_api.reset_index()

In [159]:
# Combine API counters with graph centralities, standardize the four factors,
# and rank users by a weighted sum of the standardized values.
user_stats_overall = user_stats_api.merge(user_stats_central, on='user_name')

# Weights, in order: listed_count, followers_count, statuses_count, central.
w = [0.3,0.25,0.15,0.3]
factor_cols = ['listed_count', 'followers_count', 'statuses_count', 'central']

# Single centrality factor = degree + betweenness + closeness.
user_stats_overall['central'] = (
    user_stats_overall['degree']
    + user_stats_overall['between']
    + user_stats_overall['close']
)
user_factor = user_stats_overall.drop(['degree', 'between', 'close'], axis=1)

scaler = StandardScaler()  # standardize each factor column
scaled_values = scaler.fit_transform(user_factor[factor_cols])
user_factor_norm = pd.DataFrame(scaled_values, columns=factor_cols)
user_factor_norm['user_name'] = user_factor['user_name']
user_factor_norm['score'] = (
    user_factor_norm['listed_count'] * w[0]
    + user_factor_norm['followers_count'] * w[1]
    + user_factor_norm['statuses_count'] * w[2]
    + user_factor_norm['central'] * w[3]
)

# Top 50 users by score, highest first.
report_cols = ['user_name', 'score'] + factor_cols
result = user_factor_norm[report_cols].sort_values(by='score', ascending=False).head(50)

In [160]:
# Display the ranked table (at most 50 rows, sorted by descending score).
result

Unnamed: 0,user_name,score,listed_count,followers_count,statuses_count,central
155,JordinSparks,9.366119,23.101788,0.710438,15.124918,-0.035882
123,HelenMoyes,7.832102,-0.069604,-0.003117,-0.078994,26.218706
246,PleasureEllis,5.968766,9.281061,3.645793,15.305753,-0.076211
600,rolandsmartin,3.801617,2.670945,9.105615,4.765981,0.03011
318,VirginAmerica,3.376246,4.535833,2.908839,8.660341,-0.035882
344,airfarewatchdog,2.629263,3.741251,1.194904,8.005187,0.024611
319,VirginAtlantic,2.467154,3.106632,3.934476,3.869715,-0.096375
214,MoBea,2.054286,-0.077342,8.25778,0.27971,-0.096375
634,soncash_,1.888166,-0.078467,7.821247,-0.138284,-0.076211
596,rico_hands,1.707902,0.042999,6.613964,0.267852,0.004446


In [162]:
# Write the ranking to disk. The file name embeds the airline handle, so it
# must be edited by hand when a different airline is selected for `aa`.
# NOTE(review): no `index` argument is given, so the row index is presumably
# written as the first (unnamed) CSV column -- confirm that is intended.
result.to_csv('output/@VirginAmerica_Inf.csv')