In [1]:
import gzip
import json
import pandas as pd
import numpy as np
import os
import re
from vaderSentiment.vaderSentiment import sentiment as vaderSentiment

In [2]:
# What was the impact of the GE brand in the collection of tweets gathered in the past Olympic games?

tweets = []
with open("filtered") as f:
    for line in f.readlines():
        tweets.append(json.loads(line))
        
dataset = pd.DataFrame(tweets)

In [3]:
# Create additional columns
dataset['screen_name'] = dataset.user.map(lambda x: x['screen_name'].lower())
dataset['name'] = dataset.user.map(lambda x: x['name'].lower())
dataset['statuses_count'] = dataset.user.map(lambda x: x['statuses_count'])
dataset['followers_count'] = dataset.user.map(lambda x: x['followers_count'])
dataset['text_lower'] = dataset.text.apply(lambda x: x.lower())

In [20]:
ge_accounts = ['generalelectric', 
               'ge_reports', 
               'gedobrasil', 
               'ge_canada', 
               'gehealthcare', 
               'ge_digital', 
               'gedesign', 
               'ge_water', 
               'gecareeers_latam', 
               'geaviation', 
               'ge_appliances', 
               'gepublicaffairs', 
               'gecapital', 
               'ge_uk', 
               'geinfosec', 
               'ge_power', 
               'geresearch', 
               'ge_oilandgas', 
               'gelighting', 
               'ge_foundation', 
               'ge_gaspower',
               'ge_grid']
dataset['text'][512]

u'RT @GEHealthcare: The Weekend Pulse: a new Olympic record in Rio and the meeting point of new ideas and the Industrial Internet  https://t.\u2026'

In [53]:
# Volume
volume_dict = {}

# Tweets by GE
ge_tweets = dataset[dataset['screen_name'].isin(ge_accounts)]
volume_dict["tweets-by-ge"] = len(ge_tweets)

# Followers and tweet count per account
volume_dict['accounts'] = {}
ge_tweets_unique = ge_tweets.drop_duplicates(subset='screen_name')
ge_tweets_unique = ge_tweets_unique[['name', 'screen_name', 'statuses_count', 'followers_count']]
for index, row in ge_tweets_unique.iterrows():
    volume_dict['accounts'][row['screen_name']] = {
        'name': row['name'],
        'tweets': row['statuses_count'],
        'followers': row['followers_count'] 
    }
         
# Function to get counts per element
def get_counts(elem_list, volume_dict, dataset, key):
    current_count = 0
    total_count = 0
    for elem in elem_list:
        current_count = len(dataset[dataset['text_lower'].str.contains(elem)])
        volume_dict[key][elem] = current_count
        total_count += current_count
    return total_count

# Function to get counts for a regex
def get_regex_counts(volume_dict, dataset, key, regex, regexstr):
    count = len(dataset[dataset['text_lower'].str.match(regex)])
    volume_dict[key][regexstr] = count
    return count

# Dataset excluding GE's accounts
exclude_ge = dataset[~dataset['screen_name'].isin(ge_accounts)]
    
# Mentions of GE accounts
volume_dict['mentions'] = {}
ge_mentions = list(ge_accounts)
ge_mentions = ['@{0}'.format(account) for account in ge_mentions]
ge_regex = re.compile('@ge | @ge$')
total_mentions = get_counts(ge_mentions, volume_dict, exclude_ge, 'mentions')
total_mentions += get_regex_counts(volume_dict, exclude_ge, 'mentions', ge_regex, "@ge")
volume_dict['total_mentions'] = total_mentions

# Hashtags of GE
volume_dict['hashtags'] = {}
ge_hashtags = list(ge_accounts)
ge_hashtags = ['#{0}'.format(account) for account in ge_hashtags]
ge_hashtags.append("#general_electric")
ge_hashtag_regex = re.compile('#ge | #ge$')
ge_dollar_regex = re.compile('\$ge | \$ge$')
total_hashtags = get_counts(ge_hashtags, volume_dict, exclude_ge, 'hashtags')
total_hashtags = get_regex_counts(volume_dict, exclude_ge, 'hashtags', ge_dollar_regex, "$ge")
total_hashtags = get_regex_counts(volume_dict, exclude_ge, 'hashtags', ge_hashtag_regex, "#ge")
volume_dict['total_hashtags'] = total_hashtags

# GE mentioned in text
volume_dict['text_mentions'] = {}
ge_text = ['general electric', 'generalelectric']
ge_website_regex = re.compile('ge.com | ge.com$')
total_hashtags = get_counts(ge_text, volume_dict, exclude_ge, 'text_mentions')
total_hashtags += get_regex_counts(volume_dict, exclude_ge, 'text_mentions', ge_website_regex, "ge.com")
volume_dict['total_text_mentions'] = total_hashtags

volume_dict

{'accounts': {u'ge_canada': {'followers': 16560,
   'name': u'ge canada',
   'tweets': 4016},
  u'ge_gaspower': {'followers': 2977,
   'name': u'gas power systems',
   'tweets': 2936},
  u'ge_oilandgas': {'followers': 41246,
   'name': u'ge oil & gas',
   'tweets': 3352},
  u'ge_power': {'followers': 14426, 'name': u'ge power', 'tweets': 4308},
  u'ge_reports': {'followers': 56046, 'name': u'ge reports', 'tweets': 4863},
  u'ge_uk': {'followers': 2636, 'name': u'ge uk', 'tweets': 1311},
  u'geaviation': {'followers': 146388,
   'name': u'ge aviation',
   'tweets': 10841},
  u'gehealthcare': {'followers': 78397,
   'name': u'ge healthcare',
   'tweets': 18407},
  u'gepublicaffairs': {'followers': 9564,
   'name': u'ge public affairs',
   'tweets': 11221}},
 'hashtags': {'#ge': 26,
  '#ge_appliances': 0,
  '#ge_canada': 0,
  '#ge_digital': 0,
  '#ge_foundation': 0,
  '#ge_gaspower': 0,
  '#ge_oilandgas': 0,
  '#ge_power': 0,
  '#ge_reports': 0,
  '#ge_uk': 0,
  '#ge_water': 0,
  '#geavia

In [68]:
# Reach

# All tweets made by GE
ge_tweets = dataset[dataset['screen_name'].isin(ge_accounts)]
ge_tweets['followers_count'] = ge_tweets.user.map(lambda x: x['followers_count'])
ge_tweets = ge_tweets[['id', 'screen_name', 'retweet_count', 'followers_count', 'favorite_count']]

# GE retweeters
exclude_ge = dataset[~dataset['screen_name'].isin(ge_accounts)]
ge_retweets = ['^rt @{0}'.format(account) for account in ge_accounts]
ge_retweeters = dataset[dataset['text_lower'].str.contains('|'.join(ge_retweets))]
ge_retweeters = ge_retweeters[~ge_retweeters['retweeted_status'].isnull()]
ge_retweeters['original_tweet_id'] = ge_retweeters.retweeted_status.map(lambda x: x['id'])
ge_retweeters['followers_count'] = ge_retweeters.user.map(lambda x: x['followers_count'])
ge_retweeters = ge_retweeters[['original_tweet_id', 'followers_count', 'favorite_count']]

# Calculate reach by summing followers of each retweeter
retweet_reach = ge_retweeters.groupby('original_tweet_id')
retweet_reach = retweet_reach.aggregate([np.sum, np.mean])

# Join the reach results to the original tweets
reach = pd.merge(ge_tweets, retweet_reach, left_on='id', right_index=True, how='left')
reach.columns = ['id', 
                 'screen_name', 
                 'retweet_count', 
                 'followers_count', 
                 'favorite_count', 
                 'retweet_followers',
                 'retweet_followers_avg',
                 'retweet_favorites',
                 'retweet_favorites_avg'
                ]
reach = reach.fillna(0)
reach['total_favorites'] = reach['retweet_favorites'] + reach['favorite_count']
reach['total_impressions'] = reach['retweet_followers'] + reach['followers_count']

# Group results by account
reach_per_account = reach.drop('id', 1)
reach_per_account = reach.groupby('screen_name')
reach_per_account = reach_per_account.aggregate([np.sum, np.mean])
reach_per_account

Unnamed: 0_level_0,id,id,retweet_count,retweet_count,followers_count,followers_count,favorite_count,favorite_count,retweet_followers,retweet_followers,retweet_followers_avg,retweet_followers_avg,retweet_favorites,retweet_favorites,retweet_favorites_avg,retweet_favorites_avg,total_favorites,total_favorites,total_impressions,total_impressions
Unnamed: 0_level_1,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean
screen_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
ge_canada,2.29565e+18,765216653315537536,0,0,49678,16559.333333,0,0,5867.0,1955.666667,5867.0,1955.666667,0.0,0.0,0.0,0.0,0.0,0.0,55545.0,18515.0
ge_gaspower,7.606189e+18,760618873260301952,0,0,31413,3141.3,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31413.0,3141.3
ge_oilandgas,7.659619e+17,765961857089798144,0,0,41246,41246.0,0,0,1892.0,1892.0,946.0,946.0,0.0,0.0,0.0,0.0,0.0,0.0,43138.0,43138.0
ge_power,2.265132e+18,755044046644177536,0,0,43803,14601.0,0,0,10720.0,3573.333333,888.314286,296.104762,0.0,0.0,0.0,0.0,0.0,0.0,54523.0,18174.333333
ge_reports,1.527838e+18,763918940223518720,0,0,112231,56115.5,0,0,3173.0,1586.5,1586.5,793.25,0.0,0.0,0.0,0.0,0.0,0.0,115404.0,57702.0
ge_uk,2.301776e+18,767258812436884096,0,0,7971,2657.0,0,0,4771.0,1590.333333,1590.333333,530.111111,0.0,0.0,0.0,0.0,0.0,0.0,12742.0,4247.333333
geaviation,4.619451e+18,769908492773660672,0,0,878681,146446.833333,0,0,9837.0,1639.5,1742.35,290.391667,0.0,0.0,0.0,0.0,0.0,0.0,888518.0,148086.333333
gehealthcare,1.376424e+19,764679952393735936,0,0,1441424,80079.111111,0,0,49802.0,2766.777778,18241.433333,1013.412963,0.0,0.0,0.0,0.0,0.0,0.0,1491226.0,82845.888889
gepublicaffairs,3.064172e+18,766042993980687360,0,0,38359,9589.75,0,0,6763.0,1690.75,3381.5,845.375,0.0,0.0,0.0,0.0,0.0,0.0,45122.0,11280.5


In [57]:
# Sentiment
corpus = dataset['text_lower']
sentiments = {}

def get_sentiment(scores):
    positive_value = scores['pos']
    negative_value = scores['neg']
    neutral_value = scores['neu']
    
    values = [positive_value, negative_value, neutral_value]
    
    if positive_value == max(values): return "positive"
    if negative_value == max(values): return "negative"
    if neutral_value == max(values): 
        if positive_value > 0.25 or negative_value > 0.25:
            return "positive" if positive_value > negative_value else "negative"
    
def analyse_sentiment(corpus):
    for index, row in corpus.iteritems():
        scores = vaderSentiment(row.encode('utf-8'))
        sentiments[row] = get_sentiment(scores)
        
analyse_sentiment(corpus)

positive_tweets = sum(1 for x in sentiments.values() if x == "positive")
negative_tweets = sum(1 for x in sentiments.values() if x == "negative")

#View tweets (negative)
#{k: v for k, v in sentiments.iteritems() if v == "negative"}

#...as percentages
#positive_tweets / float(len(corpus))
#negative_tweets / float(len(corpus))

In [85]:
# Audience Characteristics

# Lang
# Location

# Most active participants
top_participants = exclude_ge[['screen_name', 'text_lower']].groupby('screen_name').agg('count').rename(columns={'text_lower': 'tweet_count'})

# Most relevant participants
relevant_participants = exclude_ge[['screen_name', 'followers_count']].groupby('screen_name').agg('max')
relevant_participants = pd.merge(relevant_participants, top_participants, left_index=True, right_index=True, how='left')
relevant_participants['impressions'] = relevant_participants['followers_count'] * relevant_participants['tweet_count']
relevant_participants = relevant_participants.sort_values(by='impressions', ascending=False)
relevant_participants

Unnamed: 0_level_0,followers_count,tweet_count,impressions
screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
emiforlove,476925,1,476925
prweekuknews,63902,2,127804
admitonesin,114538,1,114538
zain_verjee,67259,1,67259
gateway978,55109,1,55109
bbdony,18962,2,37924
findsfromyester,37643,1,37643
sportsfeatures,34532,1,34532
atsocialmediauk,33856,1,33856
cfm_engines,32736,1,32736
