In [2]:
import gzip
import json
import pandas as pd
import numpy as np
import os
import re

In [3]:
# What was the impact of the GE brand in the collection of tweets gathered in the past Olympic games?

tweets = []
with gzip.open("./olympics/2016-08-02-14.gz") as f:
    for line in f.readlines():
        tweets.append(json.loads(line))
        
dataset = pd.DataFrame(tweets)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6261 entries, 0 to 6260
Data columns (total 32 columns):
contributors                 0 non-null object
coordinates                  15 non-null object
created_at                   6261 non-null object
entities                     6261 non-null object
extended_entities            2388 non-null object
favorite_count               6261 non-null int64
favorited                    6261 non-null bool
filter_level                 6261 non-null object
geo                          15 non-null object
id                           6261 non-null int64
id_str                       6261 non-null object
in_reply_to_screen_name      213 non-null object
in_reply_to_status_id        147 non-null float64
in_reply_to_status_id_str    147 non-null object
in_reply_to_user_id          213 non-null float64
in_reply_to_user_id_str      213 non-null object
is_quote_status              6261 non-null bool
lang                         6261 non-null object
place    

In [4]:
# Create additional columns
dataset['screen_name'] = dataset.user.map(lambda x: x['screen_name'].lower())
dataset['name'] = dataset.user.map(lambda x: x['name'].lower())
dataset['statuses_count'] = dataset.user.map(lambda x: x['statuses_count'])
dataset['followers_count'] = dataset.user.map(lambda x: x['followers_count'])
dataset['text_lower'] = dataset.text.apply(lambda x: x.lower())

In [27]:
ge_accounts = ['generalelectric', 
               'ge_reports', 
               'gedobrasil', 
               'ge_canada', 
               'gehealthcare', 
               'ge_digital', 
               'gedesign', 
               'ge_water', 
               'gecareeers_latam', 
               'geaviation', 
               'ge_appliances', 
               'gepublicaffairs', 
               'gecapital', 
               'ge_uk', 
               'geinfosec', 
               'ge_power', 
               'geresearch', 
               'ge_oilandgas', 
               'gelighting', 
               'ge_foundation', 
               'ge_gaspower']
dataset['favorite_count'][3]

0

In [6]:
# Volume
volume_dict = {}

# Tweets by GE
ge_tweets = dataset[dataset['screen_name'].isin(ge_accounts)]
volume_dict["tweets-by-ge"] = len(ge_tweets)

# Followers and tweet count per account
volume_dict['accounts'] = {}
ge_tweets_unique = ge_tweets.drop_duplicates(subset='screen_name')
ge_tweets_unique = ge_tweets_unique[['name', 'screen_name', 'statuses_count', 'followers_count']]
for index, row in ge_tweets_unique.iterrows():
    volume_dict['accounts'][row['screen_name']] = {
        'name': row['name'],
        'tweets': row['statuses_count'],
        'followers': row['followers_count'] 
    }
         
# Function to get counts per element
def get_counts(elem_list, volume_dict, dataset, key):
    current_count = 0
    total_count = 0
    for elem in elem_list:
        current_count = len(dataset[dataset['text_lower'].str.contains(elem)])
        volume_dict[key][elem] = current_count
        total_count += current_count
    return total_count

# Function to get counts for a regex
def get_regex_counts(volume_dict, dataset, key, regex, regexstr):
    count = len(dataset[dataset['text_lower'].str.match(regex)])
    volume_dict[key][regexstr] = count
    return count

# Dataset excluding GE's accounts
exclude_ge = dataset[~dataset['screen_name'].isin(ge_accounts)]
    
# Mentions of GE accounts
volume_dict['mentions'] = {}
ge_mentions = list(ge_accounts)
ge_mentions = ['@{0}'.format(account) for account in ge_mentions]
ge_regex = re.compile('@ge | @ge$')
total_mentions = get_counts(ge_mentions, volume_dict, exclude_ge, 'mentions')
total_mentions += get_regex_counts(volume_dict, exclude_ge, 'mentions', ge_regex, "@ge")
volume_dict['total_mentions'] = total_mentions

# Hashtags of GE
volume_dict['hashtags'] = {}
ge_hashtags = list(ge_accounts)
ge_hashtags = ['#{0}'.format(account) for account in ge_mentions]
ge_hashtags.append("#general_electric")
ge_hashtag_regex = re.compile('#ge | #ge$')
ge_dollar_regex = re.compile('\$ge | \$ge$')
total_hashtags = get_counts(ge_hashtags, volume_dict, exclude_ge, 'hashtags')
total_hashtags = get_regex_counts(volume_dict, exclude_ge, 'hashtags', ge_dollar_regex, "$ge")
total_hashtags = get_regex_counts(volume_dict, exclude_ge, 'hashtags', ge_hashtag_regex, "#ge")
volume_dict['total_hashtags'] = total_hashtags

# GE mentioned in text
volume_dict['text_mentions'] = {}
ge_text = ['general electric', 'generalelectric']
ge_website_regex = re.compile('ge.com | ge.com$')
total_hashtags = get_counts(ge_text, volume_dict, exclude_ge, 'text_mentions')
total_hashtags += get_regex_counts(volume_dict, exclude_ge, 'text_mentions', ge_website_regex, "ge.com")
volume_dict['total_text_mentions'] = total_hashtags

volume_dict

{'accounts': {},
 'hashtags': {'#@ge_appliances': 0,
  '#@ge_canada': 0,
  '#@ge_digital': 0,
  '#@ge_foundation': 0,
  '#@ge_gaspower': 0,
  '#@ge_oilandgas': 0,
  '#@ge_power': 0,
  '#@ge_reports': 0,
  '#@ge_uk': 0,
  '#@ge_water': 0,
  '#@geaviation': 0,
  '#@gecapital': 0,
  '#@gecareeers_latam': 0,
  '#@gedesign': 0,
  '#@gedobrasil': 0,
  '#@gehealthcare': 0,
  '#@geinfosec': 0,
  '#@gelighting': 0,
  '#@generalelectric': 0,
  '#@gepublicaffairs': 0,
  '#@geresearch': 0,
  '#ge': 0,
  '#general_electric': 0,
  '$ge': 0},
 'mentions': {'@ge': 0,
  '@ge_appliances': 0,
  '@ge_canada': 0,
  '@ge_digital': 0,
  '@ge_foundation': 0,
  '@ge_gaspower': 0,
  '@ge_oilandgas': 0,
  '@ge_power': 0,
  '@ge_reports': 0,
  '@ge_uk': 0,
  '@ge_water': 0,
  '@geaviation': 0,
  '@gecapital': 0,
  '@gecareeers_latam': 0,
  '@gedesign': 0,
  '@gedobrasil': 0,
  '@gehealthcare': 0,
  '@geinfosec': 0,
  '@gelighting': 0,
  '@generalelectric': 0,
  '@gepublicaffairs': 0,
  '@geresearch': 0},
 'text_m

In [10]:
# Reach

# All tweets made by GE
ge_tweets = dataset[dataset['screen_name'].isin(ge_accounts)]
ge_tweets['follower_count'] = ge_tweets.user.map(lambda x: x['follower_count'])
ge_tweets = ge_tweets['id', 'screen_name', 'retweet_count', 'follower_count', 'favorite_count']

# GE retweeters
exclude_ge = dataset[~dataset['screen_name'].isin(ge_accounts)]
ge_retweets = ['^rt @{0}'.format(account) for account in ge_accounts]
ge_retweeters = dataset[dataset['text_lower'].str.contains('|'.join(ge_retweets))]
ge_retweeters['original_tweet_id'] = ge_retweeters.retweet_status.map(lambda x: x['id'])
ge_retweeters['follower_count'] = ge_retweeters.user.map(lambda x: x['follower_count'])
ge_retweeters = ge_retweeters['original_tweet_id', 'follower_count', 'favorite_count']

# Calculate reach by summing followers of each retweeter
retweet_reach = ge_retweeters.groupby('original_tweet_id')
retweet_reach = retweet_reach.aggregate([np.sum, np.mean])
type(retweet_reach)

# Join the reach results to the original tweets
reach = pd.merge(ge_tweets, retweet_reach, left_on='id', right_on='original_tweet_id', how='left')
reach['total_favorites'] = reach['favorite_count_sum'] + reach['favorite_count']
reach['total_impressions'] = reach['follower_count_sum'] + reach['follower_count']

# Group results by account
reach_per_account = reach.groupby('screen_name')
reach_per_account = reach_per_account.aggregate([np.sum, np.avg])

AttributeError: 'DataFrame' object has no attribute 'retweet_status'

In [None]:
# Sentiment
corpus = dataset['text_lower']
sentiments = {}

def get_sentiment(scores):
    positive_value = scores['pos']
    negative_value = scores['neg']
    neutral_value = scores['neu']
    
    values = [positive_value, negative_value, neutral_value]
    
    if positive_value == max(values): return "positive"
    if negative_value == max(values): return "negative"
    if neutral_value == max(values): 
        if positive_value > 0.25 or negative_value > 0.25:
            return "positive" if positive_value > negative_value else "negative"
    
def analyse_sentiment(corpus):
    for index, row in corpus.iteritems():
        scores = vaderSentiment(row.encode('utf-8'))
        sentiments[row] = get_sentiment(scores)
        
analyse_sentiment(corpus)

positive_tweets = sum(1 for x in sentiments.values() if x == "positive")
negative_tweets = sum(1 for x in sentiments.values() if x == "negative")

#View tweets (negative)
#{k: v for k, v in sentiments.iteritems() if v == "negative"}

#...as percentages
#positive_tweets / float(len(corpus))
#negative_tweets / float(len(corpus))

In [None]:
# Audience Characteristics

# Lang
# Location

# Most active participants
top_participants = exclude_ge[['screen_name', 'text_lower']].groupby('screen_name').agg('count').sort('text_lower', ascending=False).rename(columns={'text_lower': 'count'})
top_participants
# Most influential participants
# Most influential content