In [18]:
import gzip
import json
import pandas as pd
import os
import numpy as np
import multiprocessing
import re
from functools import reduce, partial
from vaderSentiment.vaderSentiment import sentiment as vaderSentiment

In [19]:
# Load one hour of raw tweets: the archive holds one JSON document per line.
# Iterate over the file object directly so lines are decoded as they are read
# (no second in-memory copy of the whole file from readlines()), and skip blank
# lines, which json.loads() would otherwise reject.
with gzip.open("./olympics/2016-08-02-14.gz") as f:
    tweets = [json.loads(line) for line in f if line.strip()]

# One row per tweet; top-level JSON keys become columns.
dataset = pd.DataFrame(tweets)

In [20]:
# Flatten the fields used later out of the nested `user` payload and add a
# lower-cased copy of the tweet text for case-insensitive matching.
users = dataset['user']
dataset['screen_name'] = users.map(lambda user: user['screen_name'].lower())
dataset['name'] = users.map(lambda user: user['name'].lower())
for count_field in ('statuses_count', 'followers_count'):
    # Bind the field name as a default argument so each lambda reads its own key.
    dataset[count_field] = users.map(lambda user, field=count_field: user[field])
dataset['text_lower'] = dataset['text'].map(lambda text: text.lower())

In [21]:
# Lower-cased handles of the accounts treated as GE-owned in this analysis.
# They are compared against the lower-cased `screen_name` column.
ge_accounts = [
    'generalelectric',
    'ge_reports',
    'gedobrasil',
    'ge_canada',
    'gehealthcare',
    'ge_digital',
    'gedesign',
    'ge_water',
    # NOTE(review): three consecutive 'e's -- possibly a typo for
    # 'gecareers_latam'; confirm against the real handle before changing.
    'gecareeers_latam',
    'geaviation',
    'ge_appliances',
    'gepublicaffairs',
    'gecapital',
    'ge_uk',
    'geinfosec',
    'ge_power',
    'geresearch',
    'ge_oilandgas',
    'gelighting',
    'ge_foundation',
    'ge_gaspower',
]

# Spot check: show the `retweeted_status` value of the row labelled 2.
dataset.loc[2, 'retweeted_status']

nan

In [29]:
# Volume
# Container for every volume metric gathered in this cell.
volume_dict = {}

# Tweets authored by one of GE's own accounts
is_ge_author = dataset['screen_name'].isin(ge_accounts)
ge_tweets = dataset[is_ge_author]
volume_dict["tweets-by-ge"] = len(ge_tweets)

# Followers and tweet count per account.
# drop_duplicates keeps the first row seen for each screen_name, so the counts
# are those attached to that account's first tweet in the dataset.
account_columns = ['name', 'screen_name', 'statuses_count', 'followers_count']
ge_tweets_unique = ge_tweets.drop_duplicates(subset='screen_name')[account_columns]
volume_dict['accounts'] = {
    account['screen_name']: {
        'name': account['name'],
        'tweets': account['statuses_count'],
        'followers': account['followers_count'],
    }
    for _, account in ge_tweets_unique.iterrows()
}
         
# Function to get counts per element
def get_counts(elem_list, volume_dict, dataset, key):
    """Count the rows whose ``text_lower`` contains each element of ``elem_list``.

    Each element is matched as a literal substring (not as a regular
    expression), so characters such as ``.``, ``$`` or ``+`` carry no special
    meaning. Rows with missing text never count as a match.

    Parameters
    ----------
    elem_list : iterable of str
        Substrings to look for.
    volume_dict : dict
        Result container; ``volume_dict[key][elem]`` is set to the number of
        matching rows for every element. The ``key`` bucket is created if it
        does not exist yet.
    dataset : pandas.DataFrame
        Frame with a ``text_lower`` column.
    key : hashable
        Bucket of ``volume_dict`` that receives the per-element counts.

    Returns
    -------
    int
        Sum of the per-element counts. A row containing several elements is
        counted once per element.
    """
    bucket = volume_dict.setdefault(key, {})
    text = dataset['text_lower']
    total_count = 0
    for elem in elem_list:
        # regex=False: literal match; na=False: missing text is a non-match
        # instead of a NaN that would poison the boolean mask.
        hits = text.str.contains(elem, regex=False, na=False)
        current_count = int(hits.sum())
        bucket[elem] = current_count
        total_count += current_count
    return total_count

# Function to get counts for a regex
def get_regex_counts(volume_dict, dataset, key, regex, regexstr):
    """Count the rows whose ``text_lower`` matches ``regex`` anywhere in the text.

    The pattern is searched for at any position. ``Series.str.match`` only
    tests the beginning of each string, which would miss occurrences in the
    middle or at the end of the text. Rows with missing text never count as a
    match.

    Parameters
    ----------
    volume_dict : dict
        Result container; ``volume_dict[key][regexstr]`` is set to the count.
        The ``key`` bucket is created if it does not exist yet.
    dataset : pandas.DataFrame
        Frame with a ``text_lower`` column.
    key : hashable
        Bucket of ``volume_dict`` that receives the count.
    regex : str or compiled pattern
        Regular expression to search for.
    regexstr : hashable
        Label under which the count is stored.

    Returns
    -------
    int
        Number of matching rows.
    """
    # str.contains performs a search (not an anchored match); na=False keeps
    # the mask strictly boolean when the text is missing.
    hits = dataset['text_lower'].str.contains(regex, na=False)
    count = int(hits.sum())
    volume_dict.setdefault(key, {})[regexstr] = count
    return count

# Everything that was not tweeted by one of GE's own accounts
exclude_ge = dataset[~dataset['screen_name'].isin(ge_accounts)]

# Mentions of GE accounts: one '@handle' per account, plus the bare '@ge'
# handle, which goes through the regex counter.
volume_dict['mentions'] = {}
ge_mentions = ['@' + account for account in ge_accounts]
ge_regex = re.compile('@ge | @ge$')
total_mentions = (
    get_counts(ge_mentions, volume_dict, exclude_ge, 'mentions')
    + get_regex_counts(volume_dict, exclude_ge, 'mentions', ge_regex, "@ge")
)
volume_dict['total_mentions'] = total_mentions

# Hashtags (and the '$ge' cashtag) of GE
volume_dict['hashtags'] = {}
# Build the hashtags from the bare account names. Deriving them from the
# '@'-prefixed mention list would yield '#@account', which never occurs.
ge_hashtags = ['#{0}'.format(account) for account in ge_accounts]
ge_hashtags.append("#general_electric")
# Raw strings keep the backslash in '\$' without relying on Python passing an
# unknown escape sequence through unchanged.
ge_hashtag_regex = re.compile(r'#ge | #ge$')
ge_dollar_regex = re.compile(r'\$ge | \$ge$')
# Accumulate all three counts; plain assignment would keep only the last one.
total_hashtags = get_counts(ge_hashtags, volume_dict, exclude_ge, 'hashtags')
total_hashtags += get_regex_counts(volume_dict, exclude_ge, 'hashtags', ge_dollar_regex, "$ge")
total_hashtags += get_regex_counts(volume_dict, exclude_ge, 'hashtags', ge_hashtag_regex, "#ge")
volume_dict['total_hashtags'] = total_hashtags

# GE mentioned in text
volume_dict['text_mentions'] = {}
ge_text = ['general electric', 'generalelectric']
# Escape the dot so it matches a literal '.', not any character ('gexcom').
# NOTE(review): this still matches any domain that merely ends in 'ge.com'
# (e.g. 'orange.com'); tighten the pattern if that matters.
ge_website_regex = re.compile(r'ge\.com | ge\.com$')
# Use a dedicated accumulator so the hashtag total computed earlier is not
# overwritten by the text-mention total.
total_text_mentions = get_counts(ge_text, volume_dict, exclude_ge, 'text_mentions')
total_text_mentions += get_regex_counts(volume_dict, exclude_ge, 'text_mentions', ge_website_regex, "ge.com")
volume_dict['total_text_mentions'] = total_text_mentions


Unnamed: 0,screen_name,text_lower
0,ebaykix,#ebaykix nike lunarepic low flyknit oc olympic...
1,syl20a,official sponsors don't have edge when it come...
2,kennethberard,rt @optimy good #commercial from @united to su...
3,jibodrift,rt @lee_young_jae: absolute olympic ban on rus...
4,rovuxamuvupo,rt @xxsonyxx16: i liked a @youtube video from ...
5,l_slierodriguez,watching replays of men's olympic gymnastic tr...
6,deannabartik,freak waves lash olympic broadcasting building...
7,zhoffs,rt @danijelanusbaum: the best olympic beauty s...
8,sh_eventing,rt @horseandhound: fans urged to support eques...
9,duncperry,rt @britishdressage: we need you! our olympic...


In [54]:
# Rank non-GE accounts by the number of tweets they posted in this dataset.
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
top_participants = (
    exclude_ge[['screen_name', 'text_lower']]
    .groupby('screen_name')
    .agg('count')  # tweets with non-null text per account
    .sort_values('text_lower', ascending=False)
    .rename(columns={'text_lower': 'count'})
)
# Show only the leaders rather than dumping the whole frame.
top_participants.head(10)

Unnamed: 0_level_0,count
screen_name,Unnamed: 1_level_1
originaloffers,16
jpasalagua,13
zaroonrkhan,11
zicutakeolympic,8
beautycrib_tv,6
soldier76bot,6
ebayticketsales,6
teresoca2013,5
news835,5
andytelasai,5
