In [1]:
import gzip
import json
import pandas as pd
import os
import multiprocessing
from functools import reduce, partial

In [2]:
# What was the impact of the GE brand in the collection of tweets gathered in the past Olympic games?

tweets = []
with gzip.open("./olympics/2016-08-02-14.gz") as f:
    for line in f.readlines():
        tweets.append(json.loads(line))
        
dataset = pd.DataFrame(tweets)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6261 entries, 0 to 6260
Data columns (total 32 columns):
contributors                 0 non-null object
coordinates                  15 non-null object
created_at                   6261 non-null object
entities                     6261 non-null object
extended_entities            2388 non-null object
favorite_count               6261 non-null int64
favorited                    6261 non-null bool
filter_level                 6261 non-null object
geo                          15 non-null object
id                           6261 non-null int64
id_str                       6261 non-null object
in_reply_to_screen_name      213 non-null object
in_reply_to_status_id        147 non-null float64
in_reply_to_status_id_str    147 non-null object
in_reply_to_user_id          213 non-null float64
in_reply_to_user_id_str      213 non-null object
is_quote_status              6261 non-null bool
lang                         6261 non-null object
place    

In [10]:
# Create additional columns
dataset['screen_name'] = dataset.user.map(lambda x: x['screen_name'].lower())
dataset['name'] = dataset.user.map(lambda x: x['name'].lower())
dataset['statuses_count'] = dataset.user.map(lambda x: x['statuses_count'])
dataset['followers_count'] = dataset.user.map(lambda x: x['followers_count'])
dataset['text_lower'] = dataset.text.apply(lambda x: x.lower())

In [19]:
dataset['retweeted_status'][3]

{u'contributors': None,
 u'coordinates': None,
 u'created_at': u'Tue Aug 02 13:50:39 +0000 2016',
 u'entities': {u'hashtags': [],
  u'media': [{u'display_url': u'pic.twitter.com/FZk4qH2HKo',
    u'expanded_url': u'http://twitter.com/lee_young_jae/status/760472886276526080/photo/1',
    u'id': 760472883780915200,
    u'id_str': u'760472883780915200',
    u'indices': [72, 95],
    u'media_url': u'http://pbs.twimg.com/media/Co2-DwMUEAA7Azo.jpg',
    u'media_url_https': u'https://pbs.twimg.com/media/Co2-DwMUEAA7Azo.jpg',
    u'sizes': {u'large': {u'h': 330, u'resize': u'fit', u'w': 620},
     u'medium': {u'h': 330, u'resize': u'fit', u'w': 620},
     u'small': {u'h': 330, u'resize': u'fit', u'w': 620},
     u'thumb': {u'h': 150, u'resize': u'crop', u'w': 150}},
    u'type': u'photo',
    u'url': u'https://t.co/FZk4qH2HKo'}],
  u'symbols': [],
  u'urls': [{u'display_url': u'dlvr.it/LxQpD6',
    u'expanded_url': u'http://dlvr.it/LxQpD6',
    u'indices': [48, 71],
    u'url': u'https://t.co/4

In [14]:
# Volume
volume_dict = {}

# Tweets by GE
ge_accounts = ['generalelectric', 
               'ge_reports', 
               'gedobrasil', 
               'ge_canada', 
               'gehealthcare', 
               'ge_digital', 
               'gedesign', 
               'ge_water', 
               'gecareeers_latam', 
               'geaviation', 
               'ge_appliances', 
               'gepublicaffairs', 
               'gecapital', 
               'ge_uk', 
               'geinfosec', 
               'ge_power', 
               'geresearch', 
               'ge_oilandgas', 
               'gelighting', 
               'ge_foundation', 
               'ge_gaspower']
ge_tweets = dataset[dataset['screen_name'].isin(ge_accounts)]
volume_dict["tweets-by-ge"] = len(ge_tweets)

# Followers and tweet count per account
volume_dict['accounts'] = {}
ge_tweets_unique = ge_tweets.drop_duplicates(subset='screen_name')
ge_tweets_unique = ge_tweets_unique[['name', 'screen_name', 'statuses_count', 'followers_count']]
for index, row in ge_tweets_unique.iterrows():
    volume_dict['accounts'][row['screen_name']] = {
        'name': row['name'],
        'tweets': row['statuses_count'],
        'followers': row['followers_count'] 
    }
    
# Function to get counts per element
def get_counts(elem_list, volume_dict, dataset, key):
    current_count = 0
    total_count = 0
    for elem in elem_list:
        current_count = len(dataset[dataset['text_lower'].str.contains(elem)])
        volume_dict[key][elem] = current_count
        total_count += current_count
    return total_count

# Dataset excluding GE's accounts
exclude_ge = dataset[~dataset['screen_name'].isin(ge_accounts)]
    
# Mentions of GE accounts
volume_dict['mentions'] = {}
ge_mentions = list(ge_accounts)
ge_mentions = ['@{0}'.format(account) for account in ge_mentions]
ge_mentions.append("@ge")
volume_dict['total_mentions'] = get_counts(ge_mentions, volume_dict, exclude_ge, 'mentions')

# Hashtags of GE
volume_dict['hashtags'] = {}
ge_hashtags = list(ge_accounts)
ge_hashtags = ['#{0}'.format(account) for account in ge_mentions]
ge_hashtags.append("$ge")
ge_hashtags.append("#ge")
ge_hashtags.append("#general_electric")
volume_dict['total_hashtags'] = get_counts(ge_hashtags, volume_dict, exclude_ge, 'hashtags')

# GE mentioned in text
volume_dict['text_mentions'] = {}
ge_text = ['general electric', 'generalelectric', 'ge.com']
volume_dict['total_text_mentions'] = get_counts(ge_text, volume_dict, exclude_ge, 'text_mentions')

volume_dict

{'accounts': {},
 'hashtags': {'#@ge': 0,
  '#@ge_appliances': 0,
  '#@ge_canada': 0,
  '#@ge_digital': 0,
  '#@ge_foundation': 0,
  '#@ge_gaspower': 0,
  '#@ge_oilandgas': 0,
  '#@ge_power': 0,
  '#@ge_reports': 0,
  '#@ge_uk': 0,
  '#@ge_water': 0,
  '#@geaviation': 0,
  '#@gecapital': 0,
  '#@gecareeers_latam': 0,
  '#@gedesign': 0,
  '#@gedobrasil': 0,
  '#@gehealthcare': 0,
  '#@geinfosec': 0,
  '#@gelighting': 0,
  '#@generalelectric': 0,
  '#@gepublicaffairs': 0,
  '#@geresearch': 0,
  '#ge': 22,
  '#general_electric': 0,
  '$ge': 0},
 'mentions': {'@ge': 8,
  '@ge_appliances': 0,
  '@ge_canada': 0,
  '@ge_digital': 0,
  '@ge_foundation': 0,
  '@ge_gaspower': 0,
  '@ge_oilandgas': 0,
  '@ge_power': 0,
  '@ge_reports': 0,
  '@ge_uk': 0,
  '@ge_water': 0,
  '@geaviation': 0,
  '@gecapital': 0,
  '@gecareeers_latam': 0,
  '@gedesign': 0,
  '@gedobrasil': 0,
  '@gehealthcare': 0,
  '@geinfosec': 0,
  '@gelighting': 0,
  '@generalelectric': 0,
  '@gepublicaffairs': 0,
  '@geresearch'

In [None]:
# Reach

# 

In [None]:
# Sentiment

In [None]:
# Audience Characteristics

# Lang
# Location

# Most active participants
# Most influential participants
# Most influential content