In [1]:
%cd ..

/home/ec2-user/uclresearchanalysis


In [2]:
# Choosing the project
import builtins
# builtins.uclresearch_topic = 'GIVENCHY'
builtins.uclresearch_topic = 'HAWKING'
# builtins.uclresearch_topic = 'NYC'
# builtins.uclresearch_topic = 'FLORIDA'

# Import configuration
from configuration import config

# Ohter imports
from IPython.display import display, HTML

import os
import json
import datetime as dt
from datetime import timezone
from datetime import timedelta
from tqdm import tqdm
import numpy as np
import pandas as pd

('Reading config file from location: '
 '/home/ec2-user/uclresearchanalysis/configuration/env.properties')
{'calculate': {'analysis': True,
               'network': True,
               'uniquetweets': True,
               'uniqueusers': True},
 'data': {'dates': ['2018-03-13', '2018-03-14', '2018-03-15'],
          'eventname': "hawking's death",
          'phrases': ['stephen%20hawking%20death',
                      'stephen%20hawking%20die',
                      'stephen%20hawking%20passed%20away'],
          'starttime': 'Mar 13 12:00:00 -0500 2018'},
 'path': {'crawl': {'followers': '/home/ec2-user/uclresearchanalysis/followers',
                    'friends': '/home/ec2-user/uclresearchanalysis/friends'},
          'cwd': '/home/ec2-user/uclresearchanalysis/data/hawking',
          'ml': '/home/ec2-user/uclresearchanalysis/data/hawking/pickle',
          'networkx': {'all': '/home/ec2-user/uclresearchanalysis/data/hawking/pickle/networkx_all.dat',
                       'frien

In [3]:
file_input_path = config.settings['path']['twitter']
dates = config.settings['data']['dates']
search_phrases = config.settings['data']['phrases']
timeframe = config.settings['timeframe']
project_name = config.settings['data']['eventname']
starttime = config.settings['data']['starttime']

In [4]:
def order_and_reindex(df, column):
    df = df.sort_values(by=[column])
    df = df.set_index(np.arange(len(df.index)))
    return df

In [5]:
def convert_utc_to_est(time_string):
    datetime_object = dt.datetime.strptime(time_string, '%a %b %d %H:%M:%S %z %Y')
    return datetime_object.replace(tzinfo=timezone.utc).astimezone(tz=timezone(-timedelta(hours=5)))

def get_created_at(tweet):
    return convert_utc_to_est(tweet['created_at'])

def get_user_created_days(tweet):
    delta_time = convert_utc_to_est(tweet['created_at']) - convert_utc_to_est(tweet['user']['created_at'])
    return delta_time.days + 1

def get_retweet_id(tweet):
    if (tweet['text'].split()[0] == 'RT'):
        user_name = tweet['text'].split()[1][1:-1]
        mentions = tweet['entities']['user_mentions']
        for mention in mentions:
            if mention['screen_name'] == user_name:
                return string_to_int(mention['id'])

def get_reply_id(tweet):
    return string_to_int(tweet['in_reply_to_user_id_str'])
    
def get_user_mentions(tweet):
    retweet_id = get_retweet_id(tweet)
    reply_id = get_reply_id(tweet)  
    mentions = []
    for mention in tweet['entities']['user_mentions']:
        mention_id = string_to_int(mention['id'])
        if mention_id != retweet_id and mention_id != reply_id:
            mentions.append(mention_id)
    return mentions

def string_to_int(string):
    if string is None:
        return None
    else:
        return int(string)
    
def find_by_user_name(df, user_name):
    user = df[df.user == user_name]
    return user.iloc[0]

def find_by_user_id(df, user_id):
    user = df[df.user_id == user_id]
    return user.iloc[0]

def find_index_by_user_id(df, user_id):
    return df.user_id[df.user_id == user_id].index.tolist()[0]

def find_unique_tweets_crawled():
    file_path_dict = {
        date: ['{}/{}_{}.json'.format(file_input_path, x, date) for x in search_phrases]
        for date in dates
    }
    tweets_crawled_list = []
    for date, file_path_list in file_path_dict.items():
        for file_path in file_path_list:
            if (os.path.isfile(file_path)):
                with open(file_path, 'r') as file:
                    counter = 0
                    for line in file.readlines():
                        tweets_crawled_list.append(json.loads(line))
                        counter += 1
                    print('{}, {}, {}'.format(date, file_path, counter))
    
    unique_tweets = list({each['id']:each for each in tweets_crawled_list}.values())
    start_timestamp = dt.datetime.strptime(starttime, '%b %d %H:%M:%S %z %Y')
    df = pd.DataFrame()
    df['user_id'] = list(map(lambda tweet: string_to_int(tweet['user']['id_str']), unique_tweets))
    df['user'] = list(map(lambda tweet: tweet['user']['screen_name'], unique_tweets))
    df['created_at'] = list(map(lambda tweet: get_created_at(tweet), unique_tweets))
    df['reply_id'] = list(map(lambda tweet: get_reply_id(tweet), unique_tweets))
    df['retweet_id'] = list(map(lambda tweet: get_retweet_id(tweet), unique_tweets))
    df['at_ids'] = list(map(lambda tweet: get_user_mentions(tweet), unique_tweets))
    df['text'] = list(map(lambda tweet: tweet['text'], unique_tweets))
    
    df = df[df.created_at >= start_timestamp]
    
    df = order_and_reindex(df, 'created_at')
    df['time_lapsed'] = 0
    first_tweet_datetime = df.created_at.iloc[0]
    for index in tqdm(range(len(df))):
        df.loc[index, 'time_lapsed'] = round((df.loc[index, 'created_at'] - first_tweet_datetime).total_seconds() / 60.0, 2)
    df = df[df.time_lapsed < float(timeframe)]
    df = df.drop_duplicates(subset = ['user_id'])
    df = order_and_reindex(df, 'time_lapsed')
    df['order'] = df.index + 1
    df = df.drop(['created_at'], axis=1)
    
    df['mention_and_reply'] = [[] for _ in range(len(df))]
    
    unique_user_id_set = set(df.user_id)
    
    for index in tqdm(range(len(df))):
        user_name = df.loc[index, 'user']
        user_id = df.loc[index, 'user_id']
        reply_id = df.loc[index, 'reply_id']
        retweet_id = df.loc[index, 'retweet_id']
        at_ids = df.loc[index, 'at_ids']
        
        if reply_id is not None:
            if reply_id in unique_user_id_set:
                try:
                    find_by_user_id(df, user_id).mention_and_reply.append(find_index_by_user_id(df, int(reply_id)))
                except:
                    pass
        if retweet_id is not None:
            if retweet_id in unique_user_id_set:
                try:
                    find_by_user_id(df, user_id).mention_and_reply.append(find_index_by_user_id(df, int(retweet_id)))
                except:
                    pass
        for at_id in at_ids:
            if at_id in unique_user_id_set:
                try:
                    find_by_user_id(df, at_id).mention_and_reply.append(find_index_by_user_id(df, int(user_id)))
                except IndexError:
                    pass
                
    return df

In [6]:
unique_tweets = find_unique_tweets_crawled()
display(unique_tweets.head())
config.dump_tweets_dataframe(unique_tweets)
unique_tweets = config.load_tweets_dataframe()


2018-03-13, /home/ec2-user/uclresearchanalysis/data/hawking/twitter/stephen%20hawking%20death_2018-03-13.json, 0
2018-03-13, /home/ec2-user/uclresearchanalysis/data/hawking/twitter/stephen%20hawking%20die_2018-03-13.json, 0
2018-03-13, /home/ec2-user/uclresearchanalysis/data/hawking/twitter/stephen%20hawking%20passed%20away_2018-03-13.json, 0
2018-03-14, /home/ec2-user/uclresearchanalysis/data/hawking/twitter/stephen%20hawking%20death_2018-03-14.json, 95006
2018-03-14, /home/ec2-user/uclresearchanalysis/data/hawking/twitter/stephen%20hawking%20die_2018-03-14.json, 4866
2018-03-14, /home/ec2-user/uclresearchanalysis/data/hawking/twitter/stephen%20hawking%20passed%20away_2018-03-14.json, 13224
2018-03-15, /home/ec2-user/uclresearchanalysis/data/hawking/twitter/stephen%20hawking%20death_2018-03-15.json, 19976
2018-03-15, /home/ec2-user/uclresearchanalysis/data/hawking/twitter/stephen%20hawking%20die_2018-03-15.json, 1033
2018-03-15, /home/ec2-user/uclresearchanalysis/data/hawking/twitter/

100%|██████████| 131682/131682 [01:38<00:00, 1332.78it/s]
100%|██████████| 112613/112613 [02:08<00:00, 875.87it/s]


Unnamed: 0,user_id,user,reply_id,retweet_id,at_ids,text,time_lapsed,order,mention_and_reply
0,836922157,johnstempinNPR,,,[],BREAKING: We are aware of local reports in Lon...,0.0,1,[]
1,405808530,TheWebLender,,,[],Stephen Hawking has passed away \n#sad,1.58,2,[]
2,39582174,butterbob,,836922157.0,[],RT @johnstempinNPR: BREAKING: We are aware of ...,1.78,3,[0]
3,2868539474,_Refused,,,[],Did Stephen Hawking just die?,2.97,4,[]
4,28820629,Rowaenthe,,,[],I am gutted to hear of Stephen Hawking's death...,3.1,5,[]


Dumping data to path /home/ec2-user/uclresearchanalysis/data/hawking/pickle/tweets.dat
Finished dumping data to path /home/ec2-user/uclresearchanalysis/data/hawking/pickle/tweets.dat
Loading data file from path /home/ec2-user/uclresearchanalysis/data/hawking/pickle/tweets.dat
'Loaded 112613 entires'
