In [57]:
from collections import defaultdict
from array import array
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
import math
import numpy as np
import collections
from numpy import linalg as la
import json
import random
import re
import pandas as pd

In [58]:
data_folder = './data/'
output_folder = './output/'

data_path = data_folder + 'tw_hurricane_data.json'
map_path = data_folder + 'tweet_document_ids_map.csv'

# The input is read as JSON Lines: one tweet object per line.
# The encoding is given explicitly because the tweet texts contain non-ASCII
# characters and open() otherwise falls back to the platform's locale encoding.
with open(data_path, encoding='utf-8') as file:
    # Blank lines (e.g. a trailing newline at the end of the file) are skipped
    # so that json.loads() is never handed an empty string.
    lines = [json.loads(line) for line in file if line.strip()]

print("Total number of tweets read: {}".format(len(lines)))

Total number of tweets read: 4000


In [59]:
# Resources shared by every clean() call. They are created lazily on the first
# call so that the stemmer and the stop-word set are built once instead of
# once per tweet.
_CLEAN_RESOURCES = {}


def _clean_resources():
    """Return the shared (stemmer, stop-word set) pair, creating it on first use."""
    if not _CLEAN_RESOURCES:
        _CLEAN_RESOURCES['stemmer'] = PorterStemmer()
        _CLEAN_RESOURCES['stop_words'] = frozenset(stopwords.words("english"))
    return _CLEAN_RESOURCES['stemmer'], _CLEAN_RESOURCES['stop_words']


def clean(text):
    """
    Preprocess the tweet text by lower-casing all characters, tokenizing,
    dropping non-alphabetic tokens and stopwords, and stemming.

    Argument:
    text -- string (text) to be preprocessed

    Returns:
    tokens - a list of stemmed tokens corresponding to the input text after
             the preprocessing, in their original order.
    """
    stemmer, stop_words = _clean_resources()

    tokens = word_tokenize(text.lower())  ## Lower-casing, then tokenizing.

    ## str.isalpha() keeps purely alphabetic tokens only, so punctuation and any
    ## token containing a digit (e.g. "user123") are discarded along with the
    ## stop words; the surviving tokens are stemmed.
    return [
        stemmer.stem(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

In [60]:
# NumPy 2.0 removed the `NaN` alias, so `from numpy import NaN` raises
# ImportError there. Importing the canonical `nan` under the old name works on
# both NumPy 1.x and 2.x and keeps `NaN` available to any other cell using it.
from numpy import nan as NaN


def feature_extraction(tweet_line):
    """
    Extract the fundamental information of a tweet.

    Argument:
    tweet_line -- dictionary (parsed JSON) with all the information of a tweet.
                  The keys read are 'id', 'id_str', 'full_text', 'created_at',
                  'favorite_count', 'retweet_count', 'user' -> 'screen_name'
                  and 'entities' -> 'hashtags' (a list of dicts with a 'text' key).

    Returns:
    tweet - a dictionary that contains the following fields: Id, Text (unprocessed),
            Username, Date, Hashtags, Mentions, Likes, Retweets, Url and Clean_text.
            Mentions is a list of user names, or NaN when the text has no mention.

    """

    tweet = {}
    tweet['Id'] = tweet_line['id']
    tweet['Text'] = tweet_line['full_text']
    tweet['Username'] = tweet_line['user']['screen_name']
    tweet['Date'] = tweet_line['created_at']
    tweet['Hashtags'] = [x['text'] for x in tweet_line['entities']['hashtags']]

    # Mentions are taken from the raw text (every "@name" occurrence), not from
    # the tweet's entities.
    mentions = re.findall("@([a-zA-Z0-9_]{1,50})", tweet['Text'])
    tweet['Mentions'] = mentions if len(mentions) != 0 else NaN

    tweet['Likes'] = tweet_line['favorite_count']
    tweet['Retweets'] = tweet_line['retweet_count']
    tweet['Url'] = 'https://twitter.com/'+tweet['Username']+'/status/'+tweet_line['id_str']

    # Links are stripped before cleaning so URL fragments do not become tokens.
    tweet['Clean_text'] = clean(re.sub(r'http\S+', '', tweet_line['full_text']))

    return tweet

In [61]:
# Pick one tweet uniformly at random and show every extracted field as a
# quick sanity check of feature_extraction().
l = random.randrange(len(lines))
new_tweet = feature_extraction(lines[l])

# (output template, value) pairs, listed in display order.
field_report = (
    ('Id: \t\t{}', new_tweet['Id']),
    ('Default tweet: \t{}', repr(new_tweet['Text'])),  # repr keeps "\n" visible
    ('Clean Tweet: \t{}', new_tweet['Clean_text']),
    ('Username: \t{}', new_tweet['Username']),
    ('Date: \t\t{}', new_tweet['Date']),
    ('Hashtags: \t{}', new_tweet['Hashtags']),
    ('Mentions: \t{}', new_tweet['Mentions']),
    ('Likes: \t\t{}', new_tweet['Likes']),
    ('Retweets: \t{}', new_tweet['Retweets']),
    ('Url: \t\t{}\n', new_tweet['Url']),
)
for template, value in field_report:
    print(template.format(value))

Id: 		1575902788558667780
Default tweet: 	'Palm Beach County Food Bank is alleviating hunger and supporting those impacted by #HurricaneIan \n\nDonate #crypto to @FoodBankPBC: https://t.co/VFpF91oeQM'
Clean Tweet: 	['palm', 'beach', 'counti', 'food', 'bank', 'allevi', 'hunger', 'support', 'impact', 'hurricaneian', 'donat', 'crypto', 'foodbankpbc']
Username: 	TheGivingBlock
Date: 		Fri Sep 30 17:37:58 +0000 2022
Hashtags: 	['HurricaneIan', 'crypto']
Mentions: 	['FoodBankPBC']
Likes: 		1
Retweets: 	0
Url: 		https://twitter.com/TheGivingBlock/status/1575902788558667780



In [62]:
def build_tweets_df(tweets):
    """
    Build a dataframe from the parsed JSON of all tweets.

    Argument:
    tweets -- iterable of parsed tweet JSON objects, one per tweet

    Returns:
    df - a dataframe with one row per tweet and one column per field
         returned by feature_extraction()

    """

    # Column name -> list of values, filled one tweet at a time.
    columns = {}

    for position, raw_tweet in enumerate(tweets):
        for field, content in feature_extraction(raw_tweet).items():
            columns.setdefault(field, []).append(content)

        # Progress report every 500 tweets (the first one is printed at index 0).
        if position % 500 == 0:
            print('Tweets processed: {}'.format(position))

    return pd.DataFrame(columns)

In [63]:
# Extract the features of every loaded tweet into a single dataframe.
data = build_tweets_df(tweets=lines)
# Preview of the first five rows.
data.head(n=5)

Tweets processed: 0
Tweets processed: 500
Tweets processed: 1000
Tweets processed: 1500
Tweets processed: 2000
Tweets processed: 2500
Tweets processed: 3000
Tweets processed: 3500


Unnamed: 0,Id,Text,Username,Date,Hashtags,Mentions,Likes,Retweets,Url,Clean_text
0,1575918182698979328,So this will keep spinning over us until 7 pm…...,suzjdean,Fri Sep 30 18:39:08 +0000 2022,[HurricaneIan],,0,0,https://twitter.com/suzjdean/status/1575918182...,"[keep, spin, us, away, alreadi, hurricaneian]"
1,1575918151862304768,Our hearts go out to all those affected by #Hu...,lytx,Fri Sep 30 18:39:01 +0000 2022,[HurricaneIan],,0,0,https://twitter.com/lytx/status/15759181518623...,"[heart, go, affect, hurricaneian, wish, everyo..."
2,1575918140839673873,Kissimmee neighborhood off of Michigan Ave. \n...,CHeathWFTV,Fri Sep 30 18:38:58 +0000 2022,[HurricaneIan],,0,0,https://twitter.com/CHeathWFTV/status/15759181...,"[kissimme, neighborhood, michigan, hurricaneian]"
3,1575918135009738752,I have this one tree in my backyard that scare...,spiralgypsy,Fri Sep 30 18:38:57 +0000 2022,"[scwx, HurricaneIan]",,0,0,https://twitter.com/spiralgypsy/status/1575918...,"[one, tree, backyard, scare, poltergeist, tree..."
4,1575918119251419136,@AshleyRuizWx @Stephan89441722 @lilmizzheidi @...,Blondie610,Fri Sep 30 18:38:53 +0000 2022,[HurricaneIan],"[AshleyRuizWx, Stephan89441722, lilmizzheidi, ...",0,0,https://twitter.com/Blondie610/status/15759181...,"[ashleyruizwx, lilmizzheidi, winknew, dylanfed..."


In [64]:
# Reading map file as a dataframe
df = pd.read_csv(map_path, delimiter='\t', names=['Document', 'Id'])

tweets_df = pd.merge(df,data)

In [65]:
# Preview of the merged frame (first five rows).
tweets_df.head(n=5)

Unnamed: 0,Document,Id,Text,Username,Date,Hashtags,Mentions,Likes,Retweets,Url,Clean_text
0,doc_1,1575918182698979328,So this will keep spinning over us until 7 pm…...,suzjdean,Fri Sep 30 18:39:08 +0000 2022,[HurricaneIan],,0,0,https://twitter.com/suzjdean/status/1575918182...,"[keep, spin, us, away, alreadi, hurricaneian]"
1,doc_2,1575918151862304768,Our hearts go out to all those affected by #Hu...,lytx,Fri Sep 30 18:39:01 +0000 2022,[HurricaneIan],,0,0,https://twitter.com/lytx/status/15759181518623...,"[heart, go, affect, hurricaneian, wish, everyo..."
2,doc_3,1575918140839673873,Kissimmee neighborhood off of Michigan Ave. \n...,CHeathWFTV,Fri Sep 30 18:38:58 +0000 2022,[HurricaneIan],,0,0,https://twitter.com/CHeathWFTV/status/15759181...,"[kissimme, neighborhood, michigan, hurricaneian]"
3,doc_4,1575918135009738752,I have this one tree in my backyard that scare...,spiralgypsy,Fri Sep 30 18:38:57 +0000 2022,"[scwx, HurricaneIan]",,0,0,https://twitter.com/spiralgypsy/status/1575918...,"[one, tree, backyard, scare, poltergeist, tree..."
4,doc_5,1575918119251419136,@AshleyRuizWx @Stephan89441722 @lilmizzheidi @...,Blondie610,Fri Sep 30 18:38:53 +0000 2022,[HurricaneIan],"[AshleyRuizWx, Stephan89441722, lilmizzheidi, ...",0,0,https://twitter.com/Blondie610/status/15759181...,"[ashleyruizwx, lilmizzheidi, winknew, dylanfed..."
