In [None]:
from collections import defaultdict
from array import array
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import math
import numpy as np
import collections
from numpy import linalg as la
import json
import random
import re
import pandas as pd

In [None]:
data_folder = './data/'
output_folder = './output/'

data_path = data_folder + 'tw_hurricane_data.json'
map_path = data_folder + 'tweet_document_ids_map.csv'

# The input holds one JSON object per line. It is read explicitly as UTF-8
# because the platform default encoding is not UTF-8 everywhere and tweet text
# may contain non-ASCII characters. Blank lines are skipped since json.loads
# cannot parse them.
with open(data_path, encoding='utf-8') as file:
    lines = [line for line in file if line.strip()]

print("Total number of tweets read: {}".format(len(lines)))

# Parse each JSON line; from this point on `lines` holds one dict per tweet.
lines = [json.loads(line) for line in lines]

In [None]:
# Stemmer and stop-word set are created on the first call to clean() and then
# reused, instead of being rebuilt for every single tweet.
_CLEAN_RESOURCES = {}


def _clean_resources():
    """Return the shared (stemmer, stop_words) pair, creating it on first use."""
    if not _CLEAN_RESOURCES:
        _CLEAN_RESOURCES['stemmer'] = PorterStemmer()
        _CLEAN_RESOURCES['stop_words'] = set(stopwords.words("english"))
    return _CLEAN_RESOURCES['stemmer'], _CLEAN_RESOURCES['stop_words']


def clean(text):
    """
    Preprocess the tweet text by lower-casing all characters, tokenizing,
    keeping only purely alphabetic tokens, removing stopwords and stemming.

    Note that the alphabetic filter uses str.isalpha(), so tokens containing
    digits or punctuation are discarded as well.

    Argument:
    text -- string (text) to be preprocessed

    Returns:
    text - a list of tokens corresponding to the input text after the preprocessing.
    """
    stemmer, stop_words = _clean_resources()

    # Lower-case first so the comparison against the stop-word set is
    # effectively case-insensitive, then tokenize.
    tokens = word_tokenize(text.lower())

    # Filter (alphabetic, not a stop word) and stem in a single pass; the
    # stop-word check runs on the unstemmed token, as before.
    return [
        stemmer.stem(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

In [None]:
def feature_extraction(tweet_line):
    """
    Pull the fields of interest out of a single raw tweet object.

    Argument:
    tweet_line -- dict parsed from the tweet JSON. The keys read are 'id',
                  'id_str', 'full_text', 'created_at', 'favorite_count',
                  'retweet_count', 'user' -> 'screen_name' and
                  'entities' -> 'hashtags'.

    Returns:
    tweet - a dictionary that contains the following fields: Id, Text
            (unprocessed), Username, Date, Hashtags, Mentions, Likes,
            Retweets, Url and Clean_text
    """
    mention_pattern = "@([a-zA-Z0-9_]{1,50})"
    link_pattern = r'http\S+'

    return {
        'Id': tweet_line['id'],
        'Text': tweet_line['full_text'],
        'Username': tweet_line['user']['screen_name'],
        'Date': tweet_line['created_at'],
        'Hashtags': [tag['text'] for tag in tweet_line['entities']['hashtags']],
        # Mentions are taken from the raw text with a regex.
        'Mentions': re.findall(mention_pattern, tweet_line['full_text']),
        'Likes': tweet_line['favorite_count'],
        'Retweets': tweet_line['retweet_count'],
        'Url': ''.join([
            'https://twitter.com/',
            tweet_line['user']['screen_name'],
            '/status/',
            tweet_line['id_str'],
        ]),
        # Links are stripped before cleaning so they do not end up as tokens.
        'Clean_text': clean(re.sub(link_pattern, '', tweet_line['full_text'])),
    }

In [None]:
# Sanity check: pick one tweet at random and show every extracted field.
l = random.randint(0, len(lines)-1)
new_tweet = feature_extraction(lines[l])

# The raw text goes through repr() so newlines and other escapes stay visible
# on a single line.
print('\n'.join([
    'Id: \t\t{}'.format(new_tweet['Id']),
    'Default tweet: \t' + repr(new_tweet['Text']),
    'Clean Tweet: \t{}'.format(new_tweet['Clean_text']),
    'Username: \t{}'.format(new_tweet['Username']),
    'Date: \t\t{}'.format(new_tweet['Date']),
    'Hashtags: \t{}'.format(new_tweet['Hashtags']),
    'Mentions: \t{}'.format(new_tweet['Mentions']),
    'Likes: \t\t{}'.format(new_tweet['Likes']),
    'Retweets: \t{}'.format(new_tweet['Retweets']),
    'Url: \t\t{}\n'.format(new_tweet['Url']),
]))

In [None]:
def tweets_df(tweets):
    """
    Build a dataframe containing all tweets' info.

    Argument:
    tweets -- iterable of tweet objects (parsed JSON), each one accepted by
              feature_extraction

    Returns:
    df - a dataframe with one row per tweet and the columns Id, Text,
         Clean_text, Username, Date, Hashtags, Mentions, Likes, Retweets
         and Url. An empty input yields an empty dataframe that still has
         these columns.
    """
    columns = ['Id', 'Text', 'Clean_text', 'Username', 'Date', 'Hashtags',
               'Mentions', 'Likes', 'Retweets', 'Url']

    # Collect all rows first and build the frame once. DataFrame.append
    # returned a new frame rather than modifying in place (so the previous
    # loop discarded every row) and it no longer exists in pandas >= 2.0.
    records = [feature_extraction(tweet) for tweet in tweets]

    return pd.DataFrame(records, columns=columns)

In [None]:
# Build the tweets DataFrame from the parsed tweet objects.
data = tweets_df(lines)
# Bare expression on the last line: the notebook renders it as the cell output.
data

In [None]:
# Load the tab-separated map file into a dataframe. Column names are given
# explicitly, so pandas treats the first line of the file as data.
df = pd.read_csv(map_path, sep='\t', names=['Document', 'Id'])

