# Information Retrieval and Web Analytics

# Part 1: Text Processing

Authors: 
*   <font color="blue">Miquel Casas Olivella</font>
*   <font color="blue">Aina Moncho Roig</font>
*   <font color="blue">Marina Suárez Blázquez</font>

#### Load Python packages
Let's first import all the packages that we will need during this assignment.

In [1]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from numpy import linalg as la
from datetime import datetime
import math
import numpy as np
import collections
import json # this will help us read the file
import re
import pandas as pd

In [2]:
# if you do not have 'nltk', the following command should work "python -m pip install nltk"
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

We are first going to read the tweets using json python's library.

In [3]:
data = [json.loads(line) for line in open('tw_hurricane_data.json', 'r')]
print('Total number of tweets: {}'.format(len(data)))

Total number of tweets: 4000


#### Pre-processing the documents


1. Removing stop words
2. Tokenization
3. Removing punctuation marks
4. Stemming
5. Bonus point





In [4]:
def build_terms(line):
  stemmer = PorterStemmer()
  stop_words = set(stopwords.words('english'))
  line = line.lower()
  line = re.sub('http[s]?://\S+', '', line) # BONUS: Remove urls
  line = re.sub('[\W_]+', ' ', line) # BONUS: Remove emojis, simbols...
  line = line.split()  # Tokenize the text to get a list of terms
  line = [x for x in line if x not in stop_words]  # Eliminate the stopwords
  line = [stemmer.stem(word) for word in line] # Perform stemming
  return line

In [5]:
i = 0
for tweet in data:  # Data contains all tweets

  # Keeping the needed tweet fields
  tweet = {'Tweet': tweet['full_text'], 
           'Username': tweet['user']['name'], 
           'Date': tweet['created_at'], 
           'Hashtags': tweet['entities']['hashtags'],
           'Likes': tweet['favorite_count'],
           'Retweets': tweet['retweet_count'],
           'Url': tweet['entities']['media'][0]['url'] if ( tweet['entities'].get('media') is not None) else {}} # encontrar que url es
  
  # Displaying 5 tweets before processing them
  if i<5:
    print('\033[1m' + 'UNPROCESSED' + '\033[0m')
    print(tweet) 
  
  # Processing each of the fields
  for key in tweet.keys():
    if key == 'Tweet':
      tweet[key]=build_terms(tweet[key]) # Pre-processing the full text
    if key == 'Username':
      tweet[key]=re.sub('[\W_]+',' ',tweet[key]) # Remove emojis, simbols... from username    
    if key == 'Date':
      tweet[key]=datetime.strptime(tweet[key], '%a %b %d %H:%M:%S %z %Y') # Changing Date format into an understandable format for python
    if key == 'Hashtags':
      tweet[key]=[hashtag['text'] for hashtag in tweet[key]] # Creating a list of all hashtags (without indices)     
  
  # Displaying 5 tweets after processing them
  if i<5:
    print('\033[1m' + 'PROCESSED' + '\033[0m')
    print(tweet, '\n') 

  data[i] = tweet
  i += 1 

[1mUNPROCESSED[0m
{'Tweet': 'So this will keep spinning over us until 7 pm…go away already. #HurricaneIan https://t.co/VROTxNS9rz', 'Username': 'Suz👻', 'Date': 'Fri Sep 30 18:39:08 +0000 2022', 'Hashtags': [{'text': 'HurricaneIan', 'indices': [63, 76]}], 'Likes': 0, 'Retweets': 0, 'Url': 'https://t.co/VROTxNS9rz'}
[1mPROCESSED[0m
{'Tweet': ['keep', 'spin', 'us', '7', 'pm', 'go', 'away', 'alreadi', 'hurricaneian'], 'Username': 'Suz ', 'Date': datetime.datetime(2022, 9, 30, 18, 39, 8, tzinfo=datetime.timezone.utc), 'Hashtags': ['HurricaneIan'], 'Likes': 0, 'Retweets': 0, 'Url': 'https://t.co/VROTxNS9rz'} 

[1mUNPROCESSED[0m
{'Tweet': 'Our hearts go out to all those affected by #HurricaneIan. We wish everyone on the roads currently braving the conditions safe travels. 💙', 'Username': 'Lytx', 'Date': 'Fri Sep 30 18:39:01 +0000 2022', 'Hashtags': [{'text': 'HurricaneIan', 'indices': [43, 56]}], 'Likes': 0, 'Retweets': 0, 'Url': {}}
[1mPROCESSED[0m
{'Tweet': ['heart', 'go', 'affect',

In [6]:
df = pd.DataFrame.from_dict(data) # Converting the list of dictionaries into a dataframe
doc_ids = pd.read_csv('tweet_document_ids_map.csv', names=['doc_id']) # Reading the tweet_document_ids_map
df_final = pd.merge(doc_ids, df, left_index=True, right_index=True) # Mapping the tweets with the document ids
df_final

Unnamed: 0,doc_id,Tweet,Username,Date,Hashtags,Likes,Retweets,Url
0,doc_1\t1575918182698979328,"[keep, spin, us, 7, pm, go, away, alreadi, hur...",Suz,2022-09-30 18:39:08+00:00,[HurricaneIan],0,0,https://t.co/VROTxNS9rz
1,doc_2\t1575918151862304768,"[heart, go, affect, hurricaneian, wish, everyo...",Lytx,2022-09-30 18:39:01+00:00,[HurricaneIan],0,0,{}
2,doc_3\t1575918140839673873,"[kissimme, neighborhood, michigan, ave, hurric...",Christopher Heath,2022-09-30 18:38:58+00:00,[HurricaneIan],0,0,https://t.co/jf7zseg0Fe
3,doc_4\t1575918135009738752,"[one, tree, backyard, scare, poltergeist, tree...",alex,2022-09-30 18:38:57+00:00,"[scwx, HurricaneIan]",0,0,{}
4,doc_5\t1575918119251419136,"[ashleyruizwx, stephan89441722, lilmizzheidi, ...",Tess,2022-09-30 18:38:53+00:00,[HurricaneIan],0,0,{}
...,...,...,...,...,...,...,...,...
3995,doc_3996\t1575856268022992896,"[cfrd, carrboropd, carrboro, public, work, car...",Carrboro Fire Rescue,2022-09-30 14:33:06+00:00,"[CarrboroSafe, ncwx, HurricaneIan]",2,0,{}
3996,doc_3997\t1575856245650919424,"[osceolacountyfl, list, fema, websit, widespre...",BaconBitsNews,2022-09-30 14:33:01+00:00,"[Kissimmee, SaintCloud, BlueCounty, Disney, De...",0,0,{}
3997,doc_3998\t1575856228886089728,"[realli, hurricaneian, flood, florida, magatea...",jganyfl,2022-09-30 14:32:57+00:00,"[HurricaneIan, Florida, MAGATears]",16,8,https://t.co/9VPkyjZvWO
3998,doc_3999\t1575856226139017216,"[damag, area, punta, gorda, thread, tropic, gu...",CJ Haddad,2022-09-30 14:32:56+00:00,[HurricaneIan],2,1,https://t.co/jcVCdY2FQ6
