In [1]:
# if you do not have 'nltk', the following command should work "python -m pip install nltk"
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Iu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import time

import json

In [3]:
docs_path = 'inputs/dataset_tweets_WHO.txt'
with open(docs_path) as fp:
    tweets = json.loads(fp.read())

print("Number of tweets:", len(tweets))

Number of tweets: 2399


In [4]:
lang = {} 
for tweet in tweets:
    if tweets[tweet]['lang'] in lang:
        lang[tweets[tweet]['lang']] += 1
    else:
        lang[tweets[tweet]['lang']] = 1
        
print("Languages:", lang)
print(sum(lang.values()) == len(tweets)) # Check is the number of extracted languages is the same as the number of tweets

Languages: {'en': 2353, 'es': 19, 'in': 2, 'fr': 7, 'und': 1, 'tl': 1, 'de': 6, 'ar': 2, 'ru': 2, 'uk': 1, 'ps': 1, 'ja': 4}
True


In [5]:
# Run the following code if the package is not installed: "pip install num2words"
#!pip install num2words

In [6]:
#pip install emoji

In [7]:
def process_word(word, stop_words):
    """
    Preprocess each word of the tweet getting rid of URLs, punctuation sings and stop words
    
    Argument:
    word -- string (text) to be preprocessed
    stop_words -- list of stop words to get rid of
    
    Returns:
    word - the resulting processed word. False in case we don't want that word
    """

# Eliminate URLs
    word = re.sub(r'http\S+', '', word) 

# Eliminate ampersands
    word = re.sub(r'&\S+', '', word) 

    if not word:
        return False

# Get rid of punctuation marks except "#" and "@"
    if word[0] == '#':
        word = '#' + word.translate(str.maketrans('', '', string.punctuation)) 
        return word

    elif word[0] == '@':
        word = '@' + word.translate(str.maketrans('', '', string.punctuation)) 
        return word

    else:
        word = word.translate(str.maketrans('', '', string.punctuation))

# Get rid of strings like '-'
    if len(word) <= 1 and not word.isdigit(): 
        return False
    
# Eliminate the stopwords 
    elif word not in stop_words: 
        return word

In [8]:
from num2words import num2words
import string
import re
def build_terms(line):
    """
    Preprocess the tweet text calling the process_word function, stemming,
    transforming in lowercase and return the tokens of the text.
    
    Argument:
    line -- string (text) to be preprocessed
    
    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    
    stop_words = set()
    for lang in stopwords.fileids():
         stop_words |= set(stopwords.words(lang))
            
    line = emoji.get_emoji_regexp().sub("", line)
    
    line= line.lower()## Transform in lowercase
    line= line.split() ## Tokenize the text to get a list of terms
    templine=[]
    for word in line:      
        word = process_word(word, stop_words)
        if word:
            templine.append(word)
            
    line= templine
    line= [stemmer.stem(word) for word in line] ## perform stemming
    return line


In [9]:
import emoji
def create_index(tweets):
    """
    Implement the inverted index
    
    Argument:
    lines -- collection of tweets
    
    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of tweets where these keys appears in (and the positions) as values.
    """
    index = defaultdict(list)
    for tweet in tweets:
        line = tweets[tweet]['full_text']
        line_arr = line.replace("\n", ' ')
        tweet_id = tweet
        terms = build_terms(''.join(line_arr))
        
        ## ===============================================================        
        ## create the index for the current page and store it in current_page_index (current_page_index)
        ## current_tweet_index ==> { ‘term1’: [current_tweet, [list of positions]], ...,‘term_n’: [current_tweet, [list of positions]]}

        ## Example: if the curr_tweet has id 1 and his text is 
        ##"web retrieval information retrieval":

        ## current_tweet_index ==> { ‘web’: [1, [0]], ‘retrieval’: [1, [1,4]], ‘information’: [1, [2]]}

        ## the term ‘web’ appears in tweet 1 in positions 0, 
        ## the term ‘retrieval’ appears in tweet 1 in positions 1 and 4
        ## ===============================================================

        current_tweet_index = {}

        for position, term in enumerate(terms): # terms contains the text of the tweet
            try:
                # if the term is already in the index for the current tweet (current_tweet_index)
                # append the position to the corresponding list
                
                current_tweet_index[term][1].append(position)  
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_tweet_index[term]=[tweet_id, array('I',[position])] #'I' indicates unsigned int (int in Python)
            
        #merge the current tweet index with the main index
        for term_page, posting_page in current_tweet_index.items():
            index[term_page].append(posting_page)
                         
                    
    return index

In [10]:
start_time = time.time() # Mark the time it takes for the code to create the indexes
index = create_index(tweets)
print("Total time to create the index: {} seconds".format(np.round(time.time() - start_time, 2)))

Total time to create the index: 25.29 seconds


In [11]:
# Check the indexes by searching the desired words in the document/tweet list

print("Index results for the term 'researcher': {}\n".format(index['researcher']))
print("First 10 Index results for the term 'research': \n{}".format(index['research'][:10]))

Index results for the term 'researcher': []

First 10 Index results for the term 'research': 
[['22', array('I', [16])], ['153', array('I', [19])], ['171', array('I', [13])], ['203', array('I', [5])], ['210', array('I', [7])], ['211', array('I', [5])], ['221', array('I', [6])], ['422', array('I', [12])], ['428', array('I', [3])], ['459', array('I', [22])]]
