In [1]:
# Requires the third-party 'nltk' package. If it is not installed, run:
#     python -m pip install nltk
import nltk
# Download the NLTK 'stopwords' corpus, which is read later through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import time

import json

In [3]:
# Load the tweet collection: a JSON document that later cells index by key,
# reading the 'lang' and 'full_text' fields of each record.
docs_path = 'dataset_tweets_WHO.txt'
# JSON text is UTF-8 by specification. Passing the encoding explicitly keeps the
# result independent of the platform's default locale encoding (e.g. cp1252 on Windows).
with open(docs_path, encoding='utf-8') as fp:
    tweets = json.load(fp)  # parse directly from the file object

print("Number of tweets:", len(tweets))

Number of tweets: 2399


In [10]:
# Tally how many tweets carry each language code.
lang = {}
for tweet in tweets:
    tweet_lang = tweets[tweet]['lang']
    lang[tweet_lang] = lang.get(tweet_lang, 0) + 1

print("Languages:", lang)
# Sanity check: the per-language counts must add up to the total number of tweets.
print(sum(lang.values()) == len(tweets))

Languages: {'en': 2353, 'es': 19, 'in': 2, 'fr': 7, 'und': 1, 'tl': 1, 'de': 6, 'ar': 2, 'ru': 2, 'uk': 1, 'ps': 1, 'ja': 4}
True


In [5]:
# If the 'num2words' package is not installed, run: "python -m pip install num2words"

In [6]:
import num2words

def build_terms(line):
    """
    Preprocess a text: transform it to lowercase, tokenize it on whitespace,
    spell out purely numeric tokens, remove English stop words and stem the rest.

    Argument:
    line -- string (text) to be preprocessed

    Returns:
    line - a list of stemmed tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))

    kept = []
    for word in line.lower().split():
        # Tokens produced by str.split() are always strings, so a numeric token has to
        # be recognised by its content (an isinstance(word, int) check can never match).
        # Spelling numbers out avoids having "2" and "two" as different terms.
        if word.isdecimal():
            try:
                spelled = num2words.num2words(int(word))
            except (OverflowError, ValueError):
                # NOTE(review): presumably raised for numbers num2words cannot spell
                # out (e.g. too large) -- confirm; the raw token is kept in that case.
                candidates = [word]
            else:
                # A number may expand to several words joined by hyphens or commas
                # (e.g. "twenty-one"); index each word as its own token.
                candidates = spelled.lower().replace("-", " ").replace(",", " ").split()
        else:
            candidates = [word]

        # Eliminate the stop words (including any produced by spelling out a number).
        kept.extend(token for token in candidates if token not in stop_words)

    # Perform stemming on the surviving tokens.
    return [stemmer.stem(token) for token in kept]


In [7]:
def create_index(tweets):
    """
    Build a positional inverted index over a collection of tweets.

    Argument:
    tweets -- dictionary mapping each tweet id to a tweet record; the text to index
              is read from the record's 'full_text' field

    Returns:
    index - the inverted index (a defaultdict(list)) containing terms as keys and, as
    values, a list of postings [tweet_id, array('I', positions)], one posting per tweet
    in which the term appears.
    title_index - an empty dictionary; it is never filled here and is returned only so
    that callers unpacking two values keep working.
    """
    index = defaultdict(list)
    title_index = {}  # not populated for tweets; kept for interface compatibility
    for tweet_id in tweets:
        # Flatten multi-line tweets into a single line before preprocessing.
        text = tweets[tweet_id]['full_text'].replace("\n", ' ')
        terms = build_terms(text)

        ## ===============================================================
        ## Build the index for the current tweet only:
        ## current_tweet_index ==> {'term1': [tweet_id, [positions]], ..., 'term_n': [tweet_id, [positions]]}
        ##
        ## Example: if the tweet has id 1 and its terms are
        ## ["web", "retrieval", "information", "retrieval"]:
        ##
        ## current_tweet_index ==> {'web': [1, [0]], 'retrieval': [1, [1, 3]], 'information': [1, [2]]}
        ## ===============================================================
        current_tweet_index = {}

        for position, term in enumerate(terms):
            posting = current_tweet_index.get(term)
            if posting is None:
                # First occurrence in this tweet: start a new posting.
                # 'I' is the array typecode for unsigned int.
                current_tweet_index[term] = [tweet_id, array('I', [position])]
            else:
                # Term already seen in this tweet: record the additional position.
                posting[1].append(position)

        # Merge the current tweet's postings into the main index.
        for term, posting in current_tweet_index.items():
            index[term].append(posting)

    return index, title_index

In [8]:
# Measure the wall-clock time needed to build the inverted index.
start_time = time.time()
index, title_index = create_index(tweets)
elapsed_seconds = np.round(time.time() - start_time, 2)
print("Total time to create the index: {} seconds".format(elapsed_seconds))

Total time to create the index: 3.0 seconds


In [12]:
# Spot-check the index by looking up two terms and printing their postings.
# Index keys are the tokens returned by build_terms, i.e. stemmed terms.
researcher_postings = index['researcher']
research_first_ten = index['research'][:10]

print("Index results for the term 'researcher': {}\n".format(researcher_postings))
print("First 10 Index results for the term 'research': \n{}".format(research_first_ten))

Index results for the term 'researcher': []

First 10 Index results for the term 'research': 
[['153', array('I', [27])], ['210', array('I', [9])], ['211', array('I', [6])], ['428', array('I', [5])], ['814', array('I', [0])], ['1246', array('I', [15])], ['1341', array('I', [9])], ['1383', array('I', [3, 13])], ['1384', array('I', [14])], ['1389', array('I', [7])]]
