In [1]:
# if you do not have 'nltk', the following command should work "python -m pip install nltk"
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#pip install gensim --user

In [3]:
# Run the following code if the package is not installed: "pip install num2words"
#!pip install num2words

In [4]:
#pip install emoji

In [7]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import time
import string
import re
import emoji
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import json

In [8]:
docs_path = 'dataset_tweets_WHO.txt'
with open(docs_path) as fp:
    tweets = json.loads(fp.read())

print("Number of tweets:", len(tweets))

Number of tweets: 2399


In [9]:
lang = {} 
for tweet in tweets:
    if tweets[tweet]['lang'] in lang:
        lang[tweets[tweet]['lang']] += 1
    else:
        lang[tweets[tweet]['lang']] = 1
        
print("Languages:", lang)
print(sum(lang.values()) == len(tweets)) # Check is the number of extracted languages is the same as the number of tweets

Languages: {'en': 2353, 'es': 19, 'in': 2, 'fr': 7, 'und': 1, 'tl': 1, 'de': 6, 'ar': 2, 'ru': 2, 'uk': 1, 'ps': 1, 'ja': 4}
True


In [10]:
def process_word(word, stop_words):
    """
    Preprocess each word of the tweet getting rid of URLs, punctuation sings and stop words
    
    Argument:
    word -- string (text) to be preprocessed
    stop_words -- list of stop words to get rid of
    
    Returns:
    word - the resulting processed word. False in case we don't want that word
    """
    
# Eliminate URLs
    word = re.sub(r'http\S+', '', word) 

# Eliminate ampersands
    word = re.sub(r'&\S+', '', word) 

    if not word:
        return False

# Get rid of punctuation marks except "#" and "@"
    if word[0] == '#':
        word = '#' + word.translate(str.maketrans('', '', string.punctuation)) 
        return word

    elif word[0] == '@':
        word = '@' + word.translate(str.maketrans('', '', string.punctuation)) 
        return word
    
    elif '¿' in word:
        word = word.replace('¿', '')
    
    else:
        word = word.translate(str.maketrans('', '', string.punctuation))

# Get rid of strings like '-'
    if len(word) <= 1 and not word.isdigit(): 
        return False
    
# Eliminate the stopwords 
    elif word not in stop_words: 
        return word

In [11]:
def build_terms(line):
    """
    Preprocess the tweet text calling the process_word function, stemming,
    transforming in lowercase and return the tokens of the text.
    
    Argument:
    line -- string (text) to be preprocessed
    
    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    
    stop_words = set()
    for lang in stopwords.fileids():
         stop_words |= set(stopwords.words(lang))
            
    line = emoji.get_emoji_regexp().sub("", line)
    
    line= line.lower()## Transform in lowercase
    line= line.split() ## Tokenize the text to get a list of terms
    templine=[]
    for word in line:      
        word = process_word(word, stop_words)
        if word:
            templine.append(word)
            
    line= templine
    line= [stemmer.stem(word) for word in line] ## perform stemming
    return line


In [12]:
def create_index_tfidf(tweets, X):
    """
    Implement the inverted index
    
    Argument:
    lines -- collection of tweets
    
    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of tweets where these keys appears in (and the positions) as values.
    """
    index = defaultdict(list)
    tf = defaultdict(list)
    df = defaultdict(int)
    idf = defaultdict(float)
    num_tweets = len(tweets)
    
    likes_tw = defaultdict(list)
    followers_user = defaultdict(list)
    tweet2vec = defaultdict(list)
    rt_tw = defaultdict(list)
    
    for tweet in tweets:
        line = tweets[tweet]['full_text']
        line_arr = line.replace("\n", ' ')
        tweet_id = tweet
        terms = build_terms(''.join(line_arr))
        
        if len(terms) == 0:
            continue
        
        tweet2vec[tweet] = np.array([0.0]*len(X[terms[0]]))
        
        for term in terms:
            tweet2vec[tweet] += np.array(X[term])
        
        tweet2vec[tweet] = tweet2vec[tweet] / len(terms)
        
        ## ===============================================================        
        ## create the index for the current page and store it in current_page_index (current_page_index)
        ## current_tweet_index ==> { ‘term1’: [current_tweet, [list of positions]], ...,‘term_n’: [current_tweet, [list of positions]]}

        ## Example: if the curr_tweet has id 1 and his text is 
        ##"web retrieval information retrieval":

        ## current_tweet_index ==> { ‘web’: [1, [0]], ‘retrieval’: [1, [1,4]], ‘information’: [1, [2]]}

        ## the term ‘web’ appears in tweet 1 in positions 0, 
        ## the term ‘retrieval’ appears in tweet 1 in positions 1 and 4
        ## ===============================================================

        current_tweet_index = {}

        for position, term in enumerate(terms):  ## terms contains the text of the tweet
            try:
                # if the term is already in the index for the current tweet (current_tweet_index)
                # append the position to the corresponding list
                current_tweet_index[term][1].append(position)
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_tweet_index[term]=[tweet_id, array('I',[position])] #'I' indicates unsigned int (int in Python)

        #normalize term frequencies
        # Compute the denominator to normalize term frequencies (formula 2 above)
        # norm is the same for all terms of a tweet.
        norm = 0
        for term, posting in current_tweet_index.items():
            # posting will contain the list of positions for current term in current tweet. 
            # posting ==> [current_tweet, [list of positions]] 
            # you can use it to infer the frequency of current term.
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        #calculate the tf(dividing the term frequency by the above computed norm) and df weights
        for term, posting in current_tweet_index.items():
            # append the tf for current term (tf = term frequency in current tweet/norm)
            tf[term].append(np.round(len(posting[1])/norm,4)) ## SEE formula (1) above
            #increment the tweet frequency of current term (number of tweets containing the current term)
            df[term] += 1# increment DF for current term
            
            likes_tw[term].append(tweets[posting[0]]['favorite_count']) # Compute the amount of likes of the tweet
            followers_user[term].append(tweets[posting[0]]['user']['followers_count']) # Compute the amount of followers of the user who tweeted
            rt_tw[term].append(tweets[posting[0]]['retweet_count'])

        #merge the current tweet index with the main index
        for term_page, posting_page in current_tweet_index.items():
            index[term_page].append(posting_page)

        # Compute IDF following the formula (3) above. HINT: use np.log
        for term in df:
            idf[term] = np.round(np.log(float(num_tweets/df[term])), 4)

    return index, tf, df, idf, likes_tw, followers_user, rt_tw, tweet2vec

In [13]:
def check_index(word, index): # Function that checks and returns if a given word is in the created index
    if word in index:
        return index[word]
    else:
        return []

In [14]:
def rank_tweets(terms, tweets, index, idf, tf):
    """
    Perform the ranking of the results of a search based on the tf-idf weights
    
    Argument:
    terms -- list of query terms
    tweets -- list of tweets, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    title_index -- mapping between page id and page title
    
    Returns:
    Print the list of ranked tweets
    """
    # I'm interested only on the element of the tweetVector corresponding to the query terms 
    # The remaining elements would became 0 when multiplied to the query_vector
    tweet_vectors = defaultdict(lambda: [0] * len(terms)) # I call tweet_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query. 
    # Example: collections.Counter(["hello","hello","world"]) --> Counter({'hello': 2, 'world': 1})
    #HINT: use when computing tf for query_vector

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue
            
        ## Compute tf*idf(normalize TF as done with tweets)
        query_vector[termIndex]=query_terms_count[term]/query_norm * idf[term]

        # Generate tweet_vectors for matching tweets
        for tweet_index, (tweet, postings) in enumerate(index[term]):
            # Example of [tweet_index, (tweet, postings)]
            # 0 (26, array('I', [1, 4, 12, 15, 22, 28, 32, 43, 51, 68, 333, 337]))
            # 1 (33, array('I', [26, 33, 57, 71, 87, 104, 109]))
            # term is in tweet 26 in positions 1,4, .....
            # term is in tweet 33 in positions 26,33, .....

            #tf[term][0] will contain the tf of the term "term" in the tweet 26            
            if tweet in tweets:
                tweet_vectors[tweet][termIndex] = tf[term][tweet_index] * idf[term]  # check if multiply for idf

    # Calculate the score of each tweet 
    # compute the cosine similarity between queyVector and each tweetVector:
    # We use the dot product because in case of normalized vectors it corresponds to the cosine similarity
    
    tweet_scores=[[np.dot(curtweetVec, query_vector), tweet] for tweet, curtweetVec in tweet_vectors.items() ]
    tweet_scores.sort(reverse=True)

    result_tweets = [x[1] for x in tweet_scores]

    if len(result_tweets) == 0:
        print("No results found, try again")
        return False

    return result_tweets, tweet_scores

In [15]:
def rank_tweets_ours(terms, tweets, index, idf, tf, likes, followers, retweets, all_tweets):
    """
    Perform the ranking of the results of a search based on the tf-idf weights
    
    Argument:
    terms -- list of query terms
    tweets -- list of tweets, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    title_index -- mapping between page id and page title
    
    Returns:
    Print the list of ranked tweets
    """
    # I'm interested only on the element of the tweetVector corresponding to the query terms 
    # The remaining elements would become 0 when multiplied to the query_vector
    tweet_vectors = defaultdict(lambda: [0] * len(terms)) # I call tweet_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query. 
    # Example: collections.Counter(["hello","hello","world"]) --> Counter({'hello': 2, 'world': 1})
    #HINT: use when computing tf for query_vector
    query_norm = la.norm(list(query_terms_count.values()))
    
##############

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        #if term not in index:
        #    continue
        like_avg = sum(likes[term]) / len(likes[term])
        rt_avg = sum(retweets[term]) / len(retweets[term])
        followers_avg = sum(followers[term]) / len(followers[term])
        ## Compute tf*idf(normalize TF as done with tweets)

        query_vector[termIndex]=query_terms_count[term]/query_norm * idf[term] + (like_avg + rt_avg/followers_avg)*100

        # Generate tweet_vectors for matching tweets
        for tweet_index, (tweet, postings) in enumerate(index[term]):
            # Example of [tweet_index, (tweet, postings)]
            # 0 (26, array('I', [1, 4, 12, 15, 22, 28, 32, 43, 51, 68, 333, 337]))
            # 1 (33, array('I', [26, 33, 57, 71, 87, 104, 109]))
            # term is in tweet 26 in positions 1,4, .....
            # term is in tweet 33 in positions 26,33, .....

            #tf[term][0] will contain the tf of the term "term" in the tweet 26            
            if tweet in tweets:
                tweet_vectors[tweet][termIndex] = tf[term][tweet_index] * idf[term] + (like_avg + rt_avg/followers_avg)*100 #all_tweets[tweet]['favorite_count']  # TODO: check if multiply for idf
    
    # Calculate the score of each tweet 
    # compute the cosine similarity between queyVector and each tweetVector:
    # HINT: you can use the dot product because in case of normalized vectors it corresponds to the cosine similarity
    # see np.dot
    
    tweet_scores=[[np.dot(curtweetVec, query_vector), tweet] for tweet, curtweetVec in tweet_vectors.items() ]
    tweet_scores.sort(reverse=True)

    result_tweets = [x[1] for x in tweet_scores]

    if len(result_tweets) == 0:
        print("No results found, try again")
        return False

    return result_tweets, tweet_scores

In [16]:
def rank_tweets_word2vec(terms, tweets, index, idf, tf, model, tweet2vec):
    """
    Perform the ranking of the results of a search based on the tf-idf weights
    
    Argument:
    terms -- list of query terms
    tweets -- list of tweets, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    title_index -- mapping between page id and page title
    
    Returns:
    Print the list of ranked tweets
    """
    
    # We will first have to compute each query as a normalized vector of words
    X = model.wv
    query_vector = np.array([0.0]*len(X[terms[0]])) # Create the query vector
    
    for word in terms:
        query_vector += np.array(X[word]) # Add the words
    
    query_vector = query_vector/len(terms) # Normalize
    
    tweet_vectors = {}
    
    for tweet in tweets:
        tweet_vectors[tweet] = tweet2vec[tweet]

#############
    # Calculate the score of each tweet 
    # compute the cosine similarity between queyVector and each tweetVector:
    # HINT: you can use the dot product because in case of normalized vectors it corresponds to the cosine similarity
    # see np.dot
    
    tweet_scores=[[np.dot(curtweetVec, query_vector), tweet] for tweet, curtweetVec in tweet_vectors.items() ]
    tweet_scores.sort(reverse=True) # Compute and sort by scores

    result_tweets = [x[1] for x in tweet_scores] # Return the top ranked tweets of the query

    if len(result_tweets) == 0:
        print("No results found, try again")
        return False

    return result_tweets, tweet_scores

In [17]:
def query_output(query, index, idf, tf, likes, followers, retweets,  tweets, mode, model = None):
    
    query_words = build_terms(query) # Separate query by preprocessed terms
    
    docs = {}
    for word in query_words: # Save the documents in which query terms appear
        docs[word] = [val[0] for val in check_index(word, index)]
        if len(docs[word]) == 0:
            return ['No tweets found'], None # Reference for when the query terms are not found in any tweet
    
    doc_list = list(docs.values())
    final_doc_list = doc_list[0]
    for list_ in doc_list:
        final_doc_list = list(set(final_doc_list) & set(list_))
        
    # Depending on the selected mode, the computed score and output ranking will be one or another
    # - Mode 1: Our score and ranking, taking into account the number of likes and followers
    # - Mode 2: Word2Vec score
    # - Otherwise: Use the initial tf-idf score
    if mode == 1:
        ranked_tweets, tweet_scores = rank_tweets_ours(query_words, final_doc_list, index, idf, tf, likes, followers, retweets, tweets)
    elif mode == 2:
        ranked_tweets, tweet_scores = rank_tweets_word2vec(query_words, final_doc_list, index, idf, tf, model, tweet2vec)
    else:
        ranked_tweets, tweet_scores = rank_tweets(query_words, final_doc_list, index, idf, tf)
    
    return ranked_tweets, tweet_scores
        
def display_tweets(list_): # Function that will be used to display the resulting list of ranked tweets
    if 'No tweets found' in list_:
        print('No tweets found matching this query')
        return 
    
    if len(list_) == 0:
        return 'No tweets'
    
    for tw in list_:
        print('\033[1m'+'Tweet index:'+'\033[0m', tw) # Tweet
        print('\033[1m'+'Tweet:'+'\033[0m', tweets[tw]['full_text']) # Text
        print('\033[1m'+'Username:'+'\033[0m', tweets[tw]['user']['name']) # Username
        print('\033[1m'+'Date:'+'\033[0m', tweets[tw]['created_at']) # Date of creation
        print('\033[1m'+'Hashtags:'+'\033[0m', [d['text'] for d in tweets[tw]['entities']['hashtags']]) # Hashtags
        print('\033[1m'+'Favourites:'+'\033[0m', tweets[tw]['favorite_count']) # Likes
        print('\033[1m'+'Retweets:'+'\033[0m', tweets[tw]['retweet_count']) # Retweets
        print('\033[1m'+'URL:'+'\033[0m', 'https://twitter.com/'+tweets[tw]['user']['screen_name']+'/status/'+str(tweets[tw]['id']), '\n') # URL
        print('\n')
    return

In [18]:
clean_tweets = {}
for tweet in tweets:
    line = tweets[tweet]['full_text']
    line_arr = line.replace("\n", ' ')
    terms = build_terms(''.join(line_arr))
    clean_tweets[tweet] = terms
    
sentences = clean_tweets.values()

model = Word2Vec(sentences, workers=4, min_count=1, window=10, sample=1e-5)

In [19]:
start_time = time.time() # Mark the time it takes for the code to create the index
index, tf, df, idf, likes, followers, retweets, tweet2vec = create_index_tfidf(tweets, model.wv)
print("Total time to create the index: {} seconds".format(np.round(time.time() - start_time, 2)))

Total time to create the index: 320.19 seconds


In [20]:
queries = ['covid19 vaccine', 'death risk', 'world pandemic', 'health service', 'developing countries', 'molta caca']

mode_ = 1

for query_text in queries:
    out, scores = query_output(query_text, index , idf, tf, likes, followers, retweets, tweets, mode_, model)
    if out == ['No tweets found']: # In case no tweets are found
        print('No tweets found matching query:', query_text)
    else:
        print(out[:20])
        print('\033[1m'+'Top 20 tweets from query "{}":'.format(query_text)+'\033[0m')
        display_tweets(out[:20])
        print("###############################################################\n\n")


['2257', '1959', '904', '2190', '2186', '1045', '2188', '2193', '935', '2195', '1772', '2187', '1565', '1310', '266', '2191', '1849', '310', '2202', '2192']
[1mTop 20 tweets from query "covid19 vaccine":[0m
[1mTweet index:[0m 2257
[1mTweet:[0m Q&amp;A #AskWHO on COVID-19 vaccines effectiveness https://t.co/FEdfOREhjn
[1mUsername:[0m World Health Organization (WHO)
[1mDate:[0m Wed Jun 30 16:12:43 +0000 2021
[1mHashtags:[0m ['AskWHO']
[1mFavourites:[0m 219
[1mRetweets:[0m 85
[1mURL:[0m https://twitter.com/WHO/status/1410270080873598979 



[1mTweet index:[0m 1959
[1mTweet:[0m 💉💉💉💉
💉💉💉💉
💉💉💉💉
💉💉💉💉
💉💉💉💉                 💉💉💉💉
💉💉💉💉                 💉💉💉💉

COVID-19 vaccines     COVID-19 vaccines
in 10 countries             in the rest of the 🌍

#VaccinEquity is 🗝️ to ending the pandemic, together!

#WorldEmojiDay
[1mUsername:[0m World Health Organization (WHO)
[1mDate:[0m Sat Jul 17 16:24:23 +0000 2021
[1mHashtags:[0m ['VaccinEquity', 'WorldEmojiDay']
[1mFavourites:[

In [21]:
# Interactive way of testing the engine (put your own query and select the ranking mode)
print("Insert your query:\n")
query = input()

print('\nInsert the type of ranking you want to follow:')
print('- Mode 1: Our score and ranking, taking into account the number of likes and followers')
print('- Mode 2: Word2Vec score')
print('- Otherwise: Use the initial tf-idf score')
mode = input()
ranked_tweets, tmp = query_output(query_text, index , idf, tf, likes, followers, retweets, tweets, mode_, model)
top = 20

if ranked_tweets:
    print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_tweets)))
    display_tweets(ranked_tweets[:top])

Insert your query:

hola:)

Insert the type of ranking you want to follow:
- Mode 1: Our score and ranking, taking into account the number of likes and followers
- Mode 2: Word2Vec score
- Otherwise: Use the initial tf-idf score
2

Top 20 results out of 1 for the searched query:

No tweets found matching this query
