In [2]:
%pip install pycountry==20.7.3

Collecting pycountry
  Downloading pycountry-20.7.3.tar.gz (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 26.0 MB/s 
[?25hBuilding wheels for collected packages: pycountry
  Building wheel for pycountry (setup.py) ... [?25l[?25hdone
  Created wheel for pycountry: filename=pycountry-20.7.3-py2.py3-none-any.whl size=10746883 sha256=648e8c1111ab346848f83264c840284023ed116a1003f989a6b1886da194e470
  Stored in directory: /root/.cache/pip/wheels/57/e8/3f/120ccc1ff7541c108bc5d656e2a14c39da0d824653b62284c6
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-20.7.3


In [3]:
import nltk
import json
import pycountry
import re
import collections
from numpy import linalg as la
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import math
import numpy as np
from nltk.corpus import stopwords
from gensim.models import Word2Vec 
from sklearn.manifold import TSNE
import pandas as pd

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
# Mount Google Drive first: the project directory below lives inside the
# mounted drive, so changing into it before mounting fails on a fresh kernel.
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/Shareddrives/Information retrieval/Project

Mounted at /content/drive


### Loading the dataset


In [5]:
def load_tweets(path='data/dataset_tweets_WHO.txt'):
    """
    Load the id, text and language of every tweet stored in a JSON-lines file.

    Every non-blank line is parsed as a JSON object whose keys are tweet ids
    and whose values are tweet objects providing 'full_text' and 'lang'.

    Argument:
    path -- location of the dataset file (defaults to the project dataset)

    Returns: list of [tweet_id, full_text, lang] lists, in file order.
    """
    tweets = []
    with open(path, "r", encoding='utf-8') as fp:
        for line in fp:
            if not line.strip():
                # a blank line (e.g. a trailing newline) is not valid JSON; skip it
                continue
            for tweet_id, tweet in json.loads(line).items():
                tweets.append([tweet_id, tweet['full_text'], tweet['lang']])
    return tweets

In [6]:
def remove_stopwords(string,language):
    """
    Drop the stop words of the given language from a list of tokens.

    Arguments:
    string -- list of tokens (despite the name, not a raw string)
    language -- language name passed to stopwords.words()

    Returns: new list with the remaining tokens, in their original order.
    """
    # the stop-word set depends on the language of the tweet
    excluded = set(stopwords.words(language))
    kept = []
    for token in string:
        if token in excluded:
            continue
        kept.append(token)
    return kept


def stemming(string,language):
    """
    Stem every token of a list.

    The Snowball stemmer is used when `language` is listed in
    SnowballStemmer.languages; otherwise the Porter stemmer is the fallback.

    Arguments:
    string -- list of tokens (despite the name, not a raw string)
    language -- language name used to select the stemmer

    Returns: list with the stem of each token, in the same order.
    """
    fallback = PorterStemmer()
    if language in SnowballStemmer.languages:
        stemmer = SnowballStemmer(language)
    else:
        stemmer = fallback
    return [stemmer.stem(token) for token in string]

def clean(string,language='en'):
    """
    Turn a raw text into a list of processed tokens.

    Steps: lowercase, split on whitespace, strip every character that is not
    a-z, 0-9, '#' or '@', drop tokens left empty, remove stop words, stem.

    Arguments:
    string -- raw text
    language -- 2-letter code (looked up as alpha_2), 3-letter code (looked up
                as alpha_3) or any other string, which is used unchanged as the
                language name

    Returns: list of processed tokens.
    """
    tokens = []
    for raw_token in string.lower().split():
        # keep only letters, digits, '#' and '@'
        stripped = re.sub("[^a-z0-9#@]","",raw_token)
        if stripped != '':
            tokens.append(stripped)

    # translate a language code into the lowercase language name
    code_length = len(language)
    if code_length == 2:
        language = pycountry.languages.get(alpha_2=language).name.lower()
    elif code_length == 3:
        language = pycountry.languages.get(alpha_3=language).name.lower()

    without_stopwords = remove_stopwords(tokens,language)
    return stemming(without_stopwords,language)

In [11]:
tweets = load_tweets()

# Pre-process every tweet. A tweet for which clean() raises is left out of
# filtered_tweets, but it is recorded in skipped_tweets (id, language, error)
# instead of being discarded silently, so the loss can be inspected.
filtered_tweets = []
skipped_tweets = []
for tweet_id, text, lang in tweets:
    try:
        filtered_tweets.append([tweet_id, clean(text, lang), lang])
    except Exception as error:  # not a bare except: Ctrl-C must still interrupt the loop
        skipped_tweets.append([tweet_id, lang, repr(error)])

print("%d tweets pre-processed, %d skipped" % (len(filtered_tweets), len(skipped_tweets)))

# Indexing


In [12]:
from collections import defaultdict
from array import array

In [13]:
def create_index(tweets):
    """
    Build a positional inverted index over pre-processed tweets.

    Argument:
    tweets -- iterable of entries whose first item is the tweet id (anything
              int() accepts) and whose second item is the list of terms

    Returns: defaultdict mapping each term to a list of postings
    [tweet_id, array('I', positions)], one posting per tweet containing the
    term, in input order.

    Example: tweet 1 with terms ["web", "retrieval", "information", "retrieval"]
    contributes 'web': [1, [0]], 'retrieval': [1, [1, 3]], 'information': [1, [2]].
    """
    index = defaultdict(list)
    for tweet in tweets:
        doc_id = int(tweet[0])

        # postings of the current tweet only: term -> [doc_id, positions]
        postings = {}
        for pos, token in enumerate(tweet[1]):
            if token in postings:
                postings[token][1].append(pos)
            else:
                # 'I' is the array typecode for unsigned int
                postings[token] = [doc_id, array('I', [pos])]

        # merge the postings of the current tweet into the main index
        for token, posting in postings.items():
            index[token].append(posting)

    return index

In [14]:
index_ = create_index(filtered_tweets)

In [15]:
def get_document_information(path='data/dataset_tweets_WHO.txt'):
    """
    Load the display information of every tweet stored in a JSON-lines file.

    Argument:
    path -- location of the dataset file (defaults to the project dataset)

    Returns: dict mapping each tweet key (as found in the file) to the list
    [full_text, user name, created_at, hashtag texts, favorite_count,
     retweet_count, tweet url].
    """
    tweets = {}
    with open(path, "r", encoding='utf-8') as fp:
        for line in fp:
            if not line.strip():
                # a blank line (e.g. a trailing newline) is not valid JSON; skip it
                continue
            for key, tweet in json.loads(line).items():
                user = tweet['user']
                tweets[key] = [
                    tweet['full_text'],
                    user['name'],
                    tweet['created_at'],
                    [hashtag['text'] for hashtag in tweet['entities']['hashtags']],
                    tweet['favorite_count'],
                    tweet['retweet_count'],
                    "https://twitter.com/%s/status/%s" % (user['screen_name'], tweet['id_str']),
                ]

    return tweets

In [16]:
doc_info = get_document_information()
num_documents = len(doc_info)

In [17]:
def create_index_tfidf(tweets, num_documents):
    """
    Implement the inverted index and compute tf, df and idf

    Argument:
    tweets -- iterable of entries whose first item is the tweet id (anything
              int() accepts) and whose second item is the list of terms
    num_documents -- total number of documents

    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of postings [tweet_id, array of positions] as values.
    tf - normalized term frequency for each term in each document (same order as the postings in index):
         (occurrences of the term in the document) / sqrt(sum of squared occurrences of all the document's terms)
    df - number of documents each term appear in
    idf - inverse document frequency of each term: log(num_documents / df)
    """
    index = defaultdict(list)
    tf = defaultdict(list)  #term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  #document frequencies of terms in the corpus
    idf = defaultdict(float)

    for tweet in tweets:
        tweet_id = int(tweet[0])

        # index of the current tweet only: term -> [tweet_id, positions]
        current_page_index = {}
        for position, term in enumerate(tweet[1]):
            if term in current_page_index:
                current_page_index[term][1].append(position)
            else:
                current_page_index[term] = [tweet_id, array('I', [position])]  #'I' indicates unsigned int

        # The norm is the same for all terms of a document. The number of
        # occurrences of a term is the length of its positions array, posting[1]
        # (len(posting) itself is always 2: the id and the array).
        norm = math.sqrt(sum(len(posting[1]) ** 2 for posting in current_page_index.values()))

        for term, posting in current_page_index.items():
            tf[term].append(np.round(len(posting[1]) / norm, 4))
            df[term] += 1  # one more document contains the term
            index[term].append(posting)  # merge into the main index

    # idf only depends on the final document frequencies, so it is computed
    # once after all tweets are indexed rather than after every tweet.
    for term in df:
        idf[term] = np.round(np.log(float(num_documents / df[term])), 4)

    return index, tf, df, idf


In [18]:
index,tf,df,idf = create_index_tfidf(filtered_tweets,num_documents)

# Ranking methods
## 1a - TF-IDF + cosine similarity

In [19]:
topK = 20

In [20]:
def rank_tfidf(terms, docs, doc_info, index, idf, tf):
    """
    Perform the ranking of the results of a search based on the tf-idf weights

    Argument:
    terms -- list of query terms
    docs -- collection of (hashable) document ids, to rank, matching the query
    doc_info -- not used by this ranking; accepted so that all ranking
                functions share the same signature
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies

    Returns: list of [score, doc] pairs, unsorted. The score is the dot product
    between the tf-idf query vector and the tf-idf document vector restricted
    to the query terms. A message is printed when the list is empty.
    """
    # set lookup instead of scanning the docs list once per posting
    candidate_docs = set(docs)

    # Only the components of a document vector that correspond to query terms
    # matter: the remaining ones are multiplied by 0 in the dot product.
    # Accessing doc_vectors[k] for a new k inserts (k, [0] * len(terms)).
    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    # frequency of each term in the query, and the norm used to normalize it
    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf (TF normalized as done with documents)
        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        # tf[term] is aligned with index[term]: same position, same document
        for doc_index, (doc, postings) in enumerate(index[term]):
            if doc in candidate_docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]

    doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]

    if len(doc_scores) == 0:
        print("No results found, try again")
    return doc_scores

## 1b - Custom score + cosine similarity

In [100]:
def perform_date_substraction(tweet_date, now=None):
    """
    Return the age of a tweet, in seconds.

    Arguments:
    tweet_date -- creation date in the format '%a %b %d %H:%M:%S %z %Y'
                  (e.g. 'Wed Oct 13 11:00:00 +0000 2021')
    now -- timezone-aware datetime the age is measured against; defaults to
           the current time in UTC

    Returns: float, seconds elapsed between tweet_date and now.
    """
    # Imported here so the function does not depend on an import made in a
    # later cell of the notebook.
    from datetime import datetime, timezone

    if now is None:
        now = datetime.now(timezone.utc)
    # Both datetimes are timezone-aware, so the UTC offset of the tweet is
    # honoured. Dropping it and comparing against naive local time would shift
    # the result by the local UTC offset.
    tweet_datetime = datetime.strptime(tweet_date, '%a %b %d %H:%M:%S %z %Y')
    return (now - tweet_datetime).total_seconds()

In [132]:
from datetime import datetime
def rank_custom(terms, docs,doc_info, index, idf, tf):
    """
    Rank documents by combining popularity, tf-idf relevance and recency.

    For every document the score is the sum of three parts:
    - popularity: 0.5 * favorite_count + 0.5 * retweet_count, divided by the
      sum of that value over all documents
    - relevance: tf-idf score (rank_tfidf) divided by the sum of the tf-idf
      scores, scaled by 0.4
    - recency: (1 - age / sum of ages) scaled by 1.2, where the age in seconds
      comes from perform_date_substraction

    A part whose normalizing sum is 0 uses a share of 0 for every document
    instead of dividing by zero.

    Arguments:
    terms -- list of query terms
    docs -- collection of document ids to rank; str(id) must be a key of doc_info
    doc_info -- dict of document information; item [2] is the creation date and
                items [4:6] are the favorite and retweet counts
    index, idf, tf -- passed through to rank_tfidf

    Returns: list of [score, doc] pairs, unsorted; [] (after printing a
    message) when docs is empty.
    """
    tfidf_weight = 0.4
    recency_weight = 1.2

    if not docs:
        print("No results found, try again")
        return []

    def _shares(values):
        # share of each document in the total; all zeros if the total is 0
        total = sum(values.values())
        if total == 0:
            return {doc: 0.0 for doc in values}
        return {doc: value / total for doc, value in values.items()}

    # popularity: average of likes and retweets
    query_vector = [0.5, 0.5]
    popularity = _shares({doc: float(np.dot(query_vector, doc_info[str(doc)][4:6])) for doc in docs})

    relevance = _shares({doc: score for score, doc in rank_tfidf(terms, docs, doc_info, index, idf, tf)})

    age = _shares({doc: perform_date_substraction(doc_info[str(doc)][2]) for doc in docs})

    # one pass over the documents, using dict lookups for each part
    total_scores = []
    for doc in set(relevance) | set(popularity):
        score = 0
        if doc in relevance:
            score += relevance[doc] * tfidf_weight
        if doc in popularity:
            score += popularity[doc]
        if doc in age:
            # older documents hold a larger share of the total age, so they score lower
            score += (1 - age[doc]) * recency_weight
        total_scores.append([score, doc])
    return total_scores

## 2 - Word2Vec + cosine similarity

In [22]:
def w2v_vector(text, w2v_model):
    '''
    return the mean vector created from the embedded matrix of the word2vec model

    Arguments:
    text -- list of terms
    w2v_model -- model exposing `wv` (term -> vector lookup) and `vector_size`

    Returns: float64 numpy array of length w2v_model.vector_size with the mean
    of the vectors of the terms known to the model; all zeros when no term is
    known.
    '''
    # Compute W2V embedding for each term
    w2v_vectors = []
    for term in text:
        try:
            w2v_vectors.append(w2v_model.wv[term])
        except KeyError:
            # term is not in the model's vocabulary: it does not contribute
            continue

    if len(w2v_vectors) == 0:
        return np.zeros(w2v_model.vector_size)

    # Compute mean vector (accumulated in float64)
    return np.asarray(w2v_vectors, dtype=np.float64).sum(axis=0) / len(w2v_vectors)

In [23]:
def rank_w2vec(terms, docs, doc_info, index, idf, tf):
    """
    Rank documents by the dot product between the mean Word2Vec vector of the
    query and the mean Word2Vec vector of each document.

    The Word2Vec model is trained on the cleaned text of every document in
    doc_info. Training is done once per doc_info object and the model is kept
    in the function attribute `_model_cache`; passing a different doc_info
    object retrains it (changing doc_info in place does not).

    Arguments:
    terms -- list of query terms
    docs -- collection of document ids to rank; str(id) must be a key of doc_info
    doc_info -- dict of document information; item [0] of each value is the text
    index, idf, tf -- not used by this ranking; accepted so that all ranking
                      functions share the same signature

    Returns: list of [score, doc] pairs, unsorted.
    """
    cached = getattr(rank_w2vec, "_model_cache", None)
    if cached is not None and cached[0] is doc_info:
        w2v_model = cached[1]
    else:
        # clean() is called with its default language for every document
        words = [clean(doc_info[tweet][0]) for tweet in doc_info]
        w2v_model = Word2Vec(words)
        rank_w2vec._model_cache = (doc_info, w2v_model)

    # Create mean vectors
    query_vector = w2v_vector(terms, w2v_model)

    # create score
    w2v_scores = [[np.dot(query_vector, w2v_vector(clean(doc_info[str(doc)][0]),
                                              w2v_model)), doc] for doc in docs]

    return w2v_scores

## Evaluation

In [24]:
def search(query,language, index, method):
    """
    Return the ids of the documents that contain ALL the query terms, ranked.

    The query is cleaned with clean(); the documents of each remaining term
    are looked up in the index and intersected; the matches are then scored
    with the chosen ranking and sorted by decreasing score. The rankings read
    the notebook-level doc_info, idf and tf.

    Arguments:
    query -- raw query text
    language -- language passed to clean()
    index -- inverted index: term -> list of postings whose first item is the doc id
    method -- "2" for the custom ranking, "3" for Word2Vec, anything else for TF-IDF

    Returns: list of document ids, best match first; [] (after printing a
    message) when the cleaned query is empty or no document matches.
    """
    query = clean(query,language)

    # .get() keeps unknown terms from being inserted into a defaultdict index
    docs_per_term = [set(posting[0] for posting in index.get(term, [])) for term in query]

    # set.intersection() needs at least one set: an empty query matches nothing
    docs = list(set.intersection(*docs_per_term)) if docs_per_term else []

    if not docs:
        print("No results found, try again")
        return []

    if method == "2":
        doc_scores = rank_custom(query, docs, doc_info, index, idf, tf)
    elif method == "3":
        doc_scores = rank_w2vec(query, docs, doc_info, index, idf, tf)
    else:
        doc_scores = rank_tfidf(query, docs, doc_info, index, idf, tf)

    doc_scores.sort(reverse=True)

    ranked_docs = [x[1] for x in doc_scores]

    return ranked_docs

In [25]:
def perform_query(query=None, method=None, language="en"):
    """
    Run a query against the notebook-level index and print the top results.

    Arguments:
    query -- query text; asked interactively when None
    method -- "1" TF-IDF (default for any unrecognized value), "2" custom,
              "3" Word2Vec; asked interactively when None
    language -- language of the query: a 2-letter code (looked up as alpha_2),
                a 3-letter code (looked up as alpha_3) or a language name,
                which is used lowercased as given

    Raises: ValueError when a 2- or 3-letter code is not found by pycountry.

    Prints the ranking method, the language and, for each of the first topK
    ranked documents, its id, text and url.
    """
    if query is None:
        print("Write your query (i.e.: Computer Science):\n")
        query = input()

    if method is None:
        print("Choose ranking method: 1-TF-IDF(default) 2-Custom 3-Word2Vec")
        method = input()

    if method == "2":
        print("Using custom ranking...")
    elif method == "3":
        print("Using Word2Vec ranking...")
    else:
        method="1"
        print("Using TF-IDF ranking...")

    # Resolve the language once. Retrying a fixed value in a loop could never
    # succeed after a first failure, so an unknown code is reported instead.
    code_field = {2: "alpha_2", 3: "alpha_3"}.get(len(language))
    if code_field is None:
        language = language.lower()
    else:
        # NOTE(review): pycountry's get() is expected to return None for an
        # unknown code -- confirm for the installed version
        match = pycountry.languages.get(**{code_field: language})
        if match is None:
            raise ValueError("Language not detected: %r (write the ISO 639 code or the name in english)" % language)
        language = match.name.lower()
    print("Language: "+language)

    ranked_docs = search(query,language, index, method)

    print("\n======================\n Results ordered by relevance for the searched query:\n")
    for d_id in ranked_docs[:topK]:
        print(d_id, doc_info[str(d_id)][0],doc_info[str(d_id)][-1])
        print('--------------------------------------------------')

In [133]:
#Testing our custom ranking
perform_query(query="covid19 vaccine", method="2")

Using custom ranking...
Language: english

 Results ordered by relevance for the searched query:

1959 💉💉💉💉
💉💉💉💉
💉💉💉💉
💉💉💉💉
💉💉💉💉                 💉💉💉💉
💉💉💉💉                 💉💉💉💉

COVID-19 vaccines     COVID-19 vaccines
in 10 countries             in the rest of the 🌍

#VaccinEquity is 🗝️ to ending the pandemic, together!

#WorldEmojiDay https://twitter.com/WHO/status/1416433609091653633
--------------------------------------------------
904 #COVID19 variants &amp; vaccines:

✅ COVID-19 vaccines provide strong protection against serious illness &amp; death
✅ Get all necessary doses to develop maximum protection
✅ Continue practicing all the protective behaviours even after vaccination to stop COVID-19 variants https://twitter.com/WHO/status/1434790971632336906
--------------------------------------------------
1045 ▶️ If you have #COVID19, is it safe to breastfeed your baby❓

▶️ Is it safe to get vaccinated against COVID-19 if you are breastfeeding❓

▶️ How can you keep your baby safe whil

## Word2Vec + cosine similarity queries

In [141]:
perform_query(query="vaccine", method="3")

Using Word2Vec ranking...
Language: english

 Results ordered by relevance for the searched query:

1469 Getting vaccinated 💉 can help protect you and those around you from #COVID19 ⬇️ https://t.co/3GrojOAWT6 https://twitter.com/WHO/status/1423591981456838656
--------------------------------------------------
2250 RT @DrTedros: We are calling on #G20 countries to share more #COVID19 vaccine doses now, including by ensuring at least 1 billion doses are… https://twitter.com/WHO/status/1410362947046682625
--------------------------------------------------
1323 RT @DrTedros: Countries with high coverage are seeing a decoupling of #COVID19 cases &amp; deaths, whereas countries that can't access vaccines… https://twitter.com/WHO/status/1426509340744327170
--------------------------------------------------
2386 Here is why you should get vaccinated even if you have had #COVID19 👇 https://twitter.com/WHO/status/1408416636084707335
--------------------------------------------------
1400 @DrTedr

In [142]:
perform_query(query="challenge", method="3")

Using Word2Vec ranking...
Language: english

 Results ordered by relevance for the searched query:

285 RT @DrTedros: In May, @WHO set the world a #VaccinEquity challenge: to vaccinate at least 10% of the population of every country against #C… https://twitter.com/WHO/status/1443834880526331957
--------------------------------------------------
444 @DrTedros @EmmanuelMacron @ACTAccelerator "Countries told us they needed support to build their capacities for emergency preparedness &amp; response, to respond to the dual challenge of communicable &amp; noncommunicable diseases, &amp; to strengthen every facet of their health systems"-@DrTedros #WHOAcademy https://t.co/Osc48s0z2p https://twitter.com/WHO/status/1442494793607639049
--------------------------------------------------
2093 With 15 signatories, including @DrTedros &amp; @KagutaMuseveni, the Kampala Declaration stresses that unequal access to vaccines is an existential challenge for global health &amp; calls for increased interna

In [143]:
perform_query(query="health", method="3")

Using Word2Vec ranking...
Language: english

 Results ordered by relevance for the searched query:

528 RT @DrTedros: It was my pleasure to join Prince Harry and Meghan, The Duke &amp; Duchess of Sussex, &amp; a group of leaders in health, politics &amp;… https://twitter.com/WHO/status/1441373266875064320
--------------------------------------------------
1493 RT @DrTedros: .@WHO &amp; partners are:
1. providing access to life-saving health services for those worst-affected by the crisis
2. supporting… https://twitter.com/WHO/status/1422999094042169344
--------------------------------------------------
1424 RT @DrTedros: The new @IPCC_CH report shows that every fraction of a degree hotter endangers our health and future. Similarly, every action… https://twitter.com/WHO/status/1424990056939466763
--------------------------------------------------
909 RT @DrTedros: At the @g20org Health Minister meeting, I called for commitment &amp; support of #G20 countries to reach @WHO's global #COVI

In [144]:
perform_query(query="recovery", method="3")

Using Word2Vec ranking...
Language: english

 Results ordered by relevance for the searched query:

14 LIVE with @DrTedros: Ending the #COVID19 pandemic: road to an inclusive recovery. #VaccinEquity https://t.co/q9x4v3ixBl https://twitter.com/WHO/status/1447972516161490944
--------------------------------------------------
1843 The #VaccinEquity Dashboard from WHO, @UNDP, and @BlavatnikSchool shows that low- and lower-middle-income countries must have access to vaccines now to:
👩‍⚕️Save lives
💹Promote economic recovery
👨‍👩‍👧‍👦Advance human development

https://t.co/48frkTNNmC https://t.co/zAnwTQfX3f https://twitter.com/WHO/status/1418240365086355456
--------------------------------------------------
2115 @DrTedros "Variants are currently winning the race against vaccines because of inequitable vaccine production &amp; distribution, which also threatens the global economic recovery. It didn’t have to be this way &amp; it doesn’t have to be this way going forward"-@DrTedros #COVID19 #Vac

In [145]:
perform_query(query="climate", method="3")

Using Word2Vec ranking...
Language: english

 Results ordered by relevance for the searched query:

2331 "2⃣WHO has helped SIDS to mobilize funds for #ClimateChange adaptation, resilient health systems, food security, &amp; emergency preparedness &amp; response.
[...]
WHO offers our support to SIDS in better utilizing financing platforms, such as the Green Climate Fund"-@DrTedros https://twitter.com/WHO/status/1409429916785983489
--------------------------------------------------
597 "I urge all countries to put these [Air Quality] guidelines to use, to save lives, support healthy communities, and help address the climate crisis. These guidelines come at an important time, ahead of the COP26 #ClimateChange Conference in November"-@DrTedros #AirPollution https://twitter.com/WHO/status/1440669759188529154
--------------------------------------------------
28 WHO's 10 calls for #ClimateAction❗️

1⃣ Commit to a healthy recovery
2⃣ Place health at the ❤️ of the climate talks
3⃣ Harness the 