In [177]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/alfa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [178]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
import time 
import json
import re
from numpy import linalg as la

In [179]:
#This function will clean our text from data that is not important so that has no weight 
def clean_text(tweet):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    
    tweet = tweet.lower() # Transform in lowercase

    tweet = re.sub(r'@[a-zA-Z]+', '', tweet) # Here we remove the mentions in the tweet ex: @canodep
    tweet = re.sub(r"\B#([a-z0-9]{2,})(?![~!@#$%^&*()=+_`\-\|\/'\[\]\{\}]|[?.,]*\w)", '', tweet) # Here we remove the hashtags, because we will treat it later
    tweet = re.sub(r'[^\w\s]', '', tweet) # Here we remove punctuation marks
    tweet = re.sub(r'http\S+', '',tweet) # Remove http and https
    tweet = tweet.split() # Tokenize the text to get a list of terms

    tweet = [word for word in tweet if word not in stop_words]  # eliminate the stopwords
    tweet = [stemmer.stem(word) for word in tweet] # Perform stemming 
    return tweet
    


In [180]:
docs_path = 'data/tw_hurricane_data.json'
tweets_title = 'data/tweet_document_ids_map.csv'

tweets_id_title = {}

with open(tweets_title) as fp:
    lines = fp.readlines()


for l in lines:
    l = l.strip().split("\t")
    tweets_id_title[int(l[1])] =  l[0]


tweets = []
lines = []

for line in open(docs_path, 'r'):
    lines.append(line)
    #media = json.loads(line).get('entities').get('media')
    tweets.append({
        'id' : int(json.loads(line).get('id')),
        'title' : tweets_id_title[int(json.loads(line).get('id'))],
        'text': clean_text(json.loads(line).get('full_text')),
        'username' : json.loads(line).get('user').get('screen_name'),
        'date' : json.loads(line).get('created_at'),
        'hashtag' : list(map(lambda hashtag:  hashtag.get('text'),  json.loads(line).get('entities').get('hashtags'))),
        'like' : json.loads(line).get('favorite_count'),
        'rt' : json.loads(line).get('retweet_count'),
        'URL' : 'https://twitter.com/' + json.loads(line).get('user').get('screen_name') + "/status/" + str(json.loads(line).get('id'))
    }) 


In [181]:
def create_index(tweets):
    """
    Implement the inverted index
    
    Argument:
    lines -- collection of Wikipedia articles
    
    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of documents where these keys appears in (and the positions) as values.
    """
    index = defaultdict(list)    

    for i in range(len(tweets)):
        terms = tweets[i].get("text")
        tweet_id = tweets[i].get('id')

        tweet_title = tweets_id_title[tweet_id]

        current_tweet_index = {}

        for position, term in enumerate(terms):
            try:
                # if the term is already in the index for the current page (current_page_index)
                # append the position to the corresponding list

                ## START CODE
                current_tweet_index[term][1].append(position)  
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_tweet_index[term] = [tweet_title, array('I', [position])] #'I' indicates unsigned int (int in Python)

        #merge the current page index with the main index
        for term, posting_page in current_tweet_index.items():
            index[term].append(posting_page)
        ## END CODE                    

    return index

In [182]:
index = create_index(tweets)

print("Index results for the term 'researcher': {}\n".format(index['researcher']))
print("First 10 Index results for the term 'research': \n{}".format(index['research'][:10]))

Index results for the term 'researcher': []

First 10 Index results for the term 'research': 
[['doc_1', array('I', [0, 1])], ['doc_220', array('I', [8])], ['doc_405', array('I', [8])], ['doc_1354', array('I', [1])], ['doc_1612', array('I', [2])], ['doc_2026', array('I', [6])], ['doc_2600', array('I', [9])], ['doc_2748', array('I', [11])], ['doc_3132', array('I', [2])], ['doc_3307', array('I', [5])]]


In [183]:
def search(query, index):
    """
    The output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    """
    query = clean_text(query)
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"
            term_docs = [posting[0] for posting in index[term]]
            # docs = docs Union term_docs
            docs |= set(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    return docs

In [184]:
#print("Insert your query (i.e.: Computer Science):\n")
#query = input()
#docs = search(query, index)
#top = 10

#print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

#for d_id in docs[:top]:
#    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))



query = "My house floods"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))



query = "I am scared to death, a hurricane is coming to my city"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))

query = "Landfall in South Carolina"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))

query = "Help and recovery during the hurricane disaster"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))


query = "Floodings in South Carolina"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))
    


Sample of 10 results out of 342 for the searched query:

page_id= 1575913635393961985 - page_title: doc_370
page_id= 1575905031869677584 - page_title: doc_1077
page_id= 1575872006687707136 - page_title: doc_2770
page_id= 1575874215445958657 - page_title: doc_2622
page_id= 1575908696118669312 - page_title: doc_801
page_id= 1575902489052090369 - page_title: doc_1234
page_id= 1575870035016314882 - page_title: doc_2943
page_id= 1575903222681989121 - page_title: doc_1178
page_id= 1575868604230287361 - page_title: doc_3067
page_id= 1575863772786442240 - page_title: doc_3455

Sample of 10 results out of 819 for the searched query:

page_id= 1575897998768439297 - page_title: doc_1452
page_id= 1575856416132198400 - page_title: doc_3980
page_id= 1575914467149303809 - page_title: doc_292
page_id= 1575863134723923969 - page_title: doc_3509
page_id= 1575863226805493760 - page_title: doc_3496
page_id= 1575916485331107867 - page_title: doc_119
page_id= 1575909928862203926 - page_title: doc_675
page_

In [185]:
#print("Insert your query (i.e.: Computer Science):\n")
#query = input()
#docs = search(query, index)
#top = 10

#print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

#for d_id in docs[:top]:
#    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))



query = "My house floods"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))



query = "I am scared to death, a hurricane is coming to my city"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))

query = "Landfall in South Carolina"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))

query = "Help and recovery during the hurricane disaster"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))


query = "Floodings in South Carolina"
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))

for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))
    


Sample of 10 results out of 342 for the searched query:

page_id= 1575913635393961985 - page_title: doc_370
page_id= 1575905031869677584 - page_title: doc_1077
page_id= 1575872006687707136 - page_title: doc_2770
page_id= 1575874215445958657 - page_title: doc_2622
page_id= 1575908696118669312 - page_title: doc_801
page_id= 1575902489052090369 - page_title: doc_1234
page_id= 1575870035016314882 - page_title: doc_2943
page_id= 1575903222681989121 - page_title: doc_1178
page_id= 1575868604230287361 - page_title: doc_3067
page_id= 1575863772786442240 - page_title: doc_3455

Sample of 10 results out of 819 for the searched query:

page_id= 1575897998768439297 - page_title: doc_1452
page_id= 1575856416132198400 - page_title: doc_3980
page_id= 1575914467149303809 - page_title: doc_292
page_id= 1575863134723923969 - page_title: doc_3509
page_id= 1575863226805493760 - page_title: doc_3496
page_id= 1575916485331107867 - page_title: doc_119
page_id= 1575909928862203926 - page_title: doc_675
page_

In [186]:
def create_index_tfidf(tweets, num_docs):
    index = defaultdict(list)
    tf = defaultdict(list)  # term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  # document frequencies of terms in the corpus
    title_index = defaultdict(str)
    idf = defaultdict(float)

    for tweet in tweets:
        tweet_id = tweet['id']
        terms = tweet['text']
        title = tweet['title']
        title_index[tweet_id] = title

        current_page_index = {}

        for position, term in enumerate(terms):
            try:
                current_page_index[term][1].append(position)
            except:
                current_page_index[term] = [tweet_id, array('I', [position])]
        norm = 0
        for term, posting in current_page_index.items():
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        for term, posting in current_page_index.items():
            tf[term].append((tweet_id, np.round(len(posting[1])) / norm, 4))
            df[term] = 1

        for term, posting in current_page_index.items():
            index[term].append(posting)

        for term in df:
            idf[term] = np.round(np.log(float(num_docs / df[term])), 4)

    return index, tf, df, idf, title_index


In [187]:
start_time = time.time()
num_documents = len(tweets)
index, tf, df, idf, title_index = create_index_tfidf(tweets, num_documents)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/alfa/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3378, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_34270/4072662602.py", line 3, in <module>
    index, tf, df, idf, title_index = create_index_tfidf(tweets, num_documents)
  File "/tmp/ipykernel_34270/481609075.py", line 34, in create_index_tfidf
    idf[term] = np.round(np.log(float(num_docs / df[term])), 4)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/alfa/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 1997, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/home/alfa/.local/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1112, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/home/alfa/.local/lib/python3.8/site-packages/IPython/co

In [191]:
def rank_documents(terms, docs, index, idf, tf, title_index):
    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    query_terms_count = collections.Counter(terms)

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):
        if term not in index:
            continue

        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        for doc_index, (doc, postings) in enumerate(index[term]):
            if doc in docs:
                doc_vectors[doc][termIndex] = idf[term]

    doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]

    doc_scores.sort(reverse=True)
    result_docs = [doc for score, doc in doc_scores]

    if len(result_docs) == 0:
        print("No documents found for the given query!")
        query = input()
        result_docs = search_tf_idf(query, index)

    return result_docs


In [192]:
def search_tf_idf(query, index):
    terms = clean_text(query)
    docs = set()
    for term in terms:
        try:
            term_docs= [posting[0] for posting in index[term]]
            docs |= set(term_docs)
        except:
            pass
    docs = list(docs)
    return rank_documents(terms, docs, index, idf, tf, title_index)

In [197]:
print("Insert your query (i.e.: Computer Science):\n")
query = input()
ranked_docs = search_tf_idf(query, index)
top = 10

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
for d_id in ranked_docs[:top]:
    print("tweet_id= {} - page_title: {}".format(list(tweets_id_title.keys())[list(tweets_id_title.values()).index(d_id)], d_id))

Insert your query (i.e.: Computer Science):

No documents found for the given query!
No documents found for the given query!

Top 10 results out of 261 for the searched query:

tweet_id= 1575906699046981632 - page_title: doc_979
tweet_id= 1575906817322123264 - page_title: doc_973
tweet_id= 1575907263688036352 - page_title: doc_934
tweet_id= 1575908015475019776 - page_title: doc_879
tweet_id= 1575908033040830464 - page_title: doc_877
tweet_id= 1575908317553213442 - page_title: doc_857
tweet_id= 1575908441499090944 - page_title: doc_829
tweet_id= 1575908465662656512 - page_title: doc_824
tweet_id= 1575908467122094084 - page_title: doc_823
tweet_id= 1575908546927292417 - page_title: doc_813
