Ariadna Gonzalez, Júlia Dalmau i Mireia Cuenca

# INDEXING

In [1]:
#Imports

from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import time
import string
import nltk
import json
import pandas as pd 
import random

In [2]:
#Load data into memory

docs_path = 'C:/Users/judal/Downloads/tw_hurricane_data.json' #path of tw_hurricane_data.json
with open(docs_path) as fp:
    lines = fp.readlines()
    
#Print lines[0] to see the structure of the data 

print(lines[12])

{"created_at": "Fri Sep 30 18:38:31 +0000 2022", "id": 1575918028717707265, "id_str": "1575918028717707265", "full_text": "Together we have raised over $20,000 in less than 24 hours for #HurricaneIan relief. \n\nPlease stop by the @BigManBigHeart_ tailgate tomorrow to donate in person. My family &amp; team will be at Tent #2 on the Legacy Walk near Gate K! \n\n#HurricaneRelief #NoleFam https://t.co/ikmWOP0bR0", "truncated": false, "display_text_range": [0, 261], "entities": {"hashtags": [{"text": "HurricaneIan", "indices": [63, 76]}, {"text": "HurricaneRelief", "indices": [236, 252]}, {"text": "NoleFam", "indices": [253, 261]}], "symbols": [], "user_mentions": [{"screen_name": "BigManBigHeart_", "name": "Big Man Big Heart", "id": 1428430898609938433, "id_str": "1428430898609938433", "indices": [106, 122]}], "urls": [{"url": "https://t.co/ikmWOP0bR0", "expanded_url": "https://twitter.com/gibbonsdillan/status/1575538547750162432", "display_url": "twitter.com/gibbonsdillan/\u2026", "indic

In [3]:
#Transform data into a dict to make easier the code

datos_diccionario = [json.loads(line) for line in lines]

#Now if we want to access to the screen_name of a user we colud do it like this

datos_diccionario[12]['id']

1575918028717707265

### Funtions

In [4]:
def build_terms(line):

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))

    #Transform in lowercase
    line=  line.lower()
    
    #Removing punctuation marks
    line= line.translate(str.maketrans('', '', string.punctuation)) 
    
    #Tokenize the text to get a list of terms
    line=  line.split(" ")
    
    #Removing the stopwords
    line=[x for x in line if x not in stop_words]
    
    #Perform stemming
    line=[stemmer.stem(x) for x in line]

    return line

In [5]:
def create_index(lines, num_documents):
    
    index = defaultdict(list)
    title_index = defaultdict(float)
    idf = defaultdict(float)
    tf = defaultdict(list)  #term frequencies of terms in documents
    df = defaultdict(int)  #document frequencies of terms in the corpus
    
    for line in lines:
    
        terms = build_terms(line['full_text'])
        
        page_id = line['id']
        title = line['id_str']

        
        # The final output must return Tweet, Username, Date, Hashtags, Likes, Retweets and Url
        args = [line["id_str"], line['user']['screen_name'], line['created_at'], line['entities']['hashtags'], line['user']['favourites_count'], line['retweet_count'], line['entities']['urls'],line['full_text']]
        
        title_index[title]=args
        

        positions_term_in_doc = {}
        for position, term in enumerate(terms):
            try:
                # if the term is already in the dict append the position to the corresponding list
                positions_term_in_doc[term].append(position) 
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                positions_term_in_doc[term] = [position]
                
        norm = 0
        for term, posting in positions_term_in_doc.items():
            norm += len(positions_term_in_doc[term]) ** 2
        norm = math.sqrt(norm)
        
        for term, posting in positions_term_in_doc.items():
            tf[term].append(np.round(len(positions_term_in_doc[term]) / norm, 4))
            df[term] += 1
        
        for term in terms:
            index[term].append(title)
            
        for term in df:
            idf[term] = np.round(np.log(float(num_documents / df[term])), 4)
            
                    
    return index, tf, df, idf, title_index

###### TF-IDF + cosine similarity

In [6]:
def rank_documents_tf_idf(terms, docs, index, idf, tf, title_index):
    
    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms) 

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):
        if term not in index:
            continue

        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]
        # Generate doc_vectors for matching docs
        for doc_index, doc in enumerate(index[term]): 
            if doc in docs and len(tf[term])>doc_index:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term] 

    # Calculate the score of each doc 
    
    doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    result_scores= [x[0] for x in doc_scores]
    
    if len(result_docs) == 0:
        print("No results found, try again")
        #query = input()
        #docs = search_tf_idf(query, index)
    return result_scores,result_docs

In [7]:
def search_tf_idf(query, index):
   
    query = build_terms(query)
    docs = []
    for term in query:
        try:
            #Term is in the index
            keys = [i for i in index.keys()]
            term_docs = [index[t] for t in keys if t==term]
            docs=term_docs[0]
            
        except:
            #Term is not in index
            pass
    docs = list(docs)
    scores_docs,ranked_docs = rank_documents_tf_idf(query, docs, index, idf, tf, title_index)
    return scores_docs,ranked_docs

###### BM25

In [8]:
def rank_documents_bm25(terms, docs, index, df, tf, title_index):
    
    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)
    N=len(docs)
    k1=random.uniform(1.2,2)
    b=0.75
    RSV=dict()
    lenghts=[]
    longitud=len(terms)
    tf_=np.zeros((longitud,N))
    
    for ids in docs:
        tweet=title_index[ids][7]
        terms_=build_terms(tweet)
        lenght=len(terms_)
        lenghts.append(lenght)
               
    for i in range(len(terms)):
        for position,tweet in enumerate(docs):
            count=0
            terms_=build_terms(title_index[tweet][7])
            for j in range(len(terms_)):
                if terms[i]==terms_[j]:
                    count+=1
            tf_[i][position]=count
                
    Lave=np.mean(lenghts)
    for position,tweet in enumerate(docs):
        sumatorio=0
        for i in range(len(terms)):
            x=math.log((N/df[terms[i]]),2)
            xx=tf_[i][position]
            y=(k1+1)*xx
            yy=lenghts[position]
            z=k1*((1-b)+b*(yy/Lave)+xx)
            sumatorio+=abs(x*(y/z))
        RSV[tweet]=sumatorio


    # Calculate the score of each doc 
    RSV_sort=dict(sorted(RSV.items(),key=lambda item:item[1],reverse=True))
    result_docs = [x for x,y in RSV_sort.items()]
    result_scores=[y for x,y in RSV_sort.items()]
    
    if len(result_docs) == 0:
        print("No results found, try again")
        
    return result_scores,result_docs

In [9]:
def search_bm25(query, index):
   
    query = build_terms(query)
    docs = []
    for term in query:
        try:
            #Term is in the index
            keys = [i for i in index.keys()]
            term_docs = [index[t] for t in keys if t==term]
            docs=term_docs[0]
            
        except:
            #Term is not in index
            pass
    docs = list(docs)
    scores_docs,ranked_docs = rank_documents_bm25(query, docs, index, idf, tf, title_index)
    return scores_docs,ranked_docs

###### Our score + cosine similarity:

In [10]:
def rank_documents_our_score(terms, docs, index, idf, tf, title_index): 
    
    sumatori=0
    sumatori2=0
    for doc in docs: 
        sumatori+=title_index[doc][4]
        sumatori2+=title_index[doc][5]
    if sumatori2==0: 
        sumatori2=1
    if sumatori==0:
        sumatori=1

    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)
    
    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms) 
    query_norm = la.norm(list(query_terms_count.values()))
    

    for termIndex, term in enumerate(terms):
        if term not in index:
            continue
        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]
        # Generate doc_vectors for matching docs
        
        for doc_index, doc in enumerate(index[term]): 
            if doc in docs and len(tf[term])>doc_index:
                doc_vectors[doc][termIndex] = (title_index[doc][4]/sumatori)* (title_index[doc][5]/sumatori2)

    # Calculate the score of each doc
    doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    result_scores= [x[0] for x in doc_scores]
    
    if len(result_docs) == 0:
        print("No results found, try again")
        #query = input()
        #docs = search_tf_idf(query, index)
    return result_scores,result_docs

In [11]:
def search_our_score(query, index):
    query = build_terms(query)
    docs = []
    for term in query:
        try:
            #Term is in the index
            keys = [i for i in index.keys()]
            term_docs = [index[t] for t in keys if t==term]
            docs=term_docs[0]
        except:
            #Term is not in index
            pass
    docs = list(docs)
    scores_docs,ranked_docs = rank_documents_our_score(query, docs, index, idf, tf, title_index)
    return scores_docs,ranked_docs

In [12]:
index, tf, df, idf, title_index = create_index(datos_diccionario, len(lines))

#### QUERIES

In [14]:
#QUERIES

query = "Computer Science"
scores,docs = search_bm25(query, index)
top = 10


print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))
for d_id in docs[:top]:
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Sample of 10 results out of 5 for the searched query:

Tweet_id:  1575878436207443969  Username:  WorkingatDuke  Date:  Fri Sep 30 16:01:12 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [27, 40]}]  Likes:  3918  Url:  0
Tweet_id:  1575914809383583744  Username:  wluera  Date:  Fri Sep 30 18:25:44 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [105, 118]}]  Likes:  7766  Url:  0
Tweet_id:  1575900541221146625  Username:  twinmetalhen54  Date:  Fri Sep 30 17:29:02 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [113, 126]}, {'text': 'GSM', 'indices': [128, 132]}, {'text': 'Science', 'indices': [134, 142]}, {'text': 'Truth', 'indices': [144, 150]}]  Likes:  19694  Url:  0
Tweet_id:  1575859051233038341  Username:  LexRich5Schools  Date:  Fri Sep 30 14:44:10 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [52, 65]}, {'text': 'D5Reads365', 'indices': [227, 238]}, {'text': 'OurD5Story', 'indices': [239, 250]}]  Likes:  10893  Url:  0
Tweet_

In [15]:
query = "instagram"
scores,docs = search_bm25(query, index)
top = 10


print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))
for d_id in docs[:top]:
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Sample of 10 results out of 3 for the searched query:

Tweet_id:  1575887728964534273  Username:  pettigrewmed  Date:  Fri Sep 30 16:38:07 +0000 2022  Hashtags:  [{'text': 'hurricaneian', 'indices': [22, 35]}]  Likes:  31  Url:  0
Tweet_id:  1575859537738838016  Username:  craigtimes  Date:  Fri Sep 30 14:46:06 +0000 2022  Hashtags:  [{'text': 'Florida', 'indices': [0, 8]}, {'text': 'HurricaneIan', 'indices': [112, 125]}]  Likes:  230080  Url:  11
Tweet_id:  1575864352837701635  Username:  savcandy  Date:  Fri Sep 30 15:05:14 +0000 2022  Hashtags:  [{'text': 'savannahcandykitchen', 'indices': [238, 259]}, {'text': 'hurricaneian', 'indices': [260, 273]}]  Likes:  2288  Url:  1


In [16]:
query = "vaccine"
scores,docs = search_bm25(query, index)
top = 10


print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))
for d_id in docs[:top]:
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Sample of 10 results out of 2 for the searched query:

Tweet_id:  1575905732649689089  Username:  spinning_will  Date:  Fri Sep 30 17:49:40 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [31, 44]}, {'text': 'FJB', 'indices': [61, 65]}]  Likes:  107663  Url:  0
Tweet_id:  1575901730730283014  Username:  JTTmemes  Date:  Fri Sep 30 17:33:46 +0000 2022  Hashtags:  [{'text': 'TuaTagovailoa', 'indices': [182, 196]}, {'text': 'HurricaneIan', 'indices': [197, 210]}, {'text': 'coronavirus', 'indices': [211, 223]}, {'text': 'vaccine', 'indices': [224, 232]}]  Likes:  3664  Url:  0


In [17]:
query = "covid"
scores,docs = search_bm25(query, index)
top = 10


print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))
for d_id in docs[:top]:
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Sample of 10 results out of 4 for the searched query:

Tweet_id:  1575886977764728838  Username:  RamblingMyMind  Date:  Fri Sep 30 16:35:08 +0000 2022  Hashtags:  [{'text': 'COVID', 'indices': [92, 98]}, {'text': 'hurricaneian', 'indices': [131, 144]}, {'text': 'Hurricane', 'indices': [145, 155]}, {'text': 'JustBecauseYouCantSeeIt', 'indices': [156, 180]}]  Likes:  38047  Url:  1
Tweet_id:  1575871626540818432  Username:  AMDPU22  Date:  Fri Sep 30 15:34:08 +0000 2022  Hashtags:  [{'text': 'MentalHealth', 'indices': [0, 13]}, {'text': 'COVID', 'indices': [55, 61]}, {'text': 'HurricaneIan', 'indices': [66, 79]}, {'text': 'DePaulSMN', 'indices': [151, 161]}]  Likes:  27  Url:  0
Tweet_id:  1575865182944894978  Username:  FraserFaithful  Date:  Fri Sep 30 15:08:32 +0000 2022  Hashtags:  [{'text': 'JunkScience', 'indices': [0, 12]}, {'text': 'GlobalWarming', 'indices': [38, 52]}, {'text': 'HurricaneIan', 'indices': [71, 84]}, {'text': 'Covid', 'indices': [132, 138]}, {'text': 'FakeNews',

In [18]:
query = "Landfall in South Carolina"
scores, docs = search_bm25(query, index)
top = 10


print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))
for d_id in docs[:top]:
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Sample of 10 results out of 263 for the searched query:

Tweet_id:  1575914959091163136  Username:  sfdb  Date:  Fri Sep 30 18:26:19 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [0, 13]}]  Likes:  18768  Url:  0
Tweet_id:  1575875671385071618  Username:  JoshFitzWx  Date:  Fri Sep 30 15:50:13 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [3, 16]}]  Likes:  26  Url:  1
Tweet_id:  1575913904143626240  Username:  NicholeDWBZ  Date:  Fri Sep 30 18:22:08 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [5, 18]}]  Likes:  6617  Url:  0
Tweet_id:  1575914903806042112  Username:  WIONews  Date:  Fri Sep 30 18:26:06 +0000 2022  Hashtags:  [{'text': 'BREAKING', 'indices': [0, 9]}, {'text': 'HurricaneIan', 'indices': [12, 25]}]  Likes:  7271  Url:  1
Tweet_id:  1575910357293764608  Username:  AlecSilvaWX  Date:  Fri Sep 30 18:08:02 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [0, 13]}]  Likes:  6259  Url:  0
Tweet_id:  1575873971827798016  U

In [19]:
#model our_search_score
query = "Computer Science"
scores,docs = search_our_score(query, index)
top = 10


print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))
for d_id in docs[:top]:
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Sample of 10 results out of 5 for the searched query:

Tweet_id:  1575914809383583744  Username:  wluera  Date:  Fri Sep 30 18:25:44 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [105, 118]}]  Likes:  7766  Url:  0
Tweet_id:  1575900541221146625  Username:  twinmetalhen54  Date:  Fri Sep 30 17:29:02 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [113, 126]}, {'text': 'GSM', 'indices': [128, 132]}, {'text': 'Science', 'indices': [134, 142]}, {'text': 'Truth', 'indices': [144, 150]}]  Likes:  19694  Url:  0
Tweet_id:  1575889117942104065  Username:  GVWire  Date:  Fri Sep 30 16:43:38 +0000 2022  Hashtags:  [{'text': 'GVWire', 'indices': [70, 77]}, {'text': 'News', 'indices': [78, 83]}, {'text': 'Politics', 'indices': [84, 93]}, {'text': 'Weather', 'indices': [94, 102]}, {'text': 'Climate', 'indices': [103, 111]}, {'text': 'Environment', 'indices': [112, 124]}, {'text': 'Science', 'indices': [125, 133]}, {'text': 'Floods', 'indices': [134, 141]}, {'text': 'rain

In [20]:
query = "Landfall in South Carolina"
scores, docs = search_our_score(query, index)
top = 10


print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))
for d_id in docs[:top]:
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Sample of 10 results out of 255 for the searched query:

Tweet_id:  1575873317483036681  Username:  capitalweather  Date:  Fri Sep 30 15:40:51 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [14, 27]}]  Likes:  44216  Url:  42
Tweet_id:  1575909527924490241  Username:  WeatherNation  Date:  Fri Sep 30 18:04:45 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [33, 46]}, {'text': 'SCwx', 'indices': [243, 248]}]  Likes:  82674  Url:  21
Tweet_id:  1575861142211235842  Username:  HananyaNaftali  Date:  Fri Sep 30 14:52:29 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [187, 200]}]  Likes:  35667  Url:  46
Tweet_id:  1575860260614832128  Username:  capitalweather  Date:  Fri Sep 30 14:48:58 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [63, 76]}]  Likes:  44216  Url:  17
Tweet_id:  1575865415225729024  Username:  B_Carp01  Date:  Fri Sep 30 15:09:27 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [13, 26]}, {'text': 'SCwx', 'ind

In [21]:
query = "covid"
scores,docs = search_our_score(query, index)
top = 10


print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))
for d_id in docs[:top]:
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Sample of 10 results out of 4 for the searched query:

Tweet_id:  1575886977764728838  Username:  RamblingMyMind  Date:  Fri Sep 30 16:35:08 +0000 2022  Hashtags:  [{'text': 'COVID', 'indices': [92, 98]}, {'text': 'hurricaneian', 'indices': [131, 144]}, {'text': 'Hurricane', 'indices': [145, 155]}, {'text': 'JustBecauseYouCantSeeIt', 'indices': [156, 180]}]  Likes:  38047  Url:  1
Tweet_id:  1575860573795127296  Username:  holdsworth353  Date:  Fri Sep 30 14:50:13 +0000 2022  Hashtags:  [{'text': 'Florida', 'indices': [37, 45]}, {'text': 'Covid', 'indices': [75, 81]}, {'text': 'HurricaneIan', 'indices': [232, 245]}, {'text': 'Cleanup', 'indices': [246, 254]}, {'text': 'MyStory', 'indices': [255, 263]}]  Likes:  4535  Url:  1
Tweet_id:  1575865182944894978  Username:  FraserFaithful  Date:  Fri Sep 30 15:08:32 +0000 2022  Hashtags:  [{'text': 'JunkScience', 'indices': [0, 12]}, {'text': 'GlobalWarming', 'indices': [38, 52]}, {'text': 'HurricaneIan', 'indices': [71, 84]}, {'text': 'Covi

In [22]:
query = "instagram"
scores,docs = search_our_score(query, index)
top = 10


print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))
for d_id in docs[:top]:
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Sample of 10 results out of 3 for the searched query:

Tweet_id:  1575859537738838016  Username:  craigtimes  Date:  Fri Sep 30 14:46:06 +0000 2022  Hashtags:  [{'text': 'Florida', 'indices': [0, 8]}, {'text': 'HurricaneIan', 'indices': [112, 125]}]  Likes:  230080  Url:  11
Tweet_id:  1575864352837701635  Username:  savcandy  Date:  Fri Sep 30 15:05:14 +0000 2022  Hashtags:  [{'text': 'savannahcandykitchen', 'indices': [238, 259]}, {'text': 'hurricaneian', 'indices': [260, 273]}]  Likes:  2288  Url:  1
Tweet_id:  1575887728964534273  Username:  pettigrewmed  Date:  Fri Sep 30 16:38:07 +0000 2022  Hashtags:  [{'text': 'hurricaneian', 'indices': [22, 35]}]  Likes:  31  Url:  0


In [23]:
query = "vaccine"
scores,docs = search_our_score(query, index)
top = 10


print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))
for d_id in docs[:top]:
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Sample of 10 results out of 2 for the searched query:

Tweet_id:  1575905732649689089  Username:  spinning_will  Date:  Fri Sep 30 17:49:40 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [31, 44]}, {'text': 'FJB', 'indices': [61, 65]}]  Likes:  107663  Url:  0
Tweet_id:  1575901730730283014  Username:  JTTmemes  Date:  Fri Sep 30 17:33:46 +0000 2022  Hashtags:  [{'text': 'TuaTagovailoa', 'indices': [182, 196]}, {'text': 'HurricaneIan', 'indices': [197, 210]}, {'text': 'coronavirus', 'indices': [211, 223]}, {'text': 'vaccine', 'indices': [224, 232]}]  Likes:  3664  Url:  0


# Word2vec


In [24]:
def top20_2vec(query, tweets, index, title_index, model):
    terms={}
    for ids in tweets:
        term_tweet=build_terms(title_index[ids][7])
        
        wordspresent = [word for word in term_tweet if word in model.wv.index_to_key]
        vector = np.mean(model.wv[wordspresent], axis=0)
        terms[ids] = vector
      
    
    
    wordspresent = [word for word in query if word in model.wv.index_to_key]
    queryVector =  np.mean(model.wv[wordspresent], axis=0)
    
    
    #calculate cosine similarity  
    tweetScores = [ [np.dot(curTweetVec, queryVector), tweet_id] for tweet_id, curTweetVec in terms.items() ]
    tweetScores.sort(reverse=True)
    resultTweets = [x[1] for x in tweetScores][:20]
    resultScores = [x[0] for x in tweetScores][:20]
    
    return  resultScores, resultTweets

In [25]:
def search(query,index,tweets,tf,idf,model,title_index):
    
    query= build_terms(query)
    docs = []
    for term in query:
        try:
            #Term is in the index
            keys = [i for i in index.keys()]
            term_docs = [index[t] for t in keys if t==term]
            docs=term_docs[0]
            
        except:
            #Term is not in index
            pass
        
        
        
    tweets = list(docs)
    scores_docs,ranked_docs = top20_2vec(query, tweets, index, title_index,model)
    return scores_docs,ranked_docs
    

In [26]:
from gensim.models.word2vec import Word2Vec

clean_tweets=[]
for tweets in datos_diccionario:
    terms=build_terms(tweets['full_text'])
    clean_tweets.append(terms)
    
model = Word2Vec(clean_tweets)


In [27]:
query = "Computer Science"

scores,word2vectop20 = search(query, index, datos_diccionario,tf,idf,model,title_index)

print("\n======================\nTop 20 results for the searched query, using Word2Vec :\n")
for d_id in word2vectop20[:top]:
    print(d_id)
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Top 20 results for the searched query, using Word2Vec :

1575889117942104065
Tweet_id:  1575889117942104065  Username:  GVWire  Date:  Fri Sep 30 16:43:38 +0000 2022  Hashtags:  [{'text': 'GVWire', 'indices': [70, 77]}, {'text': 'News', 'indices': [78, 83]}, {'text': 'Politics', 'indices': [84, 93]}, {'text': 'Weather', 'indices': [94, 102]}, {'text': 'Climate', 'indices': [103, 111]}, {'text': 'Environment', 'indices': [112, 124]}, {'text': 'Science', 'indices': [125, 133]}, {'text': 'Floods', 'indices': [134, 141]}, {'text': 'rain', 'indices': [142, 147]}, {'text': 'Hurricane', 'indices': [148, 158]}, {'text': 'HurricaneIan', 'indices': [159, 172]}, {'text': 'Carolina', 'indices': [173, 182]}, {'text': 'Florida', 'indices': [183, 191]}]  Likes:  898  Url:  0
1575900541221146625
Tweet_id:  1575900541221146625  Username:  twinmetalhen54  Date:  Fri Sep 30 17:29:02 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [113, 126]}, {'text': 'GSM', 'indices': [128, 132]}, {'text': 

In [28]:
query = "Landfall in South Carolina"
scores,word2vectop20 = search(query, index, datos_diccionario,tf,idf,model,title_index)

print("\n======================\nTop 20 results for the searched query, using Word2Vec :\n")
for d_id in word2vectop20[:top]:
    print(d_id)
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Top 20 results for the searched query, using Word2Vec :

1575905732649689089
Tweet_id:  1575905732649689089  Username:  spinning_will  Date:  Fri Sep 30 17:49:40 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [31, 44]}, {'text': 'FJB', 'indices': [61, 65]}]  Likes:  107663  Url:  0
1575905027058987008
Tweet_id:  1575905027058987008  Username:  KellyCooleyCPS  Date:  Fri Sep 30 17:46:52 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [42, 55]}]  Likes:  1651  Url:  0
1575880595313332224
Tweet_id:  1575880595313332224  Username:  AlecSilvaWX  Date:  Fri Sep 30 16:09:47 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [37, 50]}]  Likes:  6259  Url:  0
1575915969913839616
Tweet_id:  1575915969913839616  Username:  TheAstuteGaloot  Date:  Fri Sep 30 18:30:20 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [15, 28]}]  Likes:  178656  Url:  0
1575913904143626240
Tweet_id:  1575913904143626240  Username:  NicholeDWBZ  Date:  Fri Sep 30 18:22:08 

In [29]:
query = "university"
scores,word2vectop20 = search(query, index, datos_diccionario,tf,idf,model,title_index)

print("\n======================\nTop 20 results for the searched query, using Word2Vec :\n")
for d_id in word2vectop20[:top]:
    print(d_id)
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Top 20 results for the searched query, using Word2Vec :

1575894038015778816
Tweet_id:  1575894038015778816  Username:  DisFanCom  Date:  Fri Sep 30 17:03:12 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [78, 91]}, {'text': 'florida', 'indices': [92, 100]}, {'text': 'disney', 'indices': [101, 108]}, {'text': 'universalorlando', 'indices': [109, 126]}]  Likes:  8  Url:  0
1575886902653485061
Tweet_id:  1575886902653485061  Username:  HeidiCraun  Date:  Fri Sep 30 16:34:50 +0000 2022  Hashtags:  [{'text': 'elevatecx', 'indices': [39, 49]}, {'text': 'CX', 'indices': [86, 89]}, {'text': 'HurricaneIan', 'indices': [141, 154]}]  Likes:  6306  Url:  0
1575894054696517636
Tweet_id:  1575894054696517636  Username:  krollbondrating  Date:  Fri Sep 30 17:03:15 +0000 2022  Hashtags:  [{'text': 'RMBS', 'indices': [48, 53]}, {'text': 'HurricaneIan', 'indices': [122, 135]}, {'text': 'MortgageBackedSecurities', 'indices': [247, 272]}]  Likes:  729  Url:  1
1575917476382056449
Tweet_id: 

In [30]:
query = "climate"
scores,word2vectop20 = search(query, index, datos_diccionario,tf,idf,model,title_index)

print("\n======================\nTop 20 results for the searched query, using Word2Vec :\n")
for d_id in word2vectop20[:top]:
    print(d_id)
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Top 20 results for the searched query, using Word2Vec :

1575858986720845827
Tweet_id:  1575858986720845827  Username:  chriscartw83  Date:  Fri Sep 30 14:43:55 +0000 2022  Hashtags:  [{'text': 'Climate', 'indices': [14, 22]}, {'text': 'Ian', 'indices': [43, 47]}, {'text': 'hurricaneIan', 'indices': [48, 61]}]  Likes:  32966  Url:  1
1575860718335045634
Tweet_id:  1575860718335045634  Username:  adamzyglis  Date:  Fri Sep 30 14:50:47 +0000 2022  Hashtags:  [{'text': 'IanHurricane', 'indices': [109, 122]}, {'text': 'HurricaneIan', 'indices': [123, 136]}, {'text': 'Florida', 'indices': [137, 145]}, {'text': 'ClimateCrisis', 'indices': [146, 160]}]  Likes:  11618  Url:  5
1575864220310114308
Tweet_id:  1575864220310114308  Username:  josh_a_scott  Date:  Fri Sep 30 15:04:42 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [76, 89]}]  Likes:  12906  Url:  0
1575867077138599937
Tweet_id:  1575867077138599937  Username:  Earth42morrow  Date:  Fri Sep 30 15:16:04 +0000 2022  Hasht

In [31]:
query = "crisis"
scores,word2vectop20 = search(query, index, datos_diccionario,tf,idf,model,title_index)

print("\n======================\nTop 20 results for the searched query, using Word2Vec :\n")
for d_id in word2vectop20[:top]:
    print(d_id)
    print('Tweet_id: ', title_index[d_id][0], ' Username: ', title_index[d_id][1], ' Date: ', title_index[d_id][2], ' Hashtags: ', title_index[d_id][3], ' Likes: ', title_index[d_id][4], ' Url: ', title_index[d_id][5])


Top 20 results for the searched query, using Word2Vec :

1575909479220416512
Tweet_id:  1575909479220416512  Username:  Restoration1HDQ  Date:  Fri Sep 30 18:04:33 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [0, 13]}]  Likes:  305  Url:  0
1575906504066080768
Tweet_id:  1575906504066080768  Username:  elizabethcrisp  Date:  Fri Sep 30 17:52:44 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [9, 22]}]  Likes:  17650  Url:  2
1575865559354425347
Tweet_id:  1575865559354425347  Username:  MagellanHealth  Date:  Fri Sep 30 15:10:02 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [35, 48]}]  Likes:  1052  Url:  0
1575861555673145345
Tweet_id:  1575861555673145345  Username:  LegalAid941  Date:  Fri Sep 30 14:54:07 +0000 2022  Hashtags:  [{'text': 'HurricaneIan', 'indices': [40, 53]}]  Likes:  33  Url:  0
1575907404041965569
Tweet_id:  1575907404041965569  Username:  citizenfisher  Date:  Fri Sep 30 17:56:18 +0000 2022  Hashtags:  [{'text': 'Hurricane