In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from collections import Counter
# from config import *
import json
import datetime
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel



In [None]:
#This function will clean our text from data that is not important so that has no weight 
def clean_text(tweet):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
   
    tweet = tweet.lower() # Transform in lowercase

    tweet = re.sub(r'@[a-zA-Z]+', '', tweet) # Here we remove the mentions in the tweet ex: @canodep
    tweet = re.sub(r"\B#([a-z0-9]{2,})(?![~!@#$%^&*()=+_`\-\|\/'\[\]\{\}]|[?.,]*\w)", '', tweet) # Here we remove the hashtags, because we will treat it later
    tweet = re.sub(r'[^\w\s]', '', tweet) # Here we remove punctuation marks
    tweet = re.sub(r'http\S+', '',tweet) # Remove http and https
    tweet = tweet.split() # Tokenize the text to get a list of terms

    tweet = [word for word in tweet if word not in stop_words]  # eliminate the stopwords
    tweet = [stemmer.stem(word) for word in tweet] # Perform stemming 
    return tweet
    

In [None]:
docs_path = 'data/tw_hurricane_data.json'
tweets_title = 'data/tweet_document_ids_map.csv'

tweets_id_title = {}

with open(tweets_title) as fp:
    lines = fp.readlines()


for l in lines:
    l = l.strip().split("\t")
    tweets_id_title[int(l[1])] =  l[0]


tweets = []
lines = []

for line in open(docs_path, 'r'):
    lines.append(line)
    #media = json.loads(line).get('entities').get('media')
    tweets.append({
        'id' : int(json.loads(line).get('id')),
        'title' : tweets_id_title[int(json.loads(line).get('id'))],
        'text': json.loads(line).get('full_text'),
        'username' : json.loads(line).get('user').get('screen_name'),
        'date' : json.loads(line).get('created_at'),
        'hashtag' : list(map(lambda hashtag:  hashtag.get('text'),  json.loads(line).get('entities').get('hashtags'))),
        'like' : json.loads(line).get('favorite_count'),
        'rt' : json.loads(line).get('retweet_count'),
        'URL' : 'https://twitter.com/' + json.loads(line).get('user').get('screen_name') + "/status/" + str(json.loads(line).get('id'))
    }) 

tweets_texts = [tweet['text'] for tweet in tweets]

In [None]:
def termFequency(term, document):
    return document.count(term) / len(document)

def inverseDocumentFrequency(term, documents):
    n = 0
    for doc in documents:
        if term.lower() in doc:
            n += 1
    return 1.0 + np.log(float(len(documents)) / n) if (n > 0) else 1.0


def tfiidf(term, document, documents):
    tf = termFequency(term, document)
    idf = inverseDocumentFrequency(term, documents)
    return tf * idf


tfiidf('neighborhood', clean_text(tweets[2]['text']), tweets_texts)

In [None]:
query = "Help and during the hurricane disaster"
def generateVectors(query, documents):
    query = clean_text(query)
    tf_idf_matrix = np.zeros((len(query), len(documents)))
    for i, term in enumerate(query):
        idf = inverseDocumentFrequency(term, documents)
        for j, document in enumerate(documents):
            tf_idf_matrix[i][j] = idf * termFequency(term, document)
    return tf_idf_matrix

tf_idf_matrix = generateVectors(query, tweets_texts)

def word_count(query):
    query = clean_text(query)
    count = dict()
    for word in query:
        if word in count:
            count[word] += 1
        else:
            count[word] = 1
    return count

def queryVector(query, documents):
    count = word_count(query)
    vector = np.zeros((len(count),1))
    for i, word in enumerate(clean_text(query)):
        vector[i] = float(count[word])/len(count) * inverseDocumentFrequency(word, documents)
    return vector

query_vector = queryVector(query, tweets_texts)


def cosineSimilarity(vector1, vector2):
    return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

In [337]:
import math
def compute_relevance(query, tweets):
    # Calculate relevance with cosine similarity
    documents = [tweet['text'] for tweet in tweets]
    query_vector = queryVector(query, documents)
    tf_idf_matrix = generateVectors(query, documents)
    relevance = []
    
    for i, document in enumerate(documents):
        relevance_i = (cosineSimilarity( tf_idf_matrix[:, i].reshape(1, -1), query_vector))
        relevance.append(relevance_i if math.isnan(relevance_i) == False else 0.0)
    for i, tweet in enumerate(tweets):
        tweet['relevance'] = relevance[i]
        
    return sorted(tweets, key=lambda tweet: tweet['relevance'], reverse=True)
    
tweets_ranked = compute_relevance(query, tweets)[:10]

for tweet in tweets_ranked:
    print("Tweet id: {} | tweet title: {} | relevance: {}".format(tweet["id"], tweet["title"], tweet["relevance"]))


Tweet id: 1575886758075678720 | tweet title: doc_1921 | relevance: [[1.]]
Tweet id: 1575874361675976705 | tweet title: doc_2610 | relevance: [[1.]]
Tweet id: 1575870774027911168 | tweet title: doc_2868 | relevance: [[1.]]
Tweet id: 1575880023617437696 | tweet title: doc_2224 | relevance: [[0.95129964]]
Tweet id: 1575911845927567375 | tweet title: doc_509 | relevance: [[0.95101843]]
Tweet id: 1575905585320497152 | tweet title: doc_1048 | relevance: [[0.95101843]]
Tweet id: 1575917149356691457 | tweet title: doc_68 | relevance: [[0.90621033]]
Tweet id: 1575917131564097536 | tweet title: doc_73 | relevance: [[0.90621033]]
Tweet id: 1575908420355379201 | tweet title: doc_834 | relevance: [[0.90621033]]
Tweet id: 1575905473261293570 | tweet title: doc_1057 | relevance: [[0.90621033]]


  return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))


## Custom Rank
We will sort the documents that contain the query by like and RT

In [338]:

def customRank(query, tweets):
    # CAlculate the relevance by the times query appears in the tweet
    # and the number of likes and retweets

    for word in clean_text(query):
        for tweet in tweets:
            tweet['relevance'] = 0
            if word in clean_text(tweet['text']):
                tweet['relevance'] += 1
            tweet['relevance'] += tweet['like'] + tweet['rt']
            
    return sorted(tweets, key=lambda tweet: tweet['relevance'], reverse=True)
        
    
tweets_ranked = customRank("work", tweets)[:10]

for tweet in tweets_ranked:
    print("Tweet id: {} | tweet title: {} | relevance: {}".format(tweet["id"], tweet["title"], tweet["relevance"]))



Tweet id: 1575908406355197952 | tweet title: doc_838 | relevance: 1645
Tweet id: 1575864357845495809 | tweet title: doc_3397 | relevance: 1222
Tweet id: 1575906862829953025 | tweet title: doc_970 | relevance: 1142
Tweet id: 1575863704222019585 | tweet title: doc_3458 | relevance: 563
Tweet id: 1575894492443336714 | tweet title: doc_1586 | relevance: 481
Tweet id: 1575859547012534273 | tweet title: doc_3743 | relevance: 400
Tweet id: 1575861505509462021 | tweet title: doc_3623 | relevance: 391
Tweet id: 1575875845125771270 | tweet title: doc_2493 | relevance: 313
Tweet id: 1575910903425695745 | tweet title: doc_581 | relevance: 270
Tweet id: 1575857697265635328 | tweet title: doc_3892 | relevance: 255


## BM25

In [340]:
def bm25(query, tweets, k1=1.5, b=0.75):
    # Calculate relevance with BM25 algorithm using k1=1.5 and b=0.75
    documents = [tweet['text'] for tweet in tweets]
    query = clean_text(query)
    tf_idf_matrix = np.zeros((len(query), len(documents)))
    for i, term in enumerate(query):
        idf = inverseDocumentFrequency(term, documents)
        for j, document in enumerate(documents):
            tf_idf_matrix[i][j] = idf * termFequency(term, document)
    relevance = []
    for i, document in enumerate(documents):
        relevance.append(np.sum(tf_idf_matrix[:, i] * (k1 + 1) / (tf_idf_matrix[:, i] + k1 * (1 - b + b * len(document) / np.mean([len(doc) for doc in documents])))))
    
    for i, tweet in enumerate(tweets):
        tweet['relevance'] = relevance[i]
    
    return sorted(tweets, key=lambda tweet: tweet['relevance'], reverse=True)
        
    
tweets_ranked = bm25("work", tweets)[:10]

for tweet in tweets_ranked:
    print("Tweet id: {} | tweet title: {} | relevance: {}".format(tweet["id"], tweet["title"], tweet["relevance"]))



Tweet id: 1575878683675578370 | tweet title: doc_2293 | relevance: 0.12596235467069963
Tweet id: 1575870693698568193 | tweet title: doc_2880 | relevance: 0.11963651580071205
Tweet id: 1575909937137565723 | tweet title: doc_673 | relevance: 0.11568219971711384
Tweet id: 1575858054683205632 | tweet title: doc_3861 | relevance: 0.11192204434229668
Tweet id: 1575884021573132288 | tweet title: doc_2033 | relevance: 0.10661860886047461
Tweet id: 1575915585505878016 | tweet title: doc_207 | relevance: 0.09419688566175363
Tweet id: 1575869948886335488 | tweet title: doc_2956 | relevance: 0.09009614357279695
Tweet id: 1575884212347211776 | tweet title: doc_2023 | relevance: 0.08553299920433532
Tweet id: 1575868031473168385 | tweet title: doc_3116 | relevance: 0.08526481354421626
Tweet id: 1575862322324611075 | tweet title: doc_3573 | relevance: 0.08151415751233337
