In [None]:
import pandas as pd
import numpy as np
import ast
import nltk
from itertools import chain
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
from statistics import mean
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') is called further down.
nltk.download('stopwords')
# Base directories for the per-user tweet CSVs. Files are addressed as
# <PATH> + username + '.csv', so the trailing slash matters. Both paths are
# relative, i.e. resolved against the notebook's working directory.
DEM_PATH = '../../data/twitter/raw/users/democrats/required/'
# NOTE(review): in this view REP_PATH is only referenced from the commented-out
# Jaccard cell - confirm whether the cosine-similarity cell should read from it too.
REP_PATH = '../../data/twitter/raw/users/republicans/required/'

In [None]:
# Utility function to flatten column (in the form of list of lists) into a list
def flatten2dList(df, column):
    """
    Flatten a column whose cells each hold a list into one flat Python list.

    A cell is normally the string repr of a list (e.g. "['a', 'b']"), which is
    parsed with ast.literal_eval. Cells that already contain a list/tuple are
    used as they are. Missing cells (None/NaN/NA) and blank strings are skipped
    instead of crashing the parse.

    Input:
        df:     dataframe to be processed
        column: column from the dataframe to be flattened
    Returns:
        list: the elements of every cell, concatenated in row order
              (a plain list, not a pandas Series)
    Raises:
        ValueError, SyntaxError: a string cell is not a valid Python literal
        TypeError: a cell evaluates to something that is not iterable
    """
    flattened = []
    for cell in df[column].tolist():
        if isinstance(cell, str):
            if not cell.strip():
                # Blank text carries no items and would be a SyntaxError to parse.
                continue
            cell = ast.literal_eval(cell)
        elif pd.api.types.is_scalar(cell) and pd.isna(cell):
            # Missing value: nothing to contribute.
            continue
        # '[]' parses to an empty list, so it naturally adds nothing here.
        flattened.extend(cell)
    return flattened

In [None]:
def create_dataframe(matrix, tokens):
    """
    Wrap a 2-D matrix in a DataFrame with readable row and column labels.

    Input:
        matrix: iterable of rows, one row per document
        tokens: column labels, one per matrix column
    Returns:
        DataFrame indexed 'doc_1', 'doc_2', ... with `tokens` as its columns
    """
    row_labels = []
    # Number the rows from 1 so the labels read doc_1, doc_2, ...
    for position, _row in enumerate(matrix, start=1):
        row_labels.append('doc_' + str(position))
    return pd.DataFrame(matrix, index=row_labels, columns=tokens)

In [None]:
# Sentence-embedding model, created once at notebook level so later cells can
# reuse it; calculateCosineSimilarity calls model.encode() to embed tweets.
model = SentenceTransformer('paraphrase-mpnet-base-v2')

In [None]:
# Stopword vocabulary: NLTK's English list plus the extra tokens listed below.
# The result is a set; in this view only the commented-out Jaccard cell reads it.
additionalStopwords = ['http','https','amp','CO','Trump','Trump2016','Donald','Clinton','Hillary','realDonaldTrump','will','say','said','let','vote','now','go','today','thanks','thank']
stopWords = set(stopwords.words('english')).union(additionalStopwords)

In [None]:
# def calculateJaccardSimilarity(username1, username2):
#     """
#     Calculates the Jaccard Similarity Score between two twitter users on the basis of tweets.
#     Inputs:
#         username1, username2: usernames of the two users
#     Output: Jaccard Similarity Score for tweets
#     """
#     user1_df = pd.read_csv(REP_PATH+username1+'.csv')
#     if isinstance(username2, list):
#         user2_listOfDataframes = []
#         for users in username2:
#             user2_df = pd.read_csv(REP_PATH+users+'.csv')
#             user2_listOfDataframes.append(user2_df)
#         user2_df = pd.concat(user2_listOfDataframes)
#     else:
#         user2_df = pd.read_csv(REP_PATH+username2+'.csv')

#     # Calculate Jaccard Similarity on Tweets
#     user1_tweets = ((user1_df.tweet.values).astype('str')).tolist()
#     user2_tweets = ((user2_df.tweet.values).astype('str')).tolist()
    
#     jaccardSimilarityList = []
    
#     for user1_tweet in user1_tweets:
#         query_tw = user1_tweet
#         query_words = set(query_tw.split())
#         filtered_query_words = set([w for w in query_words if w not in stopWords])
        
#         maxJaccardSimList = []
        
#         for user2_tweet in user2_tweets:
#             user2_words = set(user2_tweet.split())
#             # filtered_user2_words = set([w for w in user2_words if w not in stopWords])
            
#             jaccardSimQuery = round(len(query_words.intersection(user2_words))/len(query_words.union(user2_words)), 7)
        
#             maxJaccardSimList.append(jaccardSimQuery)
            
#         maxJaccardSim = max(maxJaccardSimList)
#         jaccardSimilarityList.append(maxJaccardSim)
        
#     avgJaccardSimilarity = mean(jaccardSimilarityList)  
    
#     return avgJaccardSimilarity


In [None]:
def _read_user_csv(username, data_dirs):
    """Return the first '<dir><username>.csv' that can be read from data_dirs."""
    for data_dir in data_dirs:
        try:
            return pd.read_csv(data_dir + username + '.csv')
        except FileNotFoundError:
            continue
    raise FileNotFoundError(
        f"No CSV found for user '{username}' in any of: {list(data_dirs)}"
    )


def _load_tweets(usernames, data_dirs):
    """Collect the non-missing tweet texts of one user or several users."""
    if isinstance(usernames, str):
        usernames = [usernames]
    frames = [_read_user_csv(name, data_dirs) for name in usernames]
    # Drop missing tweets first: casting NaN to str would yield the literal
    # text 'nan', which would then be embedded and scored as a real tweet.
    tweets = pd.concat(frames)['tweet'].dropna()
    return tweets.astype(str).tolist()


def calculateCosineSimilarity(username1, username2, data_dirs=None):
    """
    Calculates the Cosine Similarity Score between two twitter users on tweets.

    Every tweet of username1 is matched with its most similar tweet of
    username2 (cosine similarity of the sentence embeddings produced by the
    notebook-level `model`); the score is the mean of those best matches.
    The measure is therefore not symmetric in its two arguments.

    Inputs:
        username1: username of the first user (CSV file name without '.csv')
        username2: username of the second user, or a list of usernames whose
                   tweets are pooled together
        data_dirs: optional directory (str) or sequence of directories to look
                   in for '<username>.csv'. Defaults to DEM_PATH, then REP_PATH,
                   so accounts stored under either party directory resolve.
    Output: Cosine Similarity Score for tweets (float)
    Raises:
        FileNotFoundError: a user's CSV is in none of the searched directories
        ValueError: either side has no usable tweets
    """
    if data_dirs is None:
        data_dirs = (DEM_PATH, REP_PATH)
    elif isinstance(data_dirs, str):
        data_dirs = (data_dirs,)

    user1_tweets = _load_tweets(username1, data_dirs)
    user2_tweets = _load_tweets(username2, data_dirs)
    if not user1_tweets or not user2_tweets:
        raise ValueError('Both users need at least one non-missing tweet to compare')

    # Calculate Cosine Similarity on Tweets
    user1_embeddings = model.encode(user1_tweets, convert_to_tensor=True)
    user2_embeddings = model.encode(user2_tweets, convert_to_tensor=True)

    # Score a block of username1 rows per call instead of one row at a time.
    # Chunking keeps the score matrix at chunk_rows x len(user2_tweets) rather
    # than materialising the full pairwise matrix for large accounts.
    chunk_rows = 512
    best_matches = []
    for start in range(0, len(user1_embeddings), chunk_rows):
        block = user1_embeddings[start:start + chunk_rows]
        scores = util.pytorch_cos_sim(block, user2_embeddings)
        # Best match in username2 for each username1 tweet of this block.
        best_matches.extend(scores.max(dim=1).values.tolist())

    return mean(best_matches)

In [None]:
# Accounts to compare, given as CSV file names without the '.csv' extension.
candidate1, candidate2 = 'realDonaldTrump', 'HouseGOP'

# jsTweets = calculateJaccardSimilarity(candidate1, candidate2)
csTweets = calculateCosineSimilarity(candidate1, candidate2)

# Report which pair was compared, then the score.
print(f'{candidate1} VS {candidate2}')
# print('Jaccard Similarity for Tweets:', jsTweets)
print('Cosine Similarity for Tweets:', csTweets)