In [1]:
# import relevant moduls
import pandas as pd
pd.set_option('display.max_columns', None)
import glob
from tqdm.notebook import tqdm
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
import ast
import json
import numpy as np
from gensim.models import Word2Vec
from gensim import models
import plotly.express as px

In [2]:
# read and clean files
cluster_df = pd.read_csv('../../data/BTW17_Suggestions/suggestions/cluster.csv')
cluster_df.drop(columns='Unnamed: 0', axis=1, inplace=True)
topics_df = pd.read_csv('../../data/BTW17_Twitter/lda/topics.csv')
topics_df.drop(columns='Unnamed: 0', axis=1, inplace=True)

In [3]:
cluster_df.head()

Unnamed: 0,x,y,z,suggestion,cluster,vector
0,-5.934507,-13.884504,-15.370424,['waz'],0,[-0.2414 0.4842 -0.1616 -0.0521 0.6159 -0.84...
1,-4.854665,-10.56566,-15.258295,"['bild', 'zeitung']",0,[-0.1158 0.48233333 -0.212 -0.3332 ...
2,-4.574384,-12.412379,-15.251037,"['interview', 'badische', 'zeitung']",0,[ 9.11500007e-02 1.71550006e-01 1.30000710e-...
3,-5.265123,-12.737933,-17.156967,['tagesanzeiger'],0,[-0.43395 0.10869999 -0.36154997 -0.757949...
4,-2.996472,-10.088203,-17.916126,['traueranzeigen'],0,[-0.9071 -0.1063 -0.4996 0.0197 -0.5739 -0.61...


In [4]:
topics_df.head()

Unnamed: 0,hashtag,topic_words,scores,coherence
0,dobrindt,"['dobrindt', 'dieselgate', 'fahrverbote', 'ver...","['0.082', '0.070', '0.039', '0.037', '0.035', ...",0.374704
1,ard,"['ard', 'afd', 'btw', 'zdf', 'traudichdeutschl...","['0.112', '0.101', '0.094', '0.088', '0.085', ...",0.249636
2,zeitfürmartin,"['zeitfürmartin', 'meineschule', 'gebührenfrei...","['0.171', '0.037', '0.037', '0.037', '0.025', ...",0.421646
3,ehefueralle,"['ehefueralle', 'bundestag', 'merkel', 'stimme...","['0.124', '0.017', '0.017', '0.012', '0.011', ...",0.324373
4,digitalisierung,"['digitalisierung', 'nix', 'zukunft', 'eigentl...","['0.102', '0.049', '0.030', '0.029', '0.028', ...",0.258874


In [5]:
# read pretrained word2vec model
model = models.KeyedVectors.load_word2vec_format('../../data/Word2Vec/dewiki_20180420_100d.txt')

In [6]:
# get lists for faster iterating
hashtag = topics_df['hashtag'].tolist()
topic_words = topics_df['topic_words'].tolist()
scores = topics_df['scores'].tolist()

In [7]:
# retrieve vector per hashtag 
vectors = []

# iterate through hashtags
for i in range(len(hashtag)):
    hashtag_vector = []
    word_vectors = []
    
    # get word scores from lda
    word_scores = ast.literal_eval(scores[i])
    topics = ast.literal_eval(topic_words[i])
    for j in reversed(range(len(topics))):
        try:
            # append word vector from word2vec model
            word_vectors.append(model[topics[j]])
        except:
            # clean lists 
            topics.pop(j)
            word_scores.pop(j)
    
    # get weighted average vector per hashtag and save them in one list
    hashtag_vector = np.average(word_vectors, axis=0, weights=np.array(word_scores).astype(np.float))
    vectors.append(hashtag_vector)

In [8]:
# retrieve lists for suggestion and suggestions word2vec vectors
suggestions = []
for i in range(len(cluster_df)):
    suggestions.append(ast.literal_eval(cluster_df['suggestion'][i]))

sugg_vectors = cluster_df['vector'].tolist()

In [10]:
# get similarity score for every suggestion per hashtag
sim_scores = []

# iterate through suggestions
for i in tqdm(range(len(suggestions))):
    temp = []
    for vector in vectors:
        
        # clean suggestions vector because of shitty format
        sugg_vector = ast.literal_eval(sugg_vectors[i]
                                       .replace('\n', '')
                                       .replace(' ', ',')
                                       .replace('[,', '[')
                                       .replace(',]', ']')
                                       .replace(',,,,', ',')
                                       .replace(',,,', ',')
                                       .replace(',,', ','))
        
        # calculate vector similarity for suggestions vector and hashtag vector
        score = round(np.dot(sugg_vector, vector)
                      / (np.linalg.norm(sugg_vector)
                       * np.linalg.norm(vector)),3)
        temp.append(score)
    sim_scores.append(temp)

  0%|          | 0/25321 [00:00<?, ?it/s]

In [15]:
cluster = cluster_df['cluster'].tolist()

# create output df
similarity_df = pd.DataFrame({'suggestion': suggestions, 'cluster': cluster, 'similarity_scores': sim_scores})

In [18]:
similarity_df.head(3)

Unnamed: 0,suggestion,cluster,similarity_scores
0,[waz],0,"[0.39, 0.465, 0.393, 0.428, 0.296, 0.533, 0.29..."
1,"[bild, zeitung]",0,"[0.525, 0.605, 0.657, 0.57, 0.574, 0.516, 0.52..."
2,"[interview, badische, zeitung]",0,"[0.525, 0.511, 0.527, 0.495, 0.586, 0.509, 0.4..."


In [17]:
# save dfs to csv
similarity_df.to_csv('../../data/BTW17_Suggestions/suggestions/topic_similarity.csv')