In [33]:
from rapidfuzz import process as pr
import pandas as pd
import numpy as np
from textblob import TextBlob # to convert plurals to singulars

### RapidFuzz

In [2]:
# #load set of DW keywords before fuzzyWuzzy into the file
uni_kw=pd.read_csv('../data/interim/out_2019-2021_keywords_before_FuzzyWuzzy.csv', names = ['ind', 'keyword'], header=0) 
#unique_keywords=uni_kw

In [3]:
print(len(uni_kw))
uni_kw.head()

32704


Unnamed: 0,ind,keyword
0,0,malware
1,1,state repression
2,2,climate consensus
3,3,Canary Islands
4,4,Angela I


# Cleaning

### Cleaning 1: all in lower case

In [69]:
keywords = list(uni_kw['keyword'])
keywords_clean = list(set(uni_kw['keyword'].str.lower())) # Cleaning: Write all in lower case

print('original: ', len(keywords))
print('all in lower case: ', len(keywords_clean))

original:  32704
all in lower case:  30385


### Cleaning 2: split keywords that were not split

In [70]:
substring = ', '

list_multikw = [kw for kw in keywords_clean if substring in kw] # keywords that did not get split
new_keywords = [kw.split(substring) for kw in list_multikw] # make a list of new keywords (the splited multi kw)

# Flatten list of list of new keywords
flat_new_keywords = [item for sublist in new_keywords for item in sublist]
flat_new_keywords.remove('') # remove empty values

# Remove the non-seperated keywords
for el in list_multikw:
    keywords_clean.remove(el)

# Add the seperated ones
keywords_clean = keywords_clean + flat_new_keywords

print('after splitting:', len(keywords_clean)) # number is higher because split long keyowrds into multiple

after splitting: 30404


### Cleaning 3: remove '\u2002' and '.' and '\xa0'

In [84]:
keywords_clean = list(set([kw.replace('\u2002','') for kw in keywords_clean]))
keywords_clean = list(set([kw.replace('.','') for kw in keywords_clean]))
keywords_clean = list(set([kw.replace('\xa0',' ') for kw in keywords_clean]))
keywords_clean.remove('') # TODO understand why there is an empty row
print('after removing u2022:', len(keywords_clean))

after removing u2022: 30320


# Compute similarity of each keyword of each other

In [86]:
ratio_array= pr.cdist(keywords_clean, keywords_clean, score_cutoff = 90)
len(ratio_array)

: 

: 

In [None]:
df_array = pd.DataFrame(ratio_array, columns = keywords_clean)
df_array.head(6)

ValueError: Shape of passed values is (30321, 30321), indices imply (30321, 30320)

### Extract similar keywords based on this matrix

In [74]:
# Count number of non zero values in each row
nb_non_zero = np.count_nonzero(np.asarray(ratio_array), axis=1) 

# Save indices of rows with more than 1 non-zero value
indices_correlating_rows = [i for i, el in enumerate(list(nb_non_zero)) if el>1]
len(indices_correlating_rows)

4979

In [77]:
for i in range(0,30):
    list_similar_words = [keyword for val, keyword in zip(list(df_array.iloc[indices_correlating_rows[i]]), keywords_clean) if val!=0]
    print(list_similar_words)

['trans-atlantic relations', 'saif al-islam', 'phil spector']
['zaikur rehman lakhvi', 'chuck schumer']
['terror list', 'syrian war']
['bosporus', '#speakup! barometer']
['global media forum 2018', 'euros', 'etna']
['united nations', 'repurposing']
['737max', 'emoglyphs']
['aphrodisiacs', 'bushido']
['skripals', 'lodz']
['synagoge', 'blasphemey']
['vineyards', 'hawaii']
['deep-sea mining', 'la soufriere']
['tortured', 'dorian']
['reeducation camps', 'agent orange', 'big tech']
['north rhine westphalia', 'us house of represntatives', 'infectious diseases', 'travel literature']
['the sun', 'khalid sheikh mohammed']
['halyna hutchinson', 'space mission']
['spelling bee', 'james lawrence']
['berlin marathon', 'quantative easing']
['minsk protests', 'german weather service']
['helen clark', 'kirakira+']
['azza\xa0karam', 'cultural policy in germany']
['demonstrations;violence', 'bath houses']
['the left party', 'electricity outages']
['deep-sea mining', 'la soufriere']
['demilitarized zone'

In [None]:
# Use this to remove singlulars from those words (don't do before because too many errors, e.g., us = u, thomas = thoma)
keywords_clean = [' '.join(list(TextBlob(kw).words.singularize())) for kw in keywords_clean]
print('after removing u2022:', len(keywords_clean))
keywords_clean