# Pipeline to clean the keyword column 

In [16]:
import json
import pandas as pd
import itertools # to flatten lists of lists
import collections # to count
from rapidfuzz import process as pr
import numpy as np
from itertools import chain

In [17]:
import sys
sys.path.append('../src/')

from data.preprocess_keywords import clean_keywords, get_items_with_substring
from data.make_datasets import get_data

## Load data

In [2]:
# Runs for 30s
# Load and extract data within time range
data_file = '../data/raw/CMS_2010_to_June_2022_ENGLISH.json'
start_date = '2019-01-01'
end_date = '2022-01-01'

df_subset = get_data(data_file, start_date, end_date)

## Functions

In [3]:
def get_nb_unique_kws(lst_lst_keywords):
    return len(set(list(itertools.chain(*lst_lst_keywords))))

## Cleaning

In [4]:
lst_lst_keywords = list(df_subset.keywordStrings)
lst_lst_keywords_clean = clean_keywords(lst_lst_keywords)

Cleaning DONE. Number of unique keywords went from 32682 to 30228


## Count occurence of each keyword
Will be used to know which one to keep in fuzzy wuzzy (the most used)

In [12]:
keywords_flat = list(chain(*lst_lst_keywords_clean)) # Flatten list
keywords_freq = collections.Counter(keywords_flat)

In [13]:
# For visualisation, can be removed
keywords_freq_df = pd.DataFrame.from_dict(keywords_freq, orient='index', columns = ['freq'])
keywords_freq_df.sort_values(by='freq', ascending=False).head(10)

Unnamed: 0,freq
coronavirus,3817
germany,3299
covid-19,2641
china,1669
russia,1354
donald trump,1333
asia,1260
us,1087
eu,1078
bundesliga,905


# Rapid Fuzz

In [9]:
# ! Runs for 2min !
# extract unique ones and remove the empty entry
unique_keywords = list(set(keywords_flat))
unique_keywords.remove('')

# run rapid fuzz
ratio_array= pr.cdist(unique_keywords, unique_keywords, score_cutoff = 90)

### Find words correlating together and display

In [10]:
df_array = pd.DataFrame(ratio_array, columns = unique_keywords)

# Count number of non zero values in each row
nb_non_zero = np.count_nonzero(np.asarray(ratio_array), axis=1) 

# Save indices of rows with more than 1 non-zero value
indices_correlating_rows = [i for i, el in enumerate(list(nb_non_zero)) if el>1]

df_array.head()

Unnamed: 0,russian spy,butterfly,taybeh brewing company,robot kitchen,afghansitan,march,hedgehogs,school,greenfield sites,crisis,...,meghan markle,left and right-wing extremism,shutdown,uganda election,chlorpyrifos,mett,new zealand volcano,alexander lukasheko,missing,rectal probes
0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# ! Runs for 1min !
# Make a list of similar keywords
all_similar_kw = []
for i in indices_correlating_rows:
    
    similar_words = [keyword for val, keyword in zip(list(df_array.iloc[i]), unique_keywords) if val!=0]
    
    # Only adds it if it's not there already
    if similar_words not in all_similar_kw:
        all_similar_kw.append(similar_words)

In [19]:
# TEST
items, _, _ = get_items_with_substring(all_similar_kw, 'european fi')
for i in items:
    print(all_similar_kw[i])

['european fires', 'european firms']


In [684]:
all_similar_kw[0:5]

[['sausage', 'sausages'],
 ['minumum wage', 'minimum wage'],
 ['un climate summit', 'climate summit'],
 ['sex abuse scandals', 'sexual abuse scandals', 'sex abuse scandal'],
 ['champiosn league',
  'champions leage',
  'champions league',
  'champion league']]

In [21]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip -q glove.6B.zip

In [23]:
#tokens = [keyword.split() for keyword in df.firstKeyword] # tokenise: split into words
#model_ft = FastText(tokens, vector_size = 500) # model_ft.wv['pollution']: get word vector for a specific word; model_ft.wv.similar_by_word('pollution', topn=5): Look for the 5 most similar words

#word_vectors = [np.array(model_ft.wv[token]) for token in tokens]
#word_vectors_av = np.array([np.average(word_vector, axis=0) for word_vector in word_vectors]) # if: ['dw','akademie'] = 2 vectors = we average them

#features = word_vectors_av
#features.shape

In [25]:
#keywords = ['ayatollah khomeini', 'ayatollah khameini', 'ayatollah khomenei', 'ayatollah ali khameini']
# rapid fuzz, keeps track of similarities above 90
#similarity_matrix= pr.cdist(keywords, keywords, score_cutoff = 90) 
#similarity_matrix

In [26]:
## TODO: Problem, should be considered as the same keyword and not different ones..
## In the line above, add a new word to a sublist instead of making a new list of some elements match ?
#items = get_items_with_substring(all_similar_kw, 'ayatollah')
#print('Eeach new line is considered as a single keyword:')
#for i in items:
#    print(all_similar_kw[i])

# Unify similar keywords
Replace by most frequent one

In [665]:
# Split in the ones which have equal word number, and the ones which don't, becase they need different processing
similar_kws_same_word_nb = [sim_kws for sim_kws in all_similar_kw if len(set([kw.count(' ') for kw in sim_kws]))==1]
similar_kws_diff_word_nb = [sim_kws for sim_kws in all_similar_kw if len(set([kw.count(' ') for kw in sim_kws]))!=1]

In [666]:
#### TEST

# TODO: problem: ['european fires', 'european firms']: word embedings
# TODO: problem: ["new year's", 'countdown', 'resolutions'] replaced by ["new year's", 'countdown', 'solutions']

item = get_items_with_substring(similar_kws_same_word_nb, 'european firms')
for i in item:
    print(similar_kws_same_word_nb[i])

['european fires', 'european firms']


In [629]:
## TODO: Focus on similar_kws_diff_word_nb


In [667]:
# ! Runs for 1min50 !

## Focus on similar_kws_same_word_nb

# TODO: problem: ['european fires', 'european firms']: word embedings
# TODO: problem: ["new year's", 'countdown', 'resolutions'] replaced by ["new year's", 'countdown', 'solutions']

# Replace
right_kw = [sim_kws[np.argmax([keywords_freq[word] for word in sim_kws])] for sim_kws in similar_kws_same_word_nb]

keywords_flat = list(chain.from_iterable(keywords_lower_split_clean_short))

replacement_only = [[right_kw[i] for i, j in enumerate(similar_kws_same_word_nb) if word in j] for word in keywords_flat]

keywords_flat_post = [replacement_only[i][0] if replacement_only[i] != [] else keywords_flat[i] for i in range(len(keywords_flat))]

def gen_list_of_lists(original_list, new_structure):
    assert len(original_list) == sum(new_structure), \
    "The number of elements in the original list and desired structure don't match"
    list_of_lists = [[original_list[i + sum(new_structure[:j])] for i in range(new_structure[j])] \
                     for j in range(len(new_structure))]
    return list_of_lists

keywords_replaced = gen_list_of_lists(keywords_flat_post, [len(x) for x in keywords_lower_split_clean_short])

In [668]:
# Example for visualisation, can be removed later
for sim_kws in all_similar_kw[0:2]:
    for word in sim_kws:
        print('freq of:', word, ' = ', keywords_freq[word])
    print('')

n_show = 2 # how many examples to show

i_show = 0
for kw_before, kw_after in zip(keywords_lower_split_clean_short, keywords_replaced):
    if kw_before != kw_after and i_show < n_show:
        print('kw_before:', kw_before)
        print('kw_after:', kw_after, '\n')
        i_show += 1

freq of: sausage  =  7
freq of: sausages  =  3

freq of: minumum wage  =  1
freq of: minimum wage  =  10

kw_before: ['iran', 'sanctions', 'civil society', 'nuclear deal', 'jcpoa', 'us sanctions', 'tehran', 'hassan rouhani', 'ayatollah khamenei']
kw_after: ['iran', 'sanctions', 'civil society', 'nuclear deal', 'jcpoa', 'us sanctions', 'tehran', 'hassan rouhani', 'ayatollah khomeini'] 

kw_before: ["new year's", 'countdown', 'resolutions']
kw_after: ["new year's", 'countdown', 'solutions'] 



# Save

In [669]:
print('Number of unique keywords before cleaning:', len(set(list(itertools.chain(*keywords)))))
print('Number of unique keywords after first clean:', len(set(list(itertools.chain(*keywords_lower_split_clean_short)))))
print('Number of unique keywords after rapidfuzz replacing:', len(set(list(itertools.chain(*keywords_replaced)))))

Number of unique keywords before cleaning: 32682
Number of unique keywords after first clean: 30228
Number of unique keywords after rapidfuzz replacing: 27988


In [670]:
list_ids = list(df_subset['lastModifiedDate'])
list_dates = list(df_subset['lastModifiedDate'])
list_kws = list(df_subset['keywordStrings'])
list_new_kws = keywords_replaced

df_2019_2020 = pd.DataFrame(list(zip(list_ids, list_dates, list_kws, list_new_kws)), columns=['id', 'lastModifiedDate', 'keywordStrings', 'keywordStringsCleanAfterFuzz'])

df_2019_2020.head()

Unnamed: 0,id,lastModifiedDate,keywordStrings,keywordStringsCleanAfterFuzz
0,2019-01-01T03:57:28.904Z,2019-01-01T03:57:28.904Z,"[NASA, OSIRIS-REx, Bennu, asteroid]","[nasa, osiris-rex, bennu, asteroid]"
1,2019-01-01T06:11:50.527Z,2019-01-01T06:11:50.527Z,"[English Channel, migration, boats, illegal im...","[english channel, migration, boats, illegal im..."
2,2019-01-01T06:14:35.563Z,2019-01-01T06:14:35.563Z,"[Brazil, Jair Bolsonaro, Chicago economics, Ha...","[brazil, jair bolsonaro, chicago economics, ha..."
3,2019-01-01T08:26:11.599Z,2019-01-01T08:26:11.599Z,"[Japan, Tokyo, Harajuku, attack]","[japan, tokyo, harajuku, attack]"
4,2019-01-01T09:05:00.736Z,2019-01-01T09:05:00.736Z,"[Asia, Bangladesh, elections, Kamal Hossain, S...","[asia, bangladesh, elections, kamal hossain, s..."


In [637]:
filepath = '../data/interim/clean_keywords_2019-2021_after_RapidFuzz.json'

# storing the data in JSON format
df_2019_2020.to_json(filepath, orient = 'split', compression = 'infer', index = 'true')
 
# reading the JSON file
# filepath = '../data/interim/clean_keywords_2019-2021_after_FuzzyWuzzy.json'
# df_loaded = pd.read_json(filepath, orient ='split', compression = 'infer')
# flat_keywords = list(itertools.chain(*list(df['keywordStringsCleanAfterFuzz'])))
 