# Pipeline to clean the keyword column 

In [348]:
import json
import pandas as pd
import itertools # to flatten lists of lists
import collections # to count
from rapidfuzz import process as pr
import numpy as np

## Load data

In [349]:
# Opening JSON file
f = open('../data/raw/CMS_2010_to_June_2022_ENGLISH.json')
  
# returns JSON object as a dictionary
data = json.load(f)

# convert to data frame
df = pd.DataFrame.from_dict(data)

#### Finding the subset of the data for 1 Jan 2019 - 1 Jan 2020 based on lastModifiedDate

In [350]:
df = df.sort_values(by='lastModifiedDate') #sort dataframe

datetimes = pd.to_datetime(df['lastModifiedDate'])
df['ts_lastModifiedDate']=datetimes
#df.iloc[ts_start]['ts_lastModifiedDate']

#find start index for subset 2019-2022
ts_start=datetimes[(datetimes > pd.Timestamp(year=2019, month=1, day=1).tz_localize('utc')) 
          & (datetimes < pd.Timestamp(year=2019, month=1, day=2).tz_localize('utc'))].min()
print(ts_start)

#find end date for subset 2019-2022
ts_end=datetimes[(datetimes > pd.Timestamp(year=2022, month=1, day=1).tz_localize('utc')) 
          & (datetimes < pd.Timestamp(year=2022, month=1, day=2).tz_localize('utc'))].min()
print(ts_end)

start_date=datetimes[datetimes == ts_start]
end_date=datetimes[datetimes == ts_end]

#find index for the chosen start and end dates
start_index=start_date.index[0]
print(start_index)
df[df.index == start_date.index[0]]

end_index=end_date.index[0]
print(end_index)
df[df.index == end_date.index[0]]

df_subset=df[start_index:end_index]

2019-01-01 03:57:28.904000+00:00
2022-01-01 02:35:51.098000+00:00
60278
150367


In [351]:
df_subset.columns

Index(['id', 'name', 'shortTitle', 'text', 'teaser', 'shortTeaser', 'kicker',
       'regions', 'keywords', 'keywordStrings', 'thematicFocusCategory',
       'navigations', 'categories', 'departments', 'firstPublicationDate',
       'lastModifiedDate', 'contentDate', 'relatedAutoTopics', 'contentLinks',
       'articles', 'isOpinion', 'geographicLocations', 'contentAssociations',
       'mainContentImageLink', 'images', 'externalLinks', 'topStory',
       'language', 'ts_lastModifiedDate'],
      dtype='object')

In [352]:
keywords = list(df_subset.keywordStrings)

## Functions

In [353]:
# Used mainly for visualisation, get indices of keywords with a certain substring
def get_items_with_substring(lst_lst_keywords, substring):
    indices = [i for i, lst_kw in enumerate(lst_lst_keywords) if any(list(map(lambda x: substring in x, lst_kw)))]
    return indices

## Cleaning

### Cleaning 1: put everything in lowercase

In [354]:
keywords_lower = [list(map(str.casefold, x)) for x in keywords]

In [355]:
# for visualisation only (can remove later on)
print('before:', keywords[0])
print('after: ', keywords_lower[0])

before: ['NASA', 'OSIRIS-REx', 'Bennu', 'asteroid']
after:  ['nasa', 'osiris-rex', 'bennu', 'asteroid']


### Cleaning 2: split keywords that haven't been split

In [356]:
# Split keywords: kw.split splits the keyword in a list of multiple keywords based on substring, itertools.chain flattens the list of lists
keywords_lower_split = [list(itertools.chain(*[kw.split(', ') for kw in lst_kw])) for lst_kw in keywords_lower]
keywords_lower_split = [list(itertools.chain(*[kw.split(' - ') for kw in lst_kw])) for lst_kw in keywords_lower_split] # spaces around '-' to not confuse with the ones within words

In [357]:
# for visualisation only (can remove later on)
items_with_unsplit_keywords = get_items_with_substring(keywords_lower, ', ') + get_items_with_substring(keywords_lower, ' - ')
print('Nb of keywords changed:', len(items_with_unsplit_keywords), '\n')
for i in range(2):
    print('before:', keywords_lower[items_with_unsplit_keywords[i]])
    print('after: ', keywords_lower_split[items_with_unsplit_keywords[i]], '\n')

Nb of keywords changed: 13 

before: ['freedom of speech, press freedom, freedom of expression']
after:  ['freedom of speech', 'press freedom', 'freedom of expression'] 

before: ['media', 'women', 'gender', 'freedom of speech, press freedom, freedom of expression', 'dw akademie', 'gender parity']
after:  ['media', 'women', 'gender', 'freedom of speech', 'press freedom', 'freedom of expression', 'dw akademie', 'gender parity'] 



### Cleaning 3: remove unwanted characters

In [358]:
# Remove '\u2002' and '.', '" ', '"', 'keywords: ' (replace with empty)
keywords_lower_split_clean = keywords_lower_split
keywords_lower_split_clean = [list(map(lambda x: x.replace('\u2002', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]
keywords_lower_split_clean = [list(map(lambda x: x.replace('.', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]
keywords_lower_split_clean = [list(map(lambda x: x.replace('" ', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]
keywords_lower_split_clean = [list(map(lambda x: x.replace('"', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]
keywords_lower_split_clean = [list(map(lambda x: x.replace('keywords: ', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]

# Replace '\xa0' with space
keywords_lower_split_clean = [list(map(lambda x: x.replace('\xa0', ' '), lst_kw)) for lst_kw in keywords_lower_split_clean]

In [359]:
# for visualisation only (can remove later on)
items_with_unwanted_characters = get_items_with_substring(keywords_lower, substring = '"')
print('Nb of keywords changed:', len(items_with_unwanted_characters), '\n')
for i in range(2):
    print('before:', keywords_lower_split[items_with_unwanted_characters[i]])
    print('after: ', keywords_lower_split_clean[items_with_unwanted_characters[i]], '\n')

Nb of keywords changed: 85 

before: ['israel', 'syria', 'iran', 'hezbollah', '"islamic state"']
after:  ['israel', 'syria', 'iran', 'hezbollah', 'islamic state'] 

before: ['emmanuel macron', 'letter', 'national debate', '"yellow vests"', 'marine le pen']
after:  ['emmanuel macron', 'letter', 'national debate', 'yellow vests', 'marine le pen'] 



### Cleaning 4: Clean sentences 
Heuristic: remove keywords that have more than 6 spaces


In [360]:
n_spaces = 6 # if there are more spaces than this number, the keyword is removed
keywords_lower_split_clean_short = [[kw for kw in lst_kw if kw.count(' ')<n_spaces] for lst_kw in keywords_lower_split_clean]

In [361]:
# for visualisation only (can remove later on)
items_with_sentences = [i for i,lst_kw in enumerate(keywords_lower_split_clean) if any(list(map(lambda x: x.count(' ')>=n_spaces, lst_kw)))]
print('Nb of keywords changed:', len(items_with_sentences), '\n')
for i in range(5):
    print('before:', keywords_lower_split_clean[items_with_sentences[i]])
    print('after: ', keywords_lower_split_clean_short[items_with_sentences[i]], '\n')

Nb of keywords changed: 67 

before: ['germany', 'gerd müller', 'german federal ministry for economic cooperation and development', 'zambia', 'malawi', 'namibia', 'angela merkel', 'olaf scholz']
after:  ['germany', 'gerd müller', 'zambia', 'malawi', 'namibia', 'angela merkel', 'olaf scholz'] 

before: ['morocco', 'misogyny', 'domestic violence', 'feminism', 'euro-mediterranean foundation of support to human rights defenders']
after:  ['morocco', 'misogyny', 'domestic violence', 'feminism'] 

before: ['adama dieng', 'united nations', 'special advisor on the prevention of genocide', 'genocide', 'south sudan', 'central african republic', 'africa']
after:  ['adama dieng', 'united nations', 'genocide', 'south sudan', 'central african republic', 'africa'] 

before: ['afd', 'office for the protection of the constitution', 'thomas haldenwang']
after:  ['afd', 'thomas haldenwang'] 

before: ['mormon', 'rome', 'church', 'church of jesus christ of latter-day saints']
after:  ['mormon', 'rome', 'c

# Save

In [362]:
print('Number of unique keywords before cleaning:', len(set(list(itertools.chain(*keywords)))))
print('Number of unique keywords before cleaning:', len(set(list(itertools.chain(*keywords_lower_split_clean_short)))))

Number of unique keywords before cleaning: 32682
Number of unique keywords before cleaning: 30243


In [363]:
#print(keywords[0:2])
#print(keywords_lower_split_clean_short[0:2])

#df_2019_2020 = df_subset.copy()
#df_2019_2020['keywordStringsClean'] = keywords_lower_split_clean_short

#filepath = '../data/interim/clean_keywords_2019-2021_before_FuzzyWuzzy.csv'
#df_2019_2020.to_csv(filepath, index=False)  

[['NASA', 'OSIRIS-REx', 'Bennu', 'asteroid'], ['English Channel', 'migration', 'boats', 'illegal immigration']]
[['nasa', 'osiris-rex', 'bennu', 'asteroid'], ['english channel', 'migration', 'boats', 'illegal immigration']]


## Count occurence of each keyword
Will be used to know which one to keep in fuzzy wuzzy (the most used)

In [364]:
keywords_flat = list(itertools.chain(*keywords_lower_split_clean_short)) # Flatten list
keywords_freq = collections.Counter(keywords_flat)

In [365]:
# For visualisation, can be removed
keywords_freq_df = pd.DataFrame.from_dict(keywords_freq, orient='index', columns = ['freq'])
keywords_freq_df.sort_values(by='freq', ascending=False).head(10)

Unnamed: 0,freq
coronavirus,3817
germany,3299
covid-19,2641
china,1669
russia,1354
donald trump,1333
asia,1260
us,1087
eu,1078
bundesliga,905


# Rapid Fuzz

In [366]:
# extract unique ones and remove the empty entry
unique_keywords = list(set(keywords_flat))
unique_keywords.remove('')

# run rapid fuzz
ratio_array= pr.cdist(unique_keywords, unique_keywords, score_cutoff = 90)

### Find words correlating together and display

In [367]:
df_array = pd.DataFrame(ratio_array, columns = unique_keywords)

# Count number of non zero values in each row
nb_non_zero = np.count_nonzero(np.asarray(ratio_array), axis=1) 

# Save indices of rows with more than 1 non-zero value
indices_correlating_rows = [i for i, el in enumerate(list(nb_non_zero)) if el>1]

In [377]:
# Make a list of similar keywords
all_similar_kw = []
for i in indices_correlating_rows:
    similar_words = [keyword for val, keyword in zip(list(df_array.iloc[i]), unique_keywords) if val!=0]
    all_similar_kw.append(similar_words)

In [372]:
# Make a list of similar keywords
all_similar_kw = []
for i in range(0,40):
    similar_words = [keyword for val, keyword in zip(list(df_array.iloc[indices_correlating_rows[i]]), unique_keywords) if val!=0]
    all_similar_kw.append(similar_words)

# Unify similar keywords
Replace by most frequent one

In [383]:
# Runs for 14min !!
# Split in the ones which have equal word number, and the ones which don'try
# TODO: Problem: UN climate summit, US department of justice: only change if same number of words?

similar_kws_same_word_nb = [sim_kws for sim_kws in all_similar_kw if len(set([kw.count(' ') for kw in sim_kws]))==1]
similar_kws_diff_word_nb = [sim_kws for sim_kws in all_similar_kw if len(set([kw.count(' ') for kw in sim_kws]))!=1]

# Replace
keywords_replaced = keywords_lower_split_clean_short

for sim_kws in similar_kws_same_word_nb:
    
    # Make list of frequencies for those similar kws
    sim_kws_freq = [keywords_freq[word] for word in sim_kws]
    
    for kw in sim_kws:

        # the new keyword is the one with the highest frequency
        right_kw = sim_kws[sim_kws_freq.index(max(sim_kws_freq))]

        # replace similar keywords by the most frequent one
        keywords_replaced = [list(map(lambda x: right_kw if x==kw else x, lst_kw)) for lst_kw in keywords_replaced]

In [385]:
# Example for visualisation, can be removed later
for sim_kws in all_similar_kw[0:2]:
    for word in sim_kws:
        print('freq of:', word, ' = ', keywords_freq[word])
    print('')

n_show = 2 # how many examples to show

i_show = 0
for kw_before, kw_after in zip(keywords_lower_split_clean_short, keywords_replaced):
    if kw_before != kw_after and i_show < n_show:
        print('kw_before:', kw_before)
        print('kw_after:', kw_after, '\n')
        i_show += 1

freq of: sausage  =  7
freq of: sausages  =  3

freq of: minumum wage  =  1
freq of: minimum wage  =  10

kw_before: ['iran', 'sanctions', 'civil society', 'nuclear deal', 'jcpoa', 'us sanctions', 'tehran', 'hassan rouhani', 'ayatollah khamenei']
kw_after: ['iran', 'sanctions', 'civil society', 'nuclear deal', 'jcpoa', 'us sanctions', 'tehran', 'hassan rouhani', 'ayatollah khomeini'] 

kw_before: ["new year's", 'countdown', 'resolutions']
kw_after: ["new year's", 'countdown', 'solutions'] 



# Save

In [386]:
print('Number of unique keywords before cleaning:', len(set(list(itertools.chain(*keywords)))))
print('Number of unique keywords after first clean:', len(set(list(itertools.chain(*keywords_lower_split_clean_short)))))
print('Number of unique keywords after fuzzywuzzy replacing:', len(set(list(itertools.chain(*keywords_replaced)))))

Number of unique keywords before cleaning: 32682
Number of unique keywords after first clean: 30243
Number of unique keywords after fuzzywuzzy replacing: 27962


In [402]:
list_dates = list(df_subset['lastModifiedDate'])
list_kws = list(df_subset['keywordStrings'])
list_new_kws = keywords_replaced

df_2019_2020 = pd.DataFrame(list(zip(list_dates, list_kws, list_new_kws)), columns=['lastModifiedDate', 'keywordStrings', 'keywordStringsCleanAfterFuzz'])

df_2019_2020.head()

Unnamed: 0,lastModifiedDate,keywordStrings,keywordStringsCleanAfterFuzz
0,2019-01-01T03:57:28.904Z,"[NASA, OSIRIS-REx, Bennu, asteroid]","[nasa, osiris-rex, bennu, asteroid]"
1,2019-01-01T06:11:50.527Z,"[English Channel, migration, boats, illegal im...","[english channel, migration, boats, illegal im..."
2,2019-01-01T06:14:35.563Z,"[Brazil, Jair Bolsonaro, Chicago economics, Ha...","[brazil, jair bolsonaro, chicago economics, ha..."
3,2019-01-01T08:26:11.599Z,"[Japan, Tokyo, Harajuku, attack]","[japan, tokyo, harajuku, attack]"
4,2019-01-01T09:05:00.736Z,"[Asia, Bangladesh, elections, Kamal Hossain, S...","[asia, bangladesh, elections, kamal hossain, s..."


In [416]:
filepath = '../data/interim/clean_keywords_2019-2021_after_FuzzyWuzzy.json'

# storing the data in JSON format
df_2019_2020.to_json(filepath, orient = 'split', compression = 'infer', index = 'true')
 
# reading the JSON file
# df_loaded = pd.read_json(filepath, orient ='split', compression = 'infer')
# flat_keywords = list(itertools.chain(*list(df['keywordStringsCleanAfterFuzz'])))
 