# Pipeline to clean the keyword column 

In [68]:
import json
import pandas as pd
import itertools # to flatten lists of lists

## Load data

In [3]:
# Read the raw JSON export. The context manager guarantees the file handle is
# closed once parsing is done, and the explicit encoding keeps the read
# independent of the platform's locale default (JSON text is UTF-8 by convention).
with open('../data/raw/CMS_2010_to_June_2022_ENGLISH.json', encoding='utf-8') as f:
    # parse the JSON document into plain Python objects
    data = json.load(f)

# convert to data frame
df = pd.DataFrame.from_dict(data)

In [5]:
# Display the column labels of the loaded frame
df.columns

Index(['id', 'name', 'shortTitle', 'text', 'teaser', 'shortTeaser', 'kicker',
       'regions', 'keywords', 'keywordStrings', 'thematicFocusCategory',
       'navigations', 'categories', 'departments', 'firstPublicationDate',
       'lastModifiedDate', 'contentDate', 'relatedAutoTopics', 'contentLinks',
       'articles', 'isOpinion', 'geographicLocations', 'contentAssociations',
       'mainContentImageLink', 'images', 'externalLinks', 'topStory',
       'language'],
      dtype='object')

## Cleaning

In [124]:
# Plain Python list with one element per row of the 'keywordStrings' column
keywords = list(df['keywordStrings'])

### Cleaning 1: put everything in lowercase

In [126]:
# Case-fold every keyword, keeping one inner list per entry of `keywords`
keywords_lower = [[kw.casefold() for kw in kw_list] for kw_list in keywords]

In [127]:
# Visual check only (can remove later on): first entry before and after case-folding
first_before, first_after = keywords[0], keywords_lower[0]
print('before:', first_before)
print('after: ', first_after)

before: ['DRC', 'M23', 'FDLR', 'Rwanda', 'Susan Rice', 'UN security council']
after:  ['drc', 'm23', 'fdlr', 'rwanda', 'susan rice', 'un security council']


### Cleaning 2: split comma-separated keywords that were stored as a single string

In [128]:
# Break up entries that still hold several keywords joined by ', ':
# each keyword is split on ', ' and the pieces are flattened back into a single
# list per entry with itertools.chain.from_iterable.
keywords_lower_split = [
    list(itertools.chain.from_iterable(kw.split(', ') for kw in kw_list))
    for kw_list in keywords_lower
]

In [129]:
# Visual check only (can remove later on).
# Collect the positions of entries holding at least one keyword that contains ', ',
# then show the second such entry before and after splitting.
items_with_unsplit_keywords = [
    idx
    for idx, kw_list in enumerate(keywords_lower)
    if any(', ' in kw for kw in kw_list)
]
unsplit_example_idx = items_with_unsplit_keywords[1]
print('before:', keywords_lower[unsplit_example_idx])
print('after: ', keywords_lower_split[unsplit_example_idx])

before: ['#speakup! barometer', 'freedom of speech, press freedom, freedom of expression', 'ghana']
after:  ['#speakup! barometer', 'freedom of speech', 'press freedom', 'freedom of expression', 'ghana']


### Cleaning 3: remove unwanted characters

In [130]:
# Per keyword, in this order: drop '\u2002', drop '.', turn '\xa0' into a regular space.
# Done in one pass over the nested lists; the input lists are left untouched.
keywords_lower_split_clean = [
    [kw.replace('\u2002', '').replace('.', '').replace('\xa0', ' ') for kw in kw_list]
    for kw_list in keywords_lower_split
]

In [131]:
# Visual check only (can remove later on).
# Collect the positions of entries holding at least one keyword that contains '\u2002',
# then show the first such entry before and after the character clean-up.
items_with_unwanted_characters = [
    idx
    for idx, kw_list in enumerate(keywords_lower_split)
    if any('\u2002' in kw for kw in kw_list)
]
unwanted_example_idx = items_with_unwanted_characters[0]
print('before:', keywords_lower_split[unwanted_example_idx])
print('after: ', keywords_lower_split_clean[unwanted_example_idx])

before: ['usa', 'trade pact', 'ethiopia', 'agoa', 'mali', '\u2002guinea']
after:  ['usa', 'trade pact', 'ethiopia', 'agoa', 'mali', 'guinea']
