# Pipeline to clean the keyword column 

In [68]:
import json
import pandas as pd
import itertools # to flatten lists of lists

## Load data

In [3]:
# Open the raw CMS export; the context manager closes the file handle even if parsing fails.
# JSON interchange text is UTF-8, so the encoding is pinned instead of relying on the platform default.
with open('../data/raw/CMS_2010_to_June_2022_ENGLISH.json', encoding='utf-8') as f:
    # json.load parses the whole file into plain Python objects
    data = json.load(f)

# convert to data frame
df = pd.DataFrame.from_dict(data)

In [5]:
# Display the available column labels; 'keywordStrings' is the column extracted for cleaning below
df.columns

Index(['id', 'name', 'shortTitle', 'text', 'teaser', 'shortTeaser', 'kicker',
       'regions', 'keywords', 'keywordStrings', 'thematicFocusCategory',
       'navigations', 'categories', 'departments', 'firstPublicationDate',
       'lastModifiedDate', 'contentDate', 'relatedAutoTopics', 'contentLinks',
       'articles', 'isOpinion', 'geographicLocations', 'contentAssociations',
       'mainContentImageLink', 'images', 'externalLinks', 'topStory',
       'language'],
      dtype='object')

In [124]:
# Materialise the keywordStrings column as a plain Python list (one entry per data frame row)
keywords = list(df['keywordStrings'])

## Functions

In [147]:
# Used mainly for visualisation, get indices of keywords with a certain substring
def get_items_with_substring(lst_lst_keywords, substring):
    """Return the positions of the keyword lists that contain ``substring``.

    Args:
        lst_lst_keywords: iterable of keyword lists; each inner element must
            support the ``in`` operator with ``substring`` (e.g. a string).
        substring: text to search for inside every individual keyword.

    Returns:
        A list of integer positions (ascending) of the inner lists in which at
        least one keyword contains ``substring``. Empty inner lists never match.
    """
    return [
        i
        for i, lst_kw in enumerate(lst_lst_keywords)
        # A generator lets any() stop at the first matching keyword instead of
        # testing (and storing the result for) every keyword in the list.
        if any(substring in kw for kw in lst_kw)
    ]

## Cleaning

### Cleaning 1: convert everything to lowercase (case-folding)

In [148]:
# Case-fold every keyword of every item; the nesting (one inner list per item) is preserved
keywords_lower = [[kw.casefold() for kw in lst_kw] for lst_kw in keywords]

In [149]:
# for visualisation only (can remove later on): first item before and after case-folding
for label, kw_lists in (('before:', keywords), ('after: ', keywords_lower)):
    print(label, kw_lists[0])

before: ['DRC', 'M23', 'FDLR', 'Rwanda', 'Susan Rice', 'UN security council']
after:  ['drc', 'm23', 'fdlr', 'rwanda', 'susan rice', 'un security council']


### Cleaning 2: split keywords that haven't been split

In [189]:
# What we are looking for to split by
substring = ', '

# Split keywords: kw.split turns one keyword into a list of keywords wherever substring occurs;
# itertools.chain.from_iterable flattens those lists back into one list per item
# (no argument unpacking and no intermediate list of lists).
keywords_lower_split = [
    list(itertools.chain.from_iterable(kw.split(substring) for kw in lst_kw))
    for lst_kw in keywords_lower
]

# for visualisation only (can remove later on)
items_with_unsplit_keywords = get_items_with_substring(keywords_lower, substring)
# Slice rather than index with range(4): the preview then shows at most 4 examples
# and cannot raise IndexError when fewer than 4 items contain the separator.
for item_idx in items_with_unsplit_keywords[:4]:
    print('before:', keywords_lower[item_idx])
    print('after: ', keywords_lower_split[item_idx], '\n')

before: ['southeast asia', 'world economic forum', 'asean', 'digitization', 'hanoi, vietnam']
after:  ['southeast asia', 'world economic forum', 'asean', 'digitization', 'hanoi', 'vietnam'] 

before: ['#speakup! barometer', 'freedom of speech, press freedom, freedom of expression', 'ghana']
after:  ['#speakup! barometer', 'freedom of speech', 'press freedom', 'freedom of expression', 'ghana'] 

before: ['ghana', 'africa', 'freedom of speech, press freedom, freedom of expression']
after:  ['ghana', 'africa', 'freedom of speech', 'press freedom', 'freedom of expression'] 

before: ['night grooves, talkshow, music, kim fisher, wigald boning, haus schminke, deutsche welle, dw']
after:  ['night grooves', 'talkshow', 'music', 'kim fisher', 'wigald boning', 'haus schminke', 'deutsche welle', 'dw'] 



### Cleaning 3: remove unwanted characters

In [155]:
# Per keyword, in this order: drop '\u2002', drop '.', then turn '\xa0' into a regular space
keywords_lower_split_clean = [
    [kw.replace('\u2002', '').replace('.', '').replace('\xa0', ' ') for kw in lst_kw]
    for lst_kw in keywords_lower_split
]

In [190]:
# for visualisation only (can remove later on)
# Item positions found in keywords_lower also index the split/clean lists, which are built
# with one output list per input item.
items_with_unwanted_characters = get_items_with_substring(keywords_lower, substring='\u2002')
# Slice rather than index with range(4): shows at most 4 examples and cannot raise
# IndexError when fewer than 4 items contain the character.
for item_idx in items_with_unwanted_characters[:4]:
    print('before:', keywords_lower_split[item_idx])
    print('after: ', keywords_lower_split_clean[item_idx], '\n')

before: ['usa', 'trade pact', 'ethiopia', 'agoa', 'mali', '\u2002guinea']
after:  ['usa', 'trade pact', 'ethiopia', 'agoa', 'mali', 'guinea'] 

before: ['\u2002germany', 'hamburg', 'freiburg', 'coronavirus', 'vaccine', 'protests', 'conspiracy theories', 'antivax']
after:  ['germany', 'hamburg', 'freiburg', 'coronavirus', 'vaccine', 'protests', 'conspiracy theories', 'antivax'] 

before: ['inflation', 'consumer prices', 'poverty', 'food bank', '\u2002pandemic', 'interest rates']
after:  ['inflation', 'consumer prices', 'poverty', 'food bank', 'pandemic', 'interest rates'] 

before: ['sierra leone', 'afcon 2021', 'african cup of nations', 'football', '\u2002soccer', 'african football']
after:  ['sierra leone', 'afcon 2021', 'african cup of nations', 'football', 'soccer', 'african football'] 



### Cleaning 4: other (inspect long keywords that were never split)

In [164]:
# Reuse the notebook's get_items_with_substring helper instead of repeating its comprehension inline
items = get_items_with_substring(keywords_lower, 'tropical tiger')
# Show the cleaned keywords of the first matching item
print(keywords_lower_split_clean[items[0]])

['tropical tiger asian bush mosquito  mosquito', 'diseases']


In [165]:
# Reuse the notebook's get_items_with_substring helper instead of repeating its comprehension inline
items = get_items_with_substring(keywords_lower, 'beijing olympics speedskating')
# Show the cleaned keywords of the first matching item
print(keywords_lower_split_clean[items[0]])

['beijing olympics speedskating doping gold medal']


In [166]:
# Reuse the notebook's get_items_with_substring helper instead of repeating its comprehension inline
items = get_items_with_substring(keywords_lower, 'allies of donald')
# Show the cleaned keywords of the first matching item
print(keywords_lower_split_clean[items[0]])

['allies of donald trump are set to challenge the election result in a longshot that is almost certain to fail']


In [181]:
# Positions of items holding at least one keyword with more than 3 space characters
items = [
    i
    for i, lst_kw in enumerate(keywords_lower_split_clean)
    # generator expression: any() stops at the first long keyword instead of scanning the whole list
    if any(kw.count(' ') > 3 for kw in lst_kw)
]

In [185]:
# Print the cleaned keyword list of every flagged item, one list per line
for item in items:
    flagged_keywords = keywords_lower_split_clean[item]
    print(flagged_keywords)

['syria', 'damascus', 'taftanaz', 'aleppo', 'syrian observatory for human rights', 'bashar al-assad', 'bashar assad']
['uli borowka battling the booze']
['middle east winter storm', 'snow in middle east', 'israel', 'jordan', 'west bank', 'gaza strip', 'syria', 'egypt', 'worst winter storm in middle east']
['eleven of the year 2012']
['mubarak', 'egypt', 'appeal', 'court', 'judge ahmed ali abdel rahman', 'cairo']
['dw akademie', 'bernard graf von der schulenburg', 'traineeship', 'journalism training', 'deutsche welle']
['syria', 'aleppo', 'syrian observatory for human rights']
['iraq', 'kurdish', 'kirkuk', 'bomb attacks', 'blast', 'the kurdistan democratic party', 'patriotic union of kurdistan party']
['immigrants', 'immigration', 'kröhnert', 'berlin institute for population and development', 'demographics', 'population']
['european court of human rights', 'religious freedom', 'christianity', 'homosexuality', 'crucifix']
['pakistan protests', 'tahir ul qadri', "pakistan people's party",