# Pipeline to clean the keyword column 

In [68]:
import json
import pandas as pd
import itertools # to flatten lists of lists

## Load data

In [3]:
# Opening JSON file
f = open('../data/raw/CMS_2010_to_June_2022_ENGLISH.json')
  
# returns JSON object as a dictionary
data = json.load(f)

# convert to data frame
df = pd.DataFrame.from_dict(data)

In [5]:
df.columns

Index(['id', 'name', 'shortTitle', 'text', 'teaser', 'shortTeaser', 'kicker',
       'regions', 'keywords', 'keywordStrings', 'thematicFocusCategory',
       'navigations', 'categories', 'departments', 'firstPublicationDate',
       'lastModifiedDate', 'contentDate', 'relatedAutoTopics', 'contentLinks',
       'articles', 'isOpinion', 'geographicLocations', 'contentAssociations',
       'mainContentImageLink', 'images', 'externalLinks', 'topStory',
       'language'],
      dtype='object')

In [124]:
keywords = list(df.keywordStrings)

## Functions

In [212]:
# Used mainly for visualisation, get indices of keywords with a certain substring
def get_items_with_substring(lst_lst_keywords, substring):
    indices = [i for i, lst_kw in enumerate(lst_lst_keywords) if any(list(map(lambda x: substring in x, lst_kw)))]
    return indices

## Cleaning

### Cleaning 1: put everything in lowercase

In [213]:
keywords_lower = [list(map(str.casefold, x)) for x in keywords]

In [214]:
# for visualisation only (can remove later on)
print('before:', keywords[0])
print('after: ', keywords_lower[0])

before: ['DRC', 'M23', 'FDLR', 'Rwanda', 'Susan Rice', 'UN security council']
after:  ['drc', 'm23', 'fdlr', 'rwanda', 'susan rice', 'un security council']


### Cleaning 2: split keywords that haven't been split

In [215]:
# What we are looking for to split by
substring = ', '

# Split keywords: kw.split splits the keyword in a list of multiple keywords based on substring, itertools.chain flattens the list of lists
keywords_lower_split = [list(itertools.chain(*[kw.split(substring) for kw in lst_kw])) for lst_kw in keywords_lower]

# for visualisation only (can remove later on)
items_with_unsplit_keywords = get_items_with_substring(keywords_lower, substring)
for i in range(4):
    print('before:', keywords_lower[items_with_unsplit_keywords[i]])
    print('after: ', keywords_lower_split[items_with_unsplit_keywords[i]], '\n')

before: ['southeast asia', 'world economic forum', 'asean', 'digitization', 'hanoi, vietnam']
after:  ['southeast asia', 'world economic forum', 'asean', 'digitization', 'hanoi', 'vietnam'] 

before: ['#speakup! barometer', 'freedom of speech, press freedom, freedom of expression', 'ghana']
after:  ['#speakup! barometer', 'freedom of speech', 'press freedom', 'freedom of expression', 'ghana'] 

before: ['ghana', 'africa', 'freedom of speech, press freedom, freedom of expression']
after:  ['ghana', 'africa', 'freedom of speech', 'press freedom', 'freedom of expression'] 

before: ['night grooves, talkshow, music, kim fisher, wigald boning, haus schminke, deutsche welle, dw']
after:  ['night grooves', 'talkshow', 'music', 'kim fisher', 'wigald boning', 'haus schminke', 'deutsche welle', 'dw'] 



### Cleaning 3: remove unwanted characters

In [216]:
# Remove '\u2002' and '.', '" ', '"', 'keywords: ' (replace with empty)
keywords_lower_split_clean = keywords_lower_split
keywords_lower_split_clean = [list(map(lambda x: x.replace('\u2002', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]
keywords_lower_split_clean = [list(map(lambda x: x.replace('.', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]
keywords_lower_split_clean = [list(map(lambda x: x.replace('" ', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]
keywords_lower_split_clean = [list(map(lambda x: x.replace('"', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]
keywords_lower_split_clean = [list(map(lambda x: x.replace('keywords: ', ''), lst_kw)) for lst_kw in keywords_lower_split_clean]

# Replace '\xa0' with space
keywords_lower_split_clean = [list(map(lambda x: x.replace('\xa0', ' '), lst_kw)) for lst_kw in keywords_lower_split_clean]

In [217]:
# for visualisation only (can remove later on)
items_with_unwanted_characters = get_items_with_substring(keywords_lower, substring = '"')
for i in range(4):
    print('before:', keywords_lower_split[items_with_unwanted_characters[i]])
    print('after: ', keywords_lower_split_clean[items_with_unwanted_characters[i]], '\n')

before: ['kraftwerk', 'electronic music', 'düsseldorf', 'autobahn', '"autobahn"', 'german music', 'germany', 'music']
after:  ['kraftwerk', 'electronic music', 'düsseldorf', 'autobahn', 'autobahn', 'german music', 'germany', 'music'] 

before: ['united gipsy crew', 'nebud dilino', 'roma', 'czech republic', 'gypsies', 'education', '"practical schools" "special schools"']
after:  ['united gipsy crew', 'nebud dilino', 'roma', 'czech republic', 'gypsies', 'education', 'practical schoolsspecial schools'] 

before: ['holocaust', '"march of the living"', 'auschwitz', 'shoa', 'germany', 'nazi', 'world war ii', 'concentration camps', 'history']
after:  ['holocaust', 'march of the living', 'auschwitz', 'shoa', 'germany', 'nazi', 'world war ii', 'concentration camps', 'history'] 

before: ['art', 'artists', 'art market', 'self.marketing', 'art school', 'careers', 'living wage', '"alles für die kunst', '" arte', 'casting shows', 'reality tv']
after:  ['art', 'artists', 'art market', 'selfmarketing

### Cleaning 4: other

In [230]:
items = [i for i,lst_kw in enumerate(keywords_lower_split_clean) if any(list(map(lambda x: x.count(' ')>15, lst_kw)))]

In [231]:
for item in items: 
    print(keywords_lower_split_clean[item])

['as doctors', 'patrick', 'james and steve have little leisure time', "so they're glad when they can watch a little football in peace and quiet at the weekend"]
['gmf14', 'ws1452', 'the power of the neighborhood: how local media organize participation and how dw akademie supports participatory approaches']
['gmf14', 'ws1455', 'co-creating a new multimedia format with audiences – case study of el toque by radio netherlands worldwide']
['de mistura is kerry lavrov lawrow united nations un syria russia us saudi arabia turkey iran qatar geneva peace talks']
['oil glut: efforts to freeze global production unsuccessful - german-israel trade relations – contract extension: zetsche set to remain at daimler three more years']
['germany has agreed to drastically cut subsidies that electricity companies will have to pay for solar power in the last decade', 'power firms have been obliged to pay more than the market rate certain solar power sources']
['tomorrow today', 'deutsche welle', 'space', 'i