# Team DatenWelle

## Keyword merging with FuzzyWuzzy

This notebook loads the data from JSON format and performs some keyword cleaning and merging misspelled duplicates with fuzzyWuuzy package 

In [1]:
#!git pull
#!git status
#!git add 1-anya_exploratory_analysis.ipynb
#!git commit -m 'made a set of keywords 2019-2022 and tried fuzzuwuzzy dedupe(licate) function on 10000 keywords'
#!git push
#!pip install -r ../requirements.txt

#after installed new libraries
#!pip freeze > requirements.txt
#!git add requirements.txt 
#!git add out_dedupl_100323.csv test.csv
#!git commit -m 'added output files f fuzzy wuzzy dedupe'
#!git commit -m 'added library fuzzywuzzy'
#!git push 

In [11]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fuzzywuzzy.process import dedupe
import functools


In [3]:
# Opening JSON file
f = open('../data/raw/CMS_2010_to_June_2022_ENGLISH.json')
  
# returns JSON object as 
# a dictionary
data = json.load(f)

df = pd.DataFrame.from_dict(data)

#print(df.head())

#### Finding the subset of the data for 1 Jan 2019 - 1 Jan 2020 based on lastModifiedDate

In [4]:
df = df.sort_values(by='lastModifiedDate') #sort dataframe

datetimes = pd.to_datetime(df['lastModifiedDate'])
df['ts_lastModifiedDate']=datetimes
#df.iloc[ts_start]['ts_lastModifiedDate']

#find start index for subset 2019-2022
ts_start=datetimes[(datetimes > pd.Timestamp(year=2019, month=1, day=1).tz_localize('utc')) 
          & (datetimes < pd.Timestamp(year=2019, month=1, day=2).tz_localize('utc'))].min()
print(ts_start)
#find end date for subset 2019-2022
ts_end=datetimes[(datetimes > pd.Timestamp(year=2022, month=1, day=1).tz_localize('utc')) 
          & (datetimes < pd.Timestamp(year=2022, month=1, day=2).tz_localize('utc'))].min()
print(ts_end)

start_date=datetimes[datetimes == ts_start]
end_date=datetimes[datetimes == ts_end]

#find index for the chosen start and end dates
start_index=start_date.index[0]
print(start_index)
df[df.index == start_date.index[0]]

end_index=end_date.index[0]
print(end_index)
df[df.index == end_date.index[0]]

df_subset=df[start_index:end_index]


2019-01-01 03:57:28.904000+00:00
2022-01-01 02:35:51.098000+00:00
60278
150367


In [5]:
#df_subset=df_subset[:100]

In [6]:
len(df_subset)

100

### Keywords exploration


In [7]:
df['keywords'] # is keywords in dictionary

55250     [{'name': 'Lisbon Treaty'}, {'name': 'European...
55251     [{'name': 'Berlin'}, {'name': 'New Year'}, {'n...
55252     [{'name': 'Haile Gebrselassie'}, {'name': 'Eth...
55253     [{'name': 'four hills tournament'}, {'name': '...
55254     [{'name': 'human rights'}, {'name': 'demonstra...
                                ...                        
175606    [{'name': 'Women's Euro 2022'}, {'name': 'Ada ...
175333    [{'name': 'Ferda Ataman'}, {'name': 'Federal A...
175380    [{'name': 'documentary'}, {'name': 'hate'}, {'...
175568    [{'name': 'abortion'}, {'name': 'Roe vs. Wade'...
175179    [{'name': 'documentary'}, {'name': 'pollution'...
Name: keywords, Length: 175659, dtype: object

In [8]:
df['keywords'].isna().sum()


0

In [9]:
#create series of keywords sets
def get_keywords(row):
    if row is None:
        return None
    else:
        res_set = set()
        for name_dict in row:
            res_set.add(name_dict['name'])
        return res_set

df['keywords'].apply(get_keywords)

55250     {Angela Merkel, Frederik Reinfeld, Herman Van ...
55251     {2010, Berlin, Brandenburg Gate, New Year, Rig...
55252                 {Haile Gebrselassie, Ethiopia, Trier}
55253     {werner schuster, disqualified, simon ammann, ...
55254     {national bolshevik party, sakharov prize, edu...
                                ...                        
175606    {Vivianne Miedema, Ada Hegerberg, England, Wom...
175333    {discrimination, racism, cancel culture, Feder...
175380    {French Revolution, antisemitism, stereotype, ...
175568    {Roe vs. Wade, data protection, abortion, heal...
175179    {environmental protection, Arica, pollution, C...
Name: keywords, Length: 175659, dtype: object

In [12]:
#extract individual keywords from the sets of sets

# should work but it is very slow for now with current gpus
# 10000 articles in 7 seconds
# df_subset (90090 articles) runs in 10 minutes 10 seconds

# sets=df_subset['keywords'].apply(get_keyword1) #full dataset
sets=df_subset['keywords'].apply(get_keywords)  #2019-2021 subset
#sets=sets[0:10000] #10000 articles
 
kw=functools.reduce(set.union, sets)

In [13]:
kw

{' Turkey',
 '"Islamic State"',
 'AFD',
 'Abiy Ahmed',
 'Accra',
 'Africa',
 'Alexandre Benalla',
 'All Top Stories and News Updates',
 'Alternative for Germany',
 'Annette Widmann-Mauz',
 'Apple',
 'Arab League',
 'Asia',
 'Asian Cup',
 'Awami League',
 'Ayatollah Khamenei',
 'Ayyappa',
 'BJP',
 'BNP',
 'Baltics',
 'Bangladesh',
 'Basel Institute of Commons and Economics',
 'Beijing',
 'Benjamin Netanyahu',
 'Bennu',
 'Berlin',
 'Borussia Dortmund',
 'Bosnia',
 'Bottrop',
 'Brazil',
 'Bucharest',
 'Bundesliga',
 'Business',
 'Chelsea FC',
 'Chicago economics',
 'China',
 'Chiwenga',
 'Christian Pulisic',
 'Christianity',
 'Christmas',
 'Comedy Show',
 'Congress',
 'Crown Prince Mohammed bin Salman',
 'Cuba',
 'Cuban Revolution',
 'Culture & Lifestyle',
 'DR Congo',
 'DW Akademie',
 'David',
 'David Dragicevic',
 'Davor Dragicevic',
 'Democratic Republic of Congo',
 'Denmark',
 'Der Nazi & der Friseur',
 'Dogon',
 'Donald Trump',
 'Drasko Stanivukovic',
 'ECB',
 'EU',
 'Eden Hazard',
 

In [14]:
# # another way to extract individual keywords from the sets of sets that doesn't crash kernel is interrupted
# # so it might be more stable when later applied to the entire dataset

# #runs for 10000 articles in 5 seconds
# #runs for df_subset in  11 min 16 sec 

# sets=df_subset['keywords'].apply(get_keywords)
# #sets=sets[0:10000]
# from tqdm import tqdm
# def get_unique_keywords(sets):
#     result_set = set()
#     for row_set in tqdm(sets.values):
#         #result_set.union(row_set)
#         result_set = result_set.union(row_set)
#     return result_set

# unique_keywords = get_unique_keywords(sets)



In [15]:
#write the set of DW keywords before fuzzyWuzzy into the file
pd.Series(list(unique_keywords)).to_csv('../data/interim/out_2019-2021_keywords_before_FuzzyWuzzy.csv')
pd.Series(list(kw)).to_csv('../data/interim/out_2019-2021_keywords_before_FuzzyWuzzy.csv')

In [16]:
#unique_keywords=kw

In [17]:
#load set of DW keywords before fuzzyWuzzy into the file
uni_kw=pd.read_csv('../data/interim/out_2019-2021_keywords_before_FuzzyWuzzy.csv') 

In [19]:
unique_keywords=set(uni_kw['0'])

In [20]:
unique_keywords

{'Zoji La tunnel',
 'DSA',
 'the CDU',
 'Wadden Sea',
 'Colin Davidson',
 'Jasper Johns',
 'Paul Ryan',
 'Hong Kong autonomy',
 'FinFisher',
 'Stan Lee',
 'gastronomy',
 'second Trump impeachment',
 'Steven Walter',
 'IGA',
 'SARS-CoV',
 'FYR',
 'astronomers',
 'Balloon ban',
 'John Pomber Magufuli',
 'tariffs',
 'Sulawesi',
 'German tradition',
 'German police',
 'carbon capture',
 'AFC',
 'Jupiter',
 'MP-40',
 'Reichstag',
 'Civil Alliance',
 'green energy',
 'Kate Connolly',
 'Frankfurt Airport',
 'Maharastra',
 'Emomali Rakhmon',
 'Starbucks',
 'vandana shiva',
 'Takis Würger',
 'charity concert',
 'monsoon',
 'Johan Gudenus',
 'Burhan Kesici',
 'sex work',
 'Bhashan Char',
 'ride-sharing',
 'Iran Revolutionary Guards',
 'single-use products',
 'crafts',
 'Charli Shield',
 'Mohammad Zarif',
 'ecdysterone',
 'embroidery',
 'Chinese Communist Party',
 'Diavata',
 'Tom Zé',
 'Privacy Shield',
 'Scarlett Johansson',
 'living',
 'Entwicklung',
 'International Labour Organization',
 'fir

In [21]:
# sets_10000=sets

### Playing with FuzzyWuzzy

In [22]:
# #function from tutorial to get simplest matching ratio

# Str1 = "Apple Inc."
# Str2 = "apple Inc"
# Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
# print(Ratio)

In [23]:
help(process)

Help on module fuzzywuzzy.process in fuzzywuzzy:

NAME
    fuzzywuzzy.process - # encoding: utf-8

FUNCTIONS
    dedupe(contains_dupes, threshold=70, scorer=<function token_set_ratio at 0x7fa323279440>)
        This convenience function takes a list of strings containing duplicates and uses fuzzy matching to identify
        and remove duplicates. Specifically, it uses the process.extract to identify duplicates that
        score greater than a user defined threshold. Then, it looks for the longest item in the duplicate list
        since we assume this item contains the most entity information and returns that. It breaks string
        length ties on an alphabetical sort.
        
        Note: as the threshold DECREASES the number of duplicates that are found INCREASES. This means that the
            returned deduplicated list will likely be shorter. Raise the threshold for fuzzy_dedupe to be less
            sensitive.
        
        Args:
            contains_dupes: A list of st

In [24]:
help(fuzz.token_set_ratio)

Help on function token_set_ratio in module fuzzywuzzy.fuzz:

token_set_ratio(s1, s2, force_ascii=True, full_process=True)



In [25]:
## !!VERY SLOW!!! 
# Took 186 minutes to run for 10000 articles

#fuzzy.process.dedupe function returns a list without duplicates. by default it is using 70% similarity ratio
#to explore similarity ratio for individual words use fuzzy.process.extract i.e. process.extract('angela merkel',unique_keywords,limit=20)

print(len(unique_keywords))
#ded_kw=dedupe(unique_keywords)
ded_kw=dedupe(unique_keywords,threshold=90)
print(len(ded_kw))

#write the deduplicated keywords into the file
#pd.Series(list(ded_kw)).to_csv('../data/interim/out_dedupl_10k_articles_only_100323.csv')
pd.Series(list(ded_kw)).to_csv('../data/interim/out_dedupl_2019-2021_articles_only_100323.csv')


In [27]:
# #read from csv isntead of running DEDUP
#ded_kw=pd.read_csv('../data/interim/out_dedupl_10k_articles_only_100323.csv')
ded_kw=pd.read_csv('../data/interim/out_dedupl_2019-2021_articles_only_100323.csv')

In [28]:
deduplicated=set(ded_kw['0'])

In [29]:
deduplicated

{'pension freeze',
 'cybercrime',
 'Guillermo',
 'Nevsehir',
 'Nikos Michaloliakos',
 'Mount Everest',
 'Islamist Ennahda Party',
 'MILF',
 'hoax ap',
 'global monitoring report',
 'Kaiser',
 'Michael Ignatieff',
 'german business in russia',
 'Frelimo',
 'Bernd Neumann',
 'Superheroes',
 'temperatures',
 'hidir Karul',
 'Brazilians',
 'german trains',
 'Child abuse scandals',
 'shark shield',
 'purchase of chrysler',
 'Rolf Rische',
 'Pflug',
 'Sebastian dittmann',
 'Franziska Badenschier',
 'Bettina Wordlaw',
 'aufpASSEn',
 'Tony Cordesman',
 'environmentalists',
 'sequester',
 'Lollipop',
 'Strive Masiyiwa',
 'Shin Dong Hyuk',
 'lindsay graham',
 'Kylie Minogue',
 'Spanish unemployed',
 'Helicopter',
 'strip searched',
 'fresh borrowing',
 'Gelsenkirchen',
 'green energy',
 'households',
 'About My DW',
 'Documentaries and Reports',
 'business sentiment',
 'ArcelorMittal',
 'Instagram',
 'cross-border taxation faults',
 'Candolfo',
 'SP1320',
 'The Never Ending Energy Source',
 'chi

### Exploring ratio of similarity for individual  keywords

In [30]:
process.extract('angela merkel',unique_keywords,limit=40)


[('Angela Merkel', 100),
 ('angela merkel', 100),
 ('ANgela Merkel', 100),
 ('Angela merkel', 100),
 ('Angel Merkel', 96),
 ('Angela Merel', 96),
 ('AfD Angela Merkel', 95),
 ('Merkel', 90),
 ('merkel', 90),
 ('LA', 90),
 ('Chancellor Angela Merkel', 90),
 ('German Chancellor Angela Merkel', 90),
 ('M&A', 86),
 ('Angela I', 86),
 ('Macron; Merkel; Kurz; Austria', 86),
 ("Merkel's Era: The Women of Power", 86),
 ('Angela Kerber', 77),
 ('Kiel', 77),
 ('Angola', 75),
 ('Mekele', 75),
 ('Chancellor Merkel', 73),
 ('Merck', 72),
 ('blame', 72),
 ('Merkel-Plan', 71),
 ('Merkel shaking', 70),
 ('Mekelle', 69),
 ('Daniel Bekele', 69),
 ('SMER', 68),
 ('Lamu', 68),
 ('Erez', 68),
 ('Gera', 68),
 ('Elsa', 68),
 ('Rage', 68),
 ('lamp', 68),
 ('Kela', 68),
 ('EKRE', 68),
 ('name', 68),
 ('MERS', 68),
 ('Ger a', 68),
 ('Angelica Ammar', 67)]

In [31]:
process.extract('planetary defense conference',unique_keywords,limit=40)


[('plane', 90),
 ('planet', 90),
 ('Defense', 90),
 ('defense', 90),
 ('Conference', 90),
 ('conference', 90),
 ('plan', 90),
 ('Plane', 90),
 ('planetary defense', 90),
 ('e-plane', 86),
 ('Climate Conference', 86),
 ('European defense', 86),
 ('activist defense', 86),
 ('conferences', 86),
 ('Defense spending', 86),
 ('EU defense', 86),
 ('Berlin Conference', 86),
 ('defense deals', 86),
 ('European Defense', 86),
 ('donors conference', 86),
 ('defense sector', 86),
 ('air defense', 86),
 ('defense minister', 86),
 ('Conference on Jewish Material Claims Against Germany', 86),
 ('Defense Ministry', 86),
 ('defense spending', 86),
 ('self-defense', 86),
 ('Asia-Pacific Conference of German Business', 86),
 ('US defense deal', 86),
 ('Tory conference', 86),
 ('Claims Conference', 86),
 ('defense system', 86),
 ('party conference', 86),
 ('missile defense', 86),
 ('Global Investigative Journalism Conference', 86),
 ('immune defense', 86),
 ('IT-Defense', 86),
 ('press conference', 86),
 

In [32]:
process.extract('Chosen Soren',unique_keywords,limit=20)

[('ore', 90),
 ('Soren Kragh Andersen', 86),
 ('Orlen', 80),
 ('rent', 77),
 ('Sensors', 77),
 ('sensors', 77),
 ('Rügen', 77),
 ('sensor', 75),
 ('shore', 72),
 ('Chios', 72),
 ('house', 72),
 ('store', 72),
 ('open source', 70),
 ('CEOs', 68),
 ('Cows', 68),
 ('Homs', 68),
 ('hope', 68),
 ('Oran', 68),
 ('PSOE', 68),
 ('Host', 68)]

In [33]:
process.extract('Sex pistols',unique_keywords,limit=20)

[("'IS'", 90),
 ('pistol', 90),
 ('PiS', 90),
 ('sex', 90),
 ('IS', 90),
 ('Sex', 90),
 ('sex worker registration', 86),
 ('child sex abuse ring', 86),
 ('sex toy', 86),
 ('same-sex relationships', 86),
 ('gay sex', 86),
 ('same-sex partnerships', 86),
 ('sex abuse scandals', 86),
 ('same-sex marriage', 86),
 ('same-sex animal couples', 86),
 ('sex abuse scandal', 86),
 ('Sex and Consequences', 86),
 ('tolls', 80),
 ('tools', 80),
 ('Olso', 77)]

In [34]:
process.extract('UEFA',unique_keywords,limit=40)

[('UEFA', 100),
 ('UEFA Super Cup', 90),
 ('UEFA Conference League', 90),
 ('FA', 90),
 ('UEFA Champions League', 90),
 ('UEFA Europa League', 90),
 ('UEFA; Europa Conference League', 90),
 ('UEFA Nations League', 90),
 ('UFA', 86),
 ('Kilauea', 77),
 ('DEFA', 75),
 ('EFSA', 75),
 ('UNEA', 75),
 ('"The Earth Is Blue as an Orange"', 68),
 ('Glenfarclas', 68),
 ('trade fair', 68),
 ('Charta der Vielfalt', 68),
 ('Buena Vista Social Club', 68),
 ('Oriol Junqueras', 68),
 ('debt default', 68),
 ('Belfast Agreement', 68),
 ('radio interface', 68),
 ('Yanis Varoufakis', 68),
 ('Gartenzwergmanufaktur', 68),
 ('EU farm subsidies', 68),
 ('Venezuela elections', 68),
 ('German Animal Welfare Federation', 68),
 ('Haute Couture Fashion', 68),
 ('Docuemantaries', 68),
 ('Animal Welfare Party', 68),
 ('synagogue attack', 68),
 ('Nasser Zefzafi', 68),
 ('sustainable fashion', 68),
 ('Cyclone Fani', 68),
 ('Verfassungsschutz', 68),
 ('Ufa-Palast am Zoo', 68),
 ('bluefin tuna', 68),
 ('trade fairs', 68

In [35]:
process.extract('UAE',unique_keywords,limit=40)

[('UAE', 100),
 ('Israel-UAE deal', 90),
 ('UAE football', 90),
 ('quake', 75),
 ('Vogue', 72),
 ('dengue', 72),
 ('JetBlue', 72),
 ('ad revenue', 72),
 ('Mu Sochua', 72),
 ('West Papua', 72),
 ('Cologne mosque', 72),
 ('Rescue', 72),
 ('Mosque', 72),
 ('mother tongue', 72),
 ('Nicaragua', 72),
 ('rescue', 72),
 ('Al Noor mosque', 72),
 ('Bonoua', 72),
 ('Schloss Bellevue', 72),
 ('tortue', 72),
 ('Corentin Kohoue', 72),
 ('revenue', 72),
 ('mosque', 72),
 ('technique', 72),
 ('cheese fondue', 72),
 ('Al Noor Mosque', 72),
 ('Linwood Mosque', 72),
 ('prorogue', 72),
 ('"Deep Blue"', 72),
 ('Xinhua', 72),
 ('Kasiva Mutua', 72),
 ('Tulua', 72),
 ('T.B Joshua', 72),
 ('Linwood mosque', 72),
 ('Anthony Joshua', 72),
 ('Papua', 72),
 ('rescue mission', 68),
 ('Ligue 1', 68),
 ('overdue books', 68),
 ('blue angel', 68)]

In [36]:
process.extract('United Arab Emirates',unique_keywords,limit=40)

[('United Arab Emirates', 100),
 ('Emir', 90),
 ('arab', 90),
 ('emirate', 90),
 ('uni', 90),
 ('Emirates', 90),
 ('rat', 90),
 ('emir', 90),
 ('Arab Spring', 86),
 ('United Constitutional Patriots', 86),
 ('Ahwazi Arab', 86),
 ('United Nations Development Programme', 86),
 ('United States Department of Justice', 86),
 ('Arab music', 86),
 ('Arab spring', 86),
 ('arab world', 86),
 ('Arab women', 86),
 ('United Nations Security Council', 86),
 ('united states', 86),
 ('United States', 86),
 ('United Front of Ethiopian Federalist and Confederalist Forces', 86),
 ('United Malays National Organization', 86),
 ('Kremlin United States World War II Collective memory', 86),
 ('Arab world', 86),
 ('United Nations General Assembly', 86),
 ('United Nations Environment Programme', 86),
 ('United Nations Office on Drugs and Crime (UNODC)', 86),
 ('United Nations Human Rights Council', 86),
 ('Arab League', 86),
 ('United Nation', 86),
 ('Arab Cup', 86),
 ('Sahrawi Arab Democratic Republic', 86),
 

In [37]:
process.extract('war in Ukraine',unique_keywords,limit=40)

[('Ukraine war', 95),
 ('rain', 90),
 ('ukraine', 90),
 ('Ukraine', 90),
 ('UK', 90),
 ('war', 90),
 ('uk', 90),
 ('Uk', 90),
 ('AR', 90),
 ('Rain', 90),
 ('War', 90),
 ('cultural policy in Germany', 86),
 ('Jewish life in Germany', 86),
 ('Check-in', 86),
 ('ai', 86),
 ('Germans living abroad in EU', 86),
 ('Renegades: Born in the USA', 86),
 ('boer war', 86),
 ('Coronavirus latest: Surge in South Africa fatalities', 86),
 ('Homosexuality in Africa', 86),
 ('trade war', 86),
 ('dirty war', 86),
 ('Ukraine Airlines crash', 86),
 ('Al-Qaida in the Islamic Maghreb', 86),
 ('Grand Egyptian Museum in Cairo', 86),
 ('gang war', 86),
 ('Iraq war', 86),
 ('Centre for Human Rights and Democracy in Africa', 86),
 ('Central Council of Muslims in Germany', 86),
 ('Libya war', 86),
 ('in-vitro fertilization', 86),
 ('Once Upon a Time ... in Hollywood', 86),
 ('war zones', 86),
 ('bed-in', 86),
 ('women war photographers', 86),
 ('African women in power', 86),
 ('building boom in China', 86),
 ('co

In [38]:
process.extract('UK',unique_keywords,limit=40)

[('UK', 100),
 ('uk', 100),
 ('Uk', 100),
 ('Mustafa Farouk', 90),
 ('fukushima', 90),
 ('Teemu Pukki', 90),
 ('ukraine', 90),
 ('UK migrants', 90),
 ('UK protest law', 90),
 ('Change UK', 90),
 ('Nayib Bukele', 90),
 ('Storm Pabuk', 90),
 ('BUK', 90),
 ('Ukraine', 90),
 ('Neukölln', 90),
 ('Ukraine sailors', 90),
 ('UK economy', 90),
 ('UK election', 90),
 ('wassim mukdad', 90),
 ('Lukas Reichel', 90),
 ('Lukas Podolski', 90),
 ('Ukranie', 90),
 ('Ruslan Arashukov', 90),
 ('Tuku', 90),
 ('eastern Ukraine', 90),
 ('ukraine tensions', 90),
 ('Yasukuni Shrine', 90),
 ('Russia Ukraine', 90),
 ('Hannukah', 90),
 ('UK variant', 90),
 ('Lukas Hradecky', 90),
 ('UK-EU trade', 90),
 ('Tashi Wangchuk', 90),
 ('coronavirus UK', 90),
 ('UK politics', 90),
 ('Hajooj Kuka', 90),
 ('Ukrainian flight', 90),
 ('Pavan Sukhdev', 90),
 ('UK court', 90),
 ('Oliver Mtukudzi', 90)]

In [39]:
process.extract('United Kingdom',unique_keywords,limit=40)

[('United Kingdom', 100),
 ('ING', 90),
 ('IT', 90),
 ('UN', 90),
 ('Reception United Kingdom', 90),
 ('uni', 90),
 ('king', 90),
 ('Un', 90),
 ('OM', 90),
 ('King', 90),
 ('United States of America', 86),
 ('United Constitutional Patriots', 86),
 ('United Nations Development Programme', 86),
 ('United Technologies Corp', 86),
 ('United States Department of Justice', 86),
 ('United Nations Security Council', 86),
 ('United Front of Ethiopian Federalist and Confederalist Forces', 86),
 ('United Malays National Organization', 86),
 ('Kremlin United States World War II Collective memory', 86),
 ('United States Postal Service', 86),
 ('United Nations General Assembly', 86),
 ('United Nations Environment Programme', 86),
 ('Kingdom Assembly of Iran', 86),
 ('United Nations Office on Drugs and Crime (UNODC)', 86),
 ('United States Stock Exchange', 86),
 ('United National Movement', 86),
 ('United Nations Human Rights Council', 86),
 ('United Nations Refugee Agency', 86),
 ('united nations de

In [40]:
# #example from the fuzzywuzzy tutorial on token ratio
# Str1 = "The supreme court case of Nixon vs The United States"
# Str2 = "Nixon v. United States"
# Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
# Partial_Ratio = fuzz.partial_ratio(Str1.lower(),Str2.lower())
# Token_Sort_Ratio = fuzz.token_sort_ratio(Str1,Str2)
# Token_Set_Ratio = fuzz.token_set_ratio(Str1,Str2)
# print(Ratio)
# print(Partial_Ratio)
# print(Token_Sort_Ratio)
# print(Token_Set_Ratio)


### "Putting back" merged clean keywords into the dataframe

In [41]:
list_kw=list(unique_keywords)[0]

In [42]:
df_subset['keywordStrings'][0]

['NASA', 'OSIRIS-REx', 'Bennu', 'asteroid']

In [43]:
# i=0
# len(df_subset['keywordStrings'][i])

4

In [72]:
# #n=0
# df_subset['keywordStrings'][i][n]

'OSIRIS-REx'

In [126]:
# 1) loop over each line in dataframe
# 2) loop over each keyword in the line
# 3) find process.extractOne a substitute from deduplicated list
# 4) create a new column in dataframe with merged keywords


def put_clean_kw_into_df(dataframe):
    i = 0
    sample_line = []
    sample_ratio = []
    while i <= len(dataframe): # 10:
        print(i)
        n = 0
        line_wr = []
        ratio_line_wr = []
        #print(line_wr)
        while n < len(dataframe[i]):
            #print(process.extractOne(df_subset['keywordStrings'][i][n],deduplicated)) #print word and ratio 
            line_wr.append(process.extractOne(dataframe[i][n], deduplicated)[0])
            ratio_line_wr.append(process.extractOne(dataframe[i][n], deduplicated)[1])
            n = n+1

        sample_line.append(line_wr)
        sample_ratio.append(ratio_line_wr)

        i = i+1
    return sample_line
        
    
df_lines=put_clean_kw_into_df(df_subset['keywordStrings'])



0
1


In [127]:
print(lines)



In [115]:
df_subset['keywordStrings'][i]


['asteroid', 'asteroid', 'Bennu', 'asteroid']

In [113]:
#figure out list comprehension later
# i=0
# def put_clean_kw_into_df(dataframe):
#     return [process.extractOne(dataframe[n], deduplicated)[0] for dataframe[n] in dataframe]

# put_clean_kw_into_df(df_subset['keywordStrings'][i])

 'Bannu',

In [82]:
# line_wr.append(str(word_ratio_line[0]))

AttributeError: 'NoneType' object has no attribute 'append'

In [67]:
i

0

In [65]:
print(line)

['travelling in Asia', 'bangladesh death sentence', 'EU elections', 'Delwar Hossain Sayedee', 'Sheikh Hasina']


In [61]:
word_ratio_line



In [57]:
word_ratio

('Mohammad Javad Zarif', 90)

In [None]:
df_subset['cleaned_keywordStrings']=word_ratio[0]
df_subset['cleaned_sim_ratio_keywordStrings']=word_ratio[1]

In [None]:
df_subset['cleaned_keywordStrings'][i][n]=word_ratio[0]
df_subset['cleaned_sim_ratio_keywordStrings'][i][n]=word_ratio[1]

In [54]:
df_subset['cleaned_keywordStrings'][0][0]='lalal'

KeyError: 'cleaned_keywordStrings'

In [47]:
n

0