# Team DatenWelle

## Keyword merging with FuzzyWuzzy

This notebook loads the data from JSON format, performs some keyword cleaning, and merges misspelled duplicates with the FuzzyWuzzy package.

In [1]:
#!git pull
#!git status
#!git add 1-anya_exploratory_analysis.ipynb
#!git commit -m 'made a set of keywords 2019-2022 and tried fuzzuwuzzy dedupe(licate) function on 10000 keywords'
#!git push
#!pip install -r ../requirements.txt

#after installed new libraries
#!pip freeze > requirements.txt
#!git add requirements.txt 
#!git add out_dedupl_100323.csv test.csv
#!git commit -m 'added output files f fuzzy wuzzy dedupe'
#!git commit -m 'added library fuzzywuzzy'
#!git push 

In [28]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fuzzywuzzy.process import dedupe


In [3]:
# Load the raw CMS export. The context manager closes the file handle as soon as
# parsing is done (the handle was previously left open for the kernel's lifetime).
# JSON interchange files are UTF-8, so the encoding is stated explicitly instead of
# depending on the platform default.
with open('../data/raw/CMS_2010_to_June_2022_ENGLISH.json', encoding='utf-8') as f:
    # json.load returns the parsed JSON document as Python objects
    data = json.load(f)

df = pd.DataFrame.from_dict(data)

#print(df.head())

#### Finding the subset of the data for 1 Jan 2019 - 1 Jan 2022 based on lastModifiedDate

In [73]:
# Sort chronologically. sort_values keeps the original index labels, so from here on
# a row's index label is no longer its position in the frame.
df = df.sort_values(by='lastModifiedDate') #sort dataframe

datetimes = pd.to_datetime(df['lastModifiedDate'])
df['ts_lastModifiedDate']=datetimes
#df.iloc[ts_start]['ts_lastModifiedDate']

# boundaries of the subset 2019-2022 (localized to UTC so they compare against `datetimes`)
subset_from = pd.Timestamp(year=2019, month=1, day=1).tz_localize('utc')
subset_until = pd.Timestamp(year=2022, month=1, day=1).tz_localize('utc')
one_day = pd.Timedelta(days=1)

#find start timestamp for subset 2019-2022: earliest article on 1 Jan 2019
ts_start=datetimes[(datetimes > subset_from) & (datetimes < subset_from + one_day)].min()
print(ts_start)
#find end timestamp for subset 2019-2022: earliest article on 1 Jan 2022 (first one outside the subset)
ts_end=datetimes[(datetimes > subset_until) & (datetimes < subset_until + one_day)].min()
print(ts_end)

start_date=datetimes[datetimes == ts_start]
end_date=datetimes[datetimes == ts_end]

#find index labels for the chosen start and end dates
start_index=start_date.index[0]
print(start_index)

end_index=end_date.index[0]
print(end_index)

# Select the subset by timestamp rather than with df[start_index:end_index]:
# an integer slice inside [] is positional, whereas start_index/end_index are index
# *labels* of the sorted frame, so the slice would not select this date range.
# The interval is half-open: ts_start is included, ts_end is excluded.
df_subset=df[(datetimes >= ts_start) & (datetimes < ts_end)]


2019-01-01 03:57:28.904000+00:00
2022-01-01 02:35:51.098000+00:00
60278
150367


In [74]:
len(df_subset)

33830

### Keywords exploration


In [5]:
df['keywords'] # is keywords in dictionary

0         [{'name': 'DRC'}, {'name': 'M23'}, {'name': 'F...
1         [{'name': 'telephone'}, {'name': 'hotline'}, {...
2         [{'name': 'fiscal cliff'}, {'name': 'Obama'}, ...
3         [{'name': 'Kim Jong Un'}, {'name': 'Kim Jong I...
4         [{'name': 'fiscal cliff'}, {'name': 'US Senate...
                                ...                        
175654    [{'name': 'Turkey'}, {'name': 'Recep Tayipp Er...
175655    [{'name': 'pollution'}, {'name': 'gold mine'},...
175656    [{'name': 'war'}, {'name': 'Ukraine'}, {'name'...
175657    [{'name': 'France'}, {'name': 'vegetarian'}, {...
175658          [{'name': 'Ecuador'}, {'name': 'protests'}]
Name: keywords, Length: 175659, dtype: object

In [6]:
df['keywords'].isna().sum()


0

In [76]:
#create series of keywords sets
def get_keywords(row):
    """Return the set of 'name' values found in one article's keyword list.

    Args:
        row: an iterable of dicts, each providing a 'name' key, or None.

    Returns:
        A set with the 'name' value of every dict in `row` (duplicates collapse),
        or None when `row` is None.
    """
    if row is not None:
        return {entry['name'] for entry in row}
    return None

df['keywords'].apply(get_keywords)

55250     {Nicolas Sarkozy, Martin Schulz, Angela Merkel...
55251     {Berlin, Right Said Fred, 2010, Brandenburg Ga...
55252                 {Ethiopia, Trier, Haile Gebrselassie}
55253     {garmisch-partenkirchen, four hills tournament...
55254     {eduard limonov, moscow helsinski group, natio...
                                ...                        
175606    {Lena Oberdorf, Women's Euro 2022, England, Vi...
175333    {Ferda Ataman, cancel culture, discrimination,...
175380    {French Revolution, stereotype, anti-Judaism, ...
175568    {constitutional rights, menstruation, Roe vs. ...
175179    {Chile, mining, environmental protection, Swed...
Name: keywords, Length: 175659, dtype: object

In [77]:
#extract individual keywords from the series of keyword sets

# Timings recorded with the previous functools.reduce(set.union, sets) version:
# 10000 articles in 7 seconds
# df_subset (90090 articles) runs in 10 minutes 10 seconds
# That version built a brand-new set for every article (set.union returns a new set),
# and `functools` was never imported in this notebook. A single set().union(*...)
# call merges all article sets in one pass and needs no extra import.

# sets=df['keywords'].apply(get_keywords) #full dataset
sets=df_subset['keywords'].apply(get_keywords)  #2019-2021 subset
#sets=sets[0:10000] #10000 articles

# dropna() skips rows for which get_keywords returned None;
# an empty selection yields an empty set instead of raising.
kw=set().union(*sets.dropna())

In [79]:
kw

{'Czechoslovakia',
 'Wa Lone',
 'Sierra Leone',
 'Willem Holleeder',
 'electricity outages',
 'morning sickness',
 'Kenya 2022 election',
 'criminal probe',
 'Anak Krakatau',
 'Caucasus',
 'left and right-wing extremism',
 'Heritage',
 'Algiers Agreement Mali',
 'Landslide',
 'Polar bear',
 'passport',
 'death march',
 'Belarus protests',
 'vaccine patents',
 'Digital Security Act',
 'New York Stock Exchange',
 'Molave',
 'storm surge',
 'geisterspiele',
 'peace summit',
 'African migration',
 'Cold War',
 'Snapseed',
 'black boxes',
 'Office for the Protection of the Constitution',
 'Oaxaca',
 "Germany's Women's",
 'WWI',
 'berentz-karas-sea',
 'magic',
 'James Bond',
 'Saint Bernard',
 'Lee Cheuk-yan',
 'Junge Union',
 'Evelyn Hernandez',
 'writing',
 'SPD leader',
 'BIRN',
 'Sexual harrassment',
 'Chow Hang Tung',
 'hyenas',
 'Sea of Azov',
 'Paolo Gentiloni',
 'user data',
 'Japan labor market',
 'airliner',
 'Vegan',
 'brain scan',
 'European Elections',
 'Right Said Fred',
 'viol

In [24]:
# # another way to extract individual keywords from the series of sets that doesn't crash when the kernel is interrupted
# # so it might be more stable when later applied to the entire dataset

# #runs for 10000 articles in 5 seconds
# #runs for df_subset in  11 min 16 sec 

# sets=df_subset['keywords'].apply(get_keywords)
# #sets=sets[0:10000]
# from tqdm import tqdm
# def get_unique_keywords(sets):
#     result_set = set()
#     for row_set in tqdm(sets.values):
#         #result_set.union(row_set)
#         result_set = result_set.union(row_set)
#     return result_set

# unique_keywords = get_unique_keywords(sets)



100%|██████████| 90090/90090 [11:16<00:00, 133.25it/s] 


In [80]:
# #write the set of DW keywords before fuzzyWuzzy into the file
# pd.Series(list(unique_keywords)).to_csv('../data/interim/out_2019-2021_keywords_before_FuzzyWuzzy.csv')
# pd.Series(list(kw)).to_csv('../data/interim/out_2019-2021_keywords_before_FuzzyWuzzy.csv')

In [82]:
# The dedupe and process.extract cells below read `unique_keywords`; bind it to the
# keyword set `kw` built above so they do not fail with NameError on a fresh kernel.
unique_keywords=kw

In [81]:
# Reload the keyword list saved before deduplication.
# The displayed frame shows the keywords in column '0', next to two saved index columns.
uni_kw=pd.read_csv('../data/interim/out_2019-2021_keywords_before_FuzzyWuzzy.csv')

In [85]:
type(uni_kw)
type(unique_keywords)

set

In [86]:
uni_kw

Unnamed: 0.1,Unnamed: 0,0
0,0,Czechoslovakia
1,1,Wa Lone
2,2,Sierra Leone
3,3,Willem Holleeder
4,4,electricity outages
...,...,...
32699,32699,default
32700,32700,students
32701,32701,DSM
32702,32702,Marium


In [87]:
unique_keywords

{'Czechoslovakia',
 'Wa Lone',
 'Sierra Leone',
 'Willem Holleeder',
 'electricity outages',
 'morning sickness',
 'Kenya 2022 election',
 'criminal probe',
 'Anak Krakatau',
 'Caucasus',
 'left and right-wing extremism',
 'Heritage',
 'Algiers Agreement Mali',
 'Landslide',
 'Polar bear',
 'passport',
 'death march',
 'Belarus protests',
 'vaccine patents',
 'Digital Security Act',
 'New York Stock Exchange',
 'Molave',
 'storm surge',
 'geisterspiele',
 'peace summit',
 'African migration',
 'Cold War',
 'Snapseed',
 'black boxes',
 'Office for the Protection of the Constitution',
 'Oaxaca',
 "Germany's Women's",
 'WWI',
 'berentz-karas-sea',
 'magic',
 'James Bond',
 'Saint Bernard',
 'Lee Cheuk-yan',
 'Junge Union',
 'Evelyn Hernandez',
 'writing',
 'SPD leader',
 'BIRN',
 'Sexual harrassment',
 'Chow Hang Tung',
 'hyenas',
 'Sea of Azov',
 'Paolo Gentiloni',
 'user data',
 'Japan labor market',
 'airliner',
 'Vegan',
 'brain scan',
 'European Elections',
 'Right Said Fred',
 'viol

In [None]:
# sets_10000=sets

### Playing with FuzzyWuzzy

In [None]:
# #function from tutorial to get simplest matching ratio

# Str1 = "Apple Inc."
# Str2 = "apple Inc"
# Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
# print(Ratio)

In [29]:
help(process)

Help on module fuzzywuzzy.process in fuzzywuzzy:

NAME
    fuzzywuzzy.process - # encoding: utf-8

FUNCTIONS
    dedupe(contains_dupes, threshold=70, scorer=<function token_set_ratio at 0x7f06af223e60>)
        This convenience function takes a list of strings containing duplicates and uses fuzzy matching to identify
        and remove duplicates. Specifically, it uses the process.extract to identify duplicates that
        score greater than a user defined threshold. Then, it looks for the longest item in the duplicate list
        since we assume this item contains the most entity information and returns that. It breaks string
        length ties on an alphabetical sort.
        
        Note: as the threshold DECREASES the number of duplicates that are found INCREASES. This means that the
            returned deduplicated list will likely be shorter. Raise the threshold for fuzzy_dedupe to be less
            sensitive.
        
        Args:
            contains_dupes: A list of st

In [61]:
help(fuzz.token_set_ratio)

Help on function token_set_ratio in module fuzzywuzzy.fuzz:

token_set_ratio(s1, s2, force_ascii=True, full_process=True)



In [88]:
## !!VERY SLOW!!!
# Took 186 minutes to run for 10000 articles

# fuzzywuzzy.process.dedupe returns the keywords with fuzzy duplicates removed.
# Its default threshold is 70 (see help(process) above); here a threshold of 90 is passed,
# which per the help text makes the dedupe less sensitive (fewer keywords are merged).
# To explore the similarity score of individual keywords use process.extract,
# e.g. process.extract('angela merkel',unique_keywords,limit=20)

# number of keywords before deduplication
print(len(unique_keywords))
#ded_kw=dedupe(unique_keywords)
ded_kw=dedupe(unique_keywords,threshold=90)
# number of keywords after deduplication
print(len(ded_kw))

#write the deduplicated keywords into the file
#pd.Series(list(ded_kw)).to_csv('../data/interim/out_dedupl_10k_articles_only_100323.csv')
pd.Series(list(ded_kw)).to_csv('../data/interim/out_dedupl_2019-2021_articles_only_100323.csv')


32704


KeyboardInterrupt: 

In [89]:
# Read a previously saved dedupe result from CSV instead of re-running the slow dedupe cell.
# NOTE(review): this loads the 10k-articles output file, not the 2019-2021 file that the
# dedupe cell above writes — confirm this is intended.
ded_kw=pd.read_csv('../data/interim/out_dedupl_10k_articles_only_100323.csv')

In [95]:
deduplicated=set(ded_kw['0'])

In [96]:
deduplicated

{'military mission in Afghanistan',
 "Ya'an",
 'Caucasus',
 'US Germany bilateral relations',
 'Bernd Riegert',
 'prokon',
 'net worth',
 'Daniel Schmidt',
 'President Mahmoud Ahmadinejad',
 'Yuri Andrukhovych',
 'Mario Mandzukic',
 'Maybachufer',
 'capital flight',
 'hausmannsturm',
 'whale mating season',
 'eu executive complaint',
 'pharmacies',
 'planetary defense conference',
 'Mauritius',
 'illegal weapons shipment',
 'Gentleman',
 'Tahir ul Qadri',
 'invasive species',
 'current assessment',
 'kidnappings',
 'Zohib Islam Amiri',
 'Bassem Youssef',
 'Nicosia',
 'Imran Khan',
 'Israeili',
 'wine cellar',
 'goalref',
 'Rafsanjani',
 'diamond heist',
 'low-budget carrier',
 'tragedies',
 'cyber snooping',
 'abdullah azzam brigades',
 'economic institute',
 'Whistleblowers',
 'pictures',
 'YGL',
 'Seibert',
 'Adolf Hitler',
 'Jürgen Klopp',
 'awards ceremonies',
 'Bartholomew',
 'tom donilon',
 'The Nun',
 'graph search',
 'navy arms race',
 'Raquel Amada',
 'conservationists',
 'abd

### Exploring ratio of similarity for individual  keywords

In [49]:
process.extract('angela merkel',unique_keywords,limit=40)


[('Angela Merkel', 100),
 ('angela merkel', 100),
 ('Angela merkel', 100),
 ('AngelaMerkel', 96),
 ('Angela Merel', 96),
 ('Angel Merkel', 96),
 ('AfD Angela Merkel', 95),
 ('Anegla Merkel', 92),
 ('Anglea Merkel', 92),
 ('angele merkel', 92),
 ('Merkel', 90),
 ('angel', 90),
 ('merkel', 90),
 ('angela merkel elections', 90),
 ('merke', 90),
 ('angela', 90),
 ('an', 90),
 ('angela merkel russia', 90),
 ('NG', 90),
 ('la', 90),
 ('GE', 90),
 ('Angela', 90),
 ('Chancellor Angela Merkel', 90),
 ('merk', 90),
 ('me', 90),
 ('chancellor angela merkel', 90),
 ('El', 90),
 ('open letter to Angela Merkel', 90),
 ("Angela Merkel's cell phone", 90),
 ('LA', 90),
 ('merkel third cabinet', 86),
 ('Ingeborg Berggreen-Merkel', 86),
 ('landslide victory for merkel', 86),
 ('Merkel same-sex couples', 86),
 ('M&A', 86),
 ('Angela I', 86),
 ('Altmaier refugee Merkel CDU border Tusk influx Sebastian Conflict Zone coercion EU',
  86),
 ('m&a', 86),
 ('Europäische Union Gipfel Merkel Sarkozy Rettungsfonds'

In [97]:
process.extract('planetary defense conference',unique_keywords,limit=40)


[('plan', 90),
 ('plane', 90),
 ('conference', 90),
 ('planetary defense', 90),
 ('Plane', 90),
 ('defense', 90),
 ('planet', 90),
 ('Conference', 90),
 ('Defense', 90),
 ('Potsdam Conference', 86),
 ('Berlin Conference', 86),
 ('Wannsee Conference', 86),
 ('defense spending', 86),
 ('planetary science', 86),
 ('Climate Conference', 86),
 ('IT-Defense', 86),
 ('donors conference', 86),
 ('missile defense', 86),
 ('Global Investigative Journalism Conference', 86),
 ('Claims Conference', 86),
 ('defense secretary', 86),
 ('Catholic Bishops Conference of the Philippines', 86),
 ('immune defense', 86),
 ('defense ministry', 86),
 ('Defense Ministry', 86),
 ('defense minister', 86),
 ('Yalta Conference', 86),
 ('climate conference', 86),
 ('party conference', 86),
 ('European Defense', 86),
 ('Asia-Pacific Conference of German Business', 86),
 ('conferences', 86),
 ('Conference on Jewish Material Claims Against Germany', 86),
 ('defense budget', 86),
 ('Libya conference', 86),
 ('e-plane', 

In [47]:
process.extract('Chosen Soren',unique_keywords,limit=20)

[('os', 90),
 ('Ho.', 90),
 ('Osen', 90),
 ('ore', 90),
 ('Sen', 90),
 ('SE', 90),
 ('sorben', 82),
 ('Sorgen', 82),
 ('Rehn', 77),
 ('Rügen', 77),
 ('sensors', 77),
 ('Refn', 77),
 ('rena', 77),
 ('Rena', 77),
 ('Sensors', 77),
 ('rehn', 77),
 ('rügen', 77),
 ('rent', 77),
 ('sensor', 75),
 ('censor', 75)]

In [51]:
process.extract('Sex pistols',unique_keywords,limit=20)

[('Sex pistols', 100),
 ('Pis', 90),
 ("'IS'", 90),
 ('Sex', 90),
 ('PiS', 90),
 ('pi', 90),
 ('Pistol', 90),
 ('"IS"', 90),
 ('IS', 90),
 ('sex', 90),
 ('pistol', 90),
 ('PIS', 90),
 ('SE', 90),
 ('st', 90),
 ('St', 90),
 ('same-sex animal couples', 86),
 ('sex between children and adults', 86),
 ('child sex acts online', 86),
 ('Merkel same-sex couples', 86),
 ('same-sex couples. Austria', 86)]

In [53]:
process.extract('UEFA',unique_keywords,limit=40)

[('uefa', 100),
 ('Uefa', 100),
 ('UEFA', 100),
 ('UEFA Euro 2016', 90),
 ('UEFA europa League', 90),
 ('UEFA Champions League', 90),
 ('uefa euro 2012', 90),
 ('UEFA Cup', 90),
 ('UEFA European Championship 2016', 90),
 ('UEFA Nations League', 90),
 ('a+', 90),
 ('UEFA European Championships', 90),
 ('UEFA Youth League', 90),
 ('FA', 90),
 ('UEFA EURO 2016', 90),
 ('UEFA Europa League', 90),
 ('UEFA Super Cup', 90),
 ('UEFA EURO 2012', 90),
 ('UEFA European U-19 Championship', 90),
 ('U', 90),
 ("UEFA Women's Cup", 90),
 ('UEFA SuperCup', 90),
 ('a', 90),
 ('ufa', 86),
 ('EFA', 86),
 ('Kilauea', 77),
 ('Schufa', 77),
 ('schufa', 77),
 ('efsa', 75),
 ('defa', 75),
 ('ECFA', 75),
 ('HEFA', 75),
 ('gefa', 75),
 ('ULFA', 75),
 ('EFSA', 75),
 ('DEFA', 75),
 ('UNEA', 75),
 ('Umea', 75),
 ('Buea', 75),
 ('Yousufzai', 68)]

In [54]:
process.extract('UAE',unique_keywords,limit=40)

[('UAE', 100),
 ('uae', 100),
 ('al Quaeda', 90),
 ('a+', 90),
 ('Quaelgeist', 90),
 ('al-quaeda', 90),
 ('blauaeugig', 90),
 ('U', 90),
 ('UAE football', 90),
 ('Al Quaeda', 90),
 ('a', 90),
 ('quake', 75),
 ('usage', 75),
 ('revue', 72),
 ('Vogue', 72),
 ('Ling Jihua', 72),
 ('reveue', 72),
 ('JetBlue', 72),
 ('revenue', 72),
 ('quechua', 72),
 ('Ayesha Siddiqua', 72),
 ('cheese fondue', 72),
 ('tongue', 72),
 ('Vicente del Bosque', 72),
 ('Bellevue', 72),
 ('sly rogue', 72),
 ('moque', 72),
 ('Linwood mosque', 72),
 ('morgue', 72),
 ('tissue', 72),
 ('Pique', 72),
 ('schloss bellevue', 72),
 ('CrossBlue', 72),
 ('Schloss Bellevue', 72),
 ('Tony Pua', 72),
 ('Xinhua', 72),
 ('papua', 72),
 ('Red Mosque', 72),
 ('dwblue', 72),
 ('technique', 72)]

In [55]:
process.extract('United Arab Emirates',unique_keywords,limit=40)

[('United Arab Emirates', 100),
 ('united arab emirates', 100),
 ('Emirates', 90),
 ('emirates', 90),
 ('MIR', 90),
 ('rat', 90),
 ('ara', 90),
 ('RAB', 90),
 ('united', 90),
 ('Mir', 90),
 ('United', 90),
 ('rates', 90),
 ('arab', 90),
 ('Arab', 90),
 ('rate', 90),
 ('emir', 90),
 ('Emir', 90),
 ('arab emirates', 90),
 ('EMI', 90),
 ('TED', 90),
 ('arab League', 86),
 ('United Party for National Development (UPND).', 86),
 ('United Left', 86),
 ('United Russia', 86),
 ('united fruit', 86),
 ('United Nations Convention to Combat Desertification', 86),
 ('Syria Palmyra Russia Assad Putin Bokova Bulgaria UN United Nations world heritage',
  86),
 ('Arab league', 86),
 ('United Democratic Front against Dictatorship', 86),
 ('arab poetry', 86),
 ('arab protests', 86),
 ('arab spring', 86),
 ('United Nations Security Council', 86),
 ('Sunni Arab', 86),
 ('Arab uprising', 86),
 ('united nations security council', 86),
 ('IRA', 86),
 ("United Nations Population Fund women's initiative", 86),


In [57]:
process.extract('war in Ukraine',unique_keywords,limit=40)

[('Ukraine war', 95),
 ('Wa', 90),
 ('UK', 90),
 ('ar', 90),
 ('RAI', 90),
 ('uk', 90),
 ('Ukrain', 90),
 ('rain', 90),
 ('Ukraine', 90),
 ('Rain', 90),
 ('AR', 90),
 ('war', 90),
 ('ukraine', 90),
 ('Rai', 90),
 ('INE', 90),
 ('War', 90),
 ('Uk', 90),
 ('Prize for Achievement in Africa', 86),
 ('War US', 86),
 ('prime minister in exile', 86),
 ('Willkommen in Deutschland', 86),
 ('in focus', 86),
 ('syria civil war spill over', 86),
 ('German embassy in Sudan', 86),
 ('Civil War', 86),
 ('Max Planck Institute for Polymer Research in Mainz', 86),
 ('the National Institute of Astrophysics in Bologna', 86),
 ('war games', 86),
 ('1965 war', 86),
 ('more men in kindergartens', 86),
 ('mali war', 86),
 ('Afghanistan civil war', 86),
 ('World War', 86),
 ('Immigration in Germany', 86),
 ('correspondents in China', 86),
 ('German construction firm expands operations in Togo', 86),
 ('seeking asylum in Europe', 86),
 ('1967 war', 86),
 ('LGBT conmmunity in Turkey', 86),
 ('Riksbank Prize in E

In [58]:
process.extract('UK',unique_keywords,limit=40)

[('UK', 100),
 ('uk', 100),
 ('Uk', 100),
 ('UK Media', 90),
 ('Arid Uka', 90),
 ('emmanuel mayuka', 90),
 ('Aram Manukyan', 90),
 ('Keukenhof', 90),
 ('aysel tugluk', 90),
 ('kabuki', 90),
 ('Cebio Soukou', 90),
 ('Ukraine military', 90),
 ('Mukhtaran Mai', 90),
 ('Shoukri', 90),
 ('Tukish coup', 90),
 ('UK Labour', 90),
 ('lukas podolski', 90),
 ('Pranab Mukherjee', 90),
 ('Raul Arashukov', 90),
 ('Change UK', 90),
 ('Luke Aikins', 90),
 ('Luka Jovic', 90),
 ('UK Space Agency', 90),
 ('Harajuku', 90),
 ('Ukip', 90),
 ('Duékoué', 90),
 ('Tuku', 90),
 ('ukraine protests', 90),
 ('yasukuni shrine', 90),
 ('BUK', 90),
 ('Putin Yanukovych', 90),
 ('UK house prices', 90),
 ('Ryusuke Sekino', 90),
 ('Lukashenko', 90),
 ('UK. Britain', 90),
 ('Lukas Barfüss', 90),
 ('Yarmuk', 90),
 ('Sam Olukoya', 90),
 ('Jukes', 90),
 ('Lukaschenko', 90)]

In [59]:
process.extract('United Kingdom',unique_keywords,limit=40)

[('United Kingdom', 100),
 ('united kingdom', 100),
 ('united kingom', 96),
 ('United Kingdon', 93),
 ('United Kindgom', 93),
 ('IT', 90),
 ('it', 90),
 ('un', 90),
 ('"It"', 90),
 ('Dom', 90),
 ('Un', 90),
 ('UN', 90),
 ('Reception United Kingdom', 90),
 ('It', 90),
 ('united', 90),
 ('King', 90),
 ('United', 90),
 ('ING', 90),
 ('NG', 90),
 ('Proposed referendum on United Kingdom membership of the European Union',
  90),
 ('. UN', 90),
 ('kingdom', 90),
 ('king', 90),
 ('ed', 90),
 ('TED', 90),
 ('United Party for National Development (UPND).', 86),
 ('United Farm Workers Union', 86),
 ('United Nations Convention to Combat Desertification', 86),
 ('Syria Palmyra Russia Assad Putin Bokova Bulgaria UN United Nations world heritage',
  86),
 ('United Nations in Bonn', 86),
 ('United Democratic Front against Dictatorship', 86),
 ('United States and Europe', 86),
 ('United Technologies Corp', 86),
 ('United Nations Arab League', 86),
 ('The United States of Fear', 86),
 ('United Nations S

In [None]:
# #example from the fuzzywuzzy tutorial on token ratio
# Str1 = "The supreme court case of Nixon vs The United States"
# Str2 = "Nixon v. United States"
# Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
# Partial_Ratio = fuzz.partial_ratio(Str1.lower(),Str2.lower())
# Token_Sort_Ratio = fuzz.token_sort_ratio(Str1,Str2)
# Token_Set_Ratio = fuzz.token_set_ratio(Str1,Str2)
# print(Ratio)
# print(Partial_Ratio)
# print(Token_Sort_Ratio)
# print(Token_Set_Ratio)


### "Putting back" merged clean keywords into the dataframe

In [None]:
# for every keyword in database find matching from the deduplicated list  and substitute