# Team DatenWelle

## Keyword merging with FuzzyWuzzy

This notebook loads the data from a JSON file, performs some keyword cleaning, and merges misspelled duplicate keywords with the FuzzyWuzzy package.

In [1]:
#!git pull
#!git status
#!git add 1-anya_exploratory_analysis.ipynb
#!git commit -m 'made a set of keywords 2019-2022 and tried fuzzuwuzzy dedupe(licate) function on 10000 keywords'
#!git push
#!pip install -r ../requirements.txt

#after installed new libraries
#!pip freeze > requirements.txt
#!git add requirements.txt 
#!git add out_dedupl_100323.csv test.csv
#!git commit -m 'added output files f fuzzy wuzzy dedupe'
#!git commit -m 'added library fuzzywuzzy'
#!git push 

In [28]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fuzzywuzzy.process import dedupe


In [3]:
# Load the raw CMS export. The context manager closes the file handle even
# if parsing fails; the encoding is pinned because JSON text is UTF-8 and
# the platform default encoding may differ.
with open('../data/raw/CMS_2010_to_June_2022_ENGLISH.json', encoding='utf-8') as f:
    # json.load returns the parsed JSON document as Python objects
    data = json.load(f)

# Build the working DataFrame from the parsed JSON
df = pd.DataFrame.from_dict(data)

#print(df.head())

#### Finding the subset of the data for 1 Jan 2019 - 1 Jan 2022 based on lastModifiedDate

In [4]:
# Parse the modification timestamps once and keep them on the frame.
datetimes = pd.to_datetime(df['lastModifiedDate'])
df['ts_lastModifiedDate'] = datetimes


def first_timestamp_on(day):
    """Return the earliest timestamp in `datetimes` that falls on `day` (UTC).

    The window is open on both ends: it starts strictly after midnight of
    `day` and ends strictly before midnight of the following day.

    Args:
        day: date string such as '2019-01-01', interpreted as UTC.

    Returns:
        The minimum timestamp inside the window.

    Raises:
        ValueError: if no timestamp falls inside the window.
    """
    window_start = pd.Timestamp(day, tz='UTC')
    window_end = window_start + pd.Timedelta(days=1)
    in_window = datetimes[(datetimes > window_start) & (datetimes < window_end)]
    if in_window.empty:
        # Fail here with a clear message instead of an IndexError further down.
        raise ValueError(f'no lastModifiedDate found on {day} (UTC)')
    return in_window.min()


# find the boundary timestamps for the subset 2019-2022
ts_start = first_timestamp_on('2019-01-01')
print(ts_start)
ts_end = first_timestamp_on('2022-01-01')
print(ts_end)

start_date = datetimes[datetimes == ts_start]
end_date = datetimes[datetimes == ts_end]

# find the index of the first row carrying each boundary timestamp
start_index = start_date.index[0]
print(start_index)

end_index = end_date.index[0]
print(end_index)

# Slice between the two boundary rows (the end row itself is excluded).
# NOTE(review): this assumes rows are ordered by lastModifiedDate and that df
# has the default 0..n-1 index, so labels equal positions — confirm.
df_subset = df[start_index:end_index]


2019-01-01 03:57:28.904000+00:00
2022-01-01 02:35:51.098000+00:00
60278
150367


### Keywords exploration


In [5]:
df['keywords'] # each row holds a list of {'name': <keyword>} dicts

0         [{'name': 'DRC'}, {'name': 'M23'}, {'name': 'F...
1         [{'name': 'telephone'}, {'name': 'hotline'}, {...
2         [{'name': 'fiscal cliff'}, {'name': 'Obama'}, ...
3         [{'name': 'Kim Jong Un'}, {'name': 'Kim Jong I...
4         [{'name': 'fiscal cliff'}, {'name': 'US Senate...
                                ...                        
175654    [{'name': 'Turkey'}, {'name': 'Recep Tayipp Er...
175655    [{'name': 'pollution'}, {'name': 'gold mine'},...
175656    [{'name': 'war'}, {'name': 'Ukraine'}, {'name'...
175657    [{'name': 'France'}, {'name': 'vegetarian'}, {...
175658          [{'name': 'Ecuador'}, {'name': 'protests'}]
Name: keywords, Length: 175659, dtype: object

In [6]:
# Count rows whose keywords entry is missing
df['keywords'].isna().sum()


0

In [14]:
#create series of keywords sets
def get_keywords(row):
    """Collect the keyword names of one article into a set.

    Args:
        row: an iterable of dicts that each have a 'name' key, or a missing
            value (None or float NaN).

    Returns:
        A set with the 'name' value of every dict in `row` (empty for an
        empty list), or None when `row` is missing.
    """
    # pandas marks missing cells with NaN as well as None; NaN is the only
    # float that is not equal to itself.
    if row is None or (isinstance(row, float) and row != row):
        return None
    return {name_dict['name'] for name_dict in row}

# Preview: turn each article's keyword dicts into a set of keyword names
# (the result is only displayed here, not stored).
df['keywords'].apply(get_keywords)

0         {FDLR, Rwanda, Susan Rice, UN security council...
1         {federal government, telephone, Catholic Churc...
2         {spending cuts, fiscal cliff, debt ceiling, Ob...
3         {pyongyang, south korea, Kim Jong Il, Seoul, l...
4         {fiscal cliff bill, recession, fiscal cliff, S...
                                ...                        
175654    {Finland Sweden, NATO, Turkey, Recep Tayipp Er...
175655     {Anagold, cyanide, Turkey, pollution, gold mine}
175656    {call-up, draft, Russian attack, Russia, speci...
175657                 {France, vegetarian, steak, sausage}
175658                                  {protests, Ecuador}
Name: keywords, Length: 175659, dtype: object

In [20]:
# Extract the individual (unique) keywords from the per-article keyword sets.
#
# The earlier functools.reduce(set.union, sets) version was very slow
# (10000 articles in 7 seconds, df_subset with 90090 articles in 10 minutes
# 10 seconds): set.union returns a new set on every call, so the growing
# accumulator was copied once per article. It also relied on functools,
# which this notebook never imports.

# sets=df['keywords'].apply(get_keywords) #full dataset
sets=df_subset['keywords'].apply(get_keywords)  #2019-2021 subset
#sets=sets[0:10000] #10000 articles

# One n-ary union builds the result set a single time. dropna() skips
# articles for which get_keywords returned None.
unique_keywords = set().union(*sets.dropna())

# Later cells read `unique_keywords`; `kw` is kept as an alias of the
# original result name.
kw = unique_keywords

In [24]:
# # Another way to extract individual keywords from the sets of sets; it doesn't crash when the kernel is interrupted,
# # so it might be more stable when later applied to the entire dataset

# #runs for 10000 articles in 5 seconds
# #runs for df_subset in  11 min 16 sec 

# sets=df_subset['keywords'].apply(get_keywords)
# #sets=sets[0:10000]
# from tqdm import tqdm
# def get_unique_keywords(sets):
#     result_set = set()
#     for row_set in tqdm(sets.values):
#         #result_set.union(row_set)
#         result_set = result_set.union(row_set)
#     return result_set

# unique_keywords = get_unique_keywords(sets)



100%|██████████| 90090/90090 [11:16<00:00, 133.25it/s] 


In [35]:
# Write the set of DW keywords (before FuzzyWuzzy) to a file.
# A set has no stable iteration order, so the keywords are sorted first to
# make the file identical from run to run; key=str keeps the sort working
# even if a keyword is not a string.
# NOTE(review): the index and default header are still written, to keep the
# file layout unchanged for anything that already reads it.
pd.Series(sorted(unique_keywords, key=str)).to_csv('../data/interim/out_2019-2021_keywords_before_FuzzyWuzzy.csv')

In [33]:
# Number of distinct keywords in unique_keywords
len(unique_keywords)

101171

In [None]:
# sets_10000=sets

### Playing with FuzzyWuzzy

In [None]:
# #function from tutorial to get simplest matching ratio

# Str1 = "Apple Inc."
# Str2 = "apple Inc"
# Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
# print(Ratio)

In [29]:
# Show the fuzzywuzzy.process module help (documents dedupe and its default threshold)
help(process)

Help on module fuzzywuzzy.process in fuzzywuzzy:

NAME
    fuzzywuzzy.process - # encoding: utf-8

FUNCTIONS
    dedupe(contains_dupes, threshold=70, scorer=<function token_set_ratio at 0x7f06af223e60>)
        This convenience function takes a list of strings containing duplicates and uses fuzzy matching to identify
        and remove duplicates. Specifically, it uses the process.extract to identify duplicates that
        score greater than a user defined threshold. Then, it looks for the longest item in the duplicate list
        since we assume this item contains the most entity information and returns that. It breaks string
        length ties on an alphabetical sort.
        
        Note: as the threshold DECREASES the number of duplicates that are found INCREASES. This means that the
            returned deduplicated list will likely be shorter. Raise the threshold for fuzzy_dedupe to be less
            sensitive.
        
        Args:
            contains_dupes: A list of st

In [None]:
# ## !!VERY SLOW!!! 
# # Took 186 minutes to run for 10000 articles

# #fuzzywuzzy.process.dedupe returns a list without duplicates; by default it uses a similarity threshold of 70
# #to explore the similarity ratio for individual words use fuzzywuzzy.process.extract, e.g. process.extract('angela merkel',unique_keywords,limit=20)

# print(len(unique_keywords))
# ded_kw=dedupe(unique_keywords)
# print(len(ded_kw))

# #write the deduplicated keywords into the file
# pd.Series(list(ded_kw)).to_csv('../data/interim/out_dedupl_10k_articles_only_100323.csv')


In [31]:
# Read the previously saved dedupe() result from CSV instead of re-running dedupe
# NOTE(review): the loaded frame shows extra 'Unnamed: 0' columns, presumably
# index columns saved with the data — confirm before relying on column names.
ded_kw=pd.read_csv('../data/interim/out_dedupl_10k_articles_only_100323.csv')

In [32]:
# Display the deduplicated keywords loaded from CSV
ded_kw

Unnamed: 0.1,Unnamed: 0,0
0,0,daniel bieler
1,1,EADS Astrium
2,2,Chosen Soren
3,3,Gabbana
4,4,DJV
...,...,...
11906,11906,dowry
11907,11907,Hellfire missiles
11908,11908,Carlos Moltini
11909,11909,boj


### Exploring ratio of similarity for individual  keywords

In [None]:
# List up to 20 of the closest matches to 'angela merkel' among the unique
# keywords, to inspect similarity scores for a single keyword
process.extract('angela merkel',unique_keywords,limit=20)


In [None]:
# #example from the fuzzywuzzy tutorial on token ratio
# Str1 = "The supreme court case of Nixon vs The United States"
# Str2 = "Nixon v. United States"
# Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
# Partial_Ratio = fuzz.partial_ratio(Str1.lower(),Str2.lower())
# Token_Sort_Ratio = fuzz.token_sort_ratio(Str1,Str2)
# Token_Set_Ratio = fuzz.token_set_ratio(Str1,Str2)
# print(Ratio)
# print(Partial_Ratio)
# print(Token_Sort_Ratio)
# print(Token_Set_Ratio)


In [None]:
keywords=df['keywordStrings'] # presumably the same keywords stored as plain strings — TODO confirm format
keywords