# Team DatenWelle

## Keyword merging with FuzzyWuzzy

This notebook loads the data from JSON, performs some keyword cleaning, and merges misspelled duplicate keywords with the FuzzyWuzzy package.

In [None]:
#!git pull
#!git status
#!git add 2-anya-keywords_FuzzyWuzzy.ipynb #1-anya_exploratory_analysis.ipynb
#!git commit -m 'added a thing to put keywords back into the dataframe (incomplete)'
#!git push
#!pip install -r ../requirements.txt

#after installed new libraries
#!pip freeze > requirements.txt
#!git add requirements.txt 
#!git add out_dedupl_100323.csv test.csv
#!git commit -m 'added output files f fuzzy wuzzy dedupe'
#!git commit -m 'added library fuzzywuzzy'
#!git push 

In [None]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fuzzywuzzy.process import dedupe
import functools


In [None]:
# Load the raw CMS export.
# The context manager closes the file handle as soon as parsing is done
# (the bare open() used before left it open for the kernel's lifetime), and
# the explicit encoding keeps decoding independent of the platform default.
with open('../data/raw/CMS_2010_to_June_2022_ENGLISH.json', encoding='utf-8') as f:
    # json.load returns the parsed JSON document as Python objects
    data = json.load(f)

df = pd.DataFrame.from_dict(data)

#print(df.head())

#### Finding the subset of the data for 1 Jan 2019 - 1 Jan 2022 based on lastModifiedDate

In [None]:
# Order the articles by modification date and keep a parsed copy of the
# timestamp next to the raw column.
df = df.sort_values(by='lastModifiedDate') #sort dataframe

datetimes = pd.to_datetime(df['lastModifiedDate'])
df['ts_lastModifiedDate']=datetimes
#df.iloc[ts_start]['ts_lastModifiedDate']

# Start of the subset: earliest modification timestamp on 1 Jan 2019 (UTC)
ts_start=datetimes[(datetimes > pd.Timestamp(year=2019, month=1, day=1).tz_localize('utc'))
          & (datetimes < pd.Timestamp(year=2019, month=1, day=2).tz_localize('utc'))].min()
print(ts_start)
# End of the subset (exclusive): earliest modification timestamp on 1 Jan 2022 (UTC)
ts_end=datetimes[(datetimes > pd.Timestamp(year=2022, month=1, day=1).tz_localize('utc'))
          & (datetimes < pd.Timestamp(year=2022, month=1, day=2).tz_localize('utc'))].min()
print(ts_end)

# .min() of an empty selection is NaT; fail with a clear message instead of
# the IndexError that the index lookups below would otherwise raise.
if pd.isna(ts_start) or pd.isna(ts_end):
    raise ValueError('no article was modified on the chosen start and/or end day')

start_date=datetimes[datetimes == ts_start]
end_date=datetimes[datetimes == ts_end]

# Index labels of the boundary articles (printed for reference only)
start_index=start_date.index[0]
print(start_index)

end_index=end_date.index[0]
print(end_index)

# Select by timestamp rather than with df[start_index:end_index]: a plain
# integer slice is positional, while start_index/end_index are index *labels*
# that no longer line up with row positions once the frame has been re-sorted.
# .copy() makes the subset independent of df so that new columns can be added
# to it later without SettingWithCopyWarning.
df_subset=df[(datetimes >= ts_start) & (datetimes < ts_end)].copy()


In [None]:
#df_subset=df_subset[:100]

In [None]:
len(df_subset)

### Keywords exploration


In [None]:
df['keywords'] # is keywords in dictionary

In [None]:
df['keywords'].isna().sum()


In [None]:
#create series of keywords sets
def get_keywords(row):
    """Return the distinct keyword names of one article.

    Parameters
    ----------
    row : iterable or None
        The keyword entries of a single article; every entry is indexed
        with the key ``'name'``. ``None`` means the article has no entries.

    Returns
    -------
    set or None
        The set of ``'name'`` values found in ``row``, or ``None`` when
        ``row`` is ``None``.
    """
    if row is None:
        return None
    return {entry['name'] for entry in row}

df['keywords'].apply(get_keywords)

In [None]:
#extract individual keywords from the per-article keyword sets

# Timings recorded with the earlier functools.reduce(set.union, sets) version,
# which builds a brand-new set on every step:
# 10000 articles in 7 seconds
# df_subset (90090 articles) runs in 10 minutes 10 seconds

# sets=df_subset['keywords'].apply(get_keyword1) #full dataset
sets=df_subset['keywords'].apply(get_keywords)  #2019-2021 subset
#sets=sets[0:10000] #10000 articles

# Grow a single set in place so that each keyword is copied only once.
# get_keywords returns None for articles without keywords; dropna() skips
# those rows, which set.union could not handle.
kw=set()
for article_keywords in sets.dropna():
    kw.update(article_keywords)

In [None]:
kw

In [None]:
# # another way to extract individual keywords from the sets of sets that doesn't crash kernel is interrupted
# # so it might be more stable when later applied to the entire dataset

# #runs for 10000 articles in 5 seconds
# #runs for df_subset in  11 min 16 sec 

# sets=df_subset['keywords'].apply(get_keywords)
# #sets=sets[0:10000]
# from tqdm import tqdm
# def get_unique_keywords(sets):
#     result_set = set()
#     for row_set in tqdm(sets.values):
#         #result_set.union(row_set)
#         result_set = result_set.union(row_set)
#     return result_set

# unique_keywords = get_unique_keywords(sets)



In [None]:
#write the set of DW keywords before fuzzyWuzzy into the file
# (only `kw` is written: `unique_keywords` is not defined until the file is
# read back further down, and its output was overwritten by this line anyway)
pd.Series(list(kw)).to_csv('../data/interim/out_2019-2021_keywords_before_FuzzyWuzzy.csv')

In [None]:
#unique_keywords=kw

In [None]:
#load the set of DW keywords before fuzzyWuzzy from the file
# dtype=str and keep_default_na=False keep every keyword a string: with the
# defaults, keywords such as "NA" or "null" are parsed as float NaN, which
# the string matching further down cannot process.
uni_kw=pd.read_csv('../data/interim/out_2019-2021_keywords_before_FuzzyWuzzy.csv',
                   dtype={'0': str}, keep_default_na=False)

In [None]:
unique_keywords=set(uni_kw['0'])

In [None]:
unique_keywords

In [None]:
# sets_10000=sets

### Playing with FuzzyWuzzy

In [None]:
# #function from tutorial to get simplest matching ratio

# Str1 = "Apple Inc."
# Str2 = "apple Inc"
# Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
# print(Ratio)

In [None]:
help(process)

In [None]:
help(fuzz.token_set_ratio)

In [None]:
## !!VERY SLOW!!!
# Took 186 minutes to run for 10000 articles

# fuzzywuzzy's process.dedupe returns the keywords with near-duplicates removed;
# by default it uses a 70% similarity ratio, here the threshold is raised to 90.
# To explore the similarity ratio of an individual keyword use process.extract,
# e.g. process.extract('angela merkel',unique_keywords,limit=20)

print(len(unique_keywords))  # number of keywords before deduplication
#ded_kw=dedupe(unique_keywords)
ded_kw = dedupe(unique_keywords, threshold=90)
print(len(ded_kw))  # number of keywords after deduplication

# Persist the deduplicated keywords so the slow step above need not be repeated
ded_kw_series = pd.Series(list(ded_kw))
#ded_kw_series.to_csv('../data/interim/out_dedupl_10k_articles_only_100323.csv')
ded_kw_series.to_csv('../data/interim/out_dedupl_2019-2021_articles_only_100323.csv')


In [None]:
# Read the deduplicated keywords from CSV instead of re-running dedupe
#ded_kw=pd.read_csv('../data/interim/out_dedupl_10k_articles_only_100323.csv')
# dtype=str and keep_default_na=False keep every keyword a string: with the
# defaults, keywords such as "NA" or "null" are parsed as float NaN, which
# process.extractOne cannot score.
ded_kw=pd.read_csv('../data/interim/out_dedupl_2019-2021_articles_only_100323.csv',
                   dtype={'0': str}, keep_default_na=False)

In [None]:
deduplicated=set(ded_kw['0'])

In [None]:
deduplicated

### Exploring ratio of similarity for individual  keywords

In [None]:
process.extract('angela merkel',unique_keywords,limit=40)


In [None]:
process.extract('planetary defense conference',unique_keywords,limit=40)


In [None]:
process.extract('Chosen Soren',unique_keywords,limit=20)

In [None]:
process.extract('Sex pistols',unique_keywords,limit=20)

In [None]:
process.extract('UEFA',unique_keywords,limit=40)

In [None]:
process.extract('UAE',unique_keywords,limit=40)

In [None]:
process.extract('United Arab Emirates',unique_keywords,limit=40)

In [None]:
process.extract('war in Ukraine',unique_keywords,limit=40)

In [None]:
process.extract('UK',unique_keywords,limit=40)

In [None]:
process.extract('United Kingdom',unique_keywords,limit=40)

In [None]:
# #example from the fuzzywuzzy tutorial on token ratio
# Str1 = "The supreme court case of Nixon vs The United States"
# Str2 = "Nixon v. United States"
# Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
# Partial_Ratio = fuzz.partial_ratio(Str1.lower(),Str2.lower())
# Token_Sort_Ratio = fuzz.token_sort_ratio(Str1,Str2)
# Token_Set_Ratio = fuzz.token_set_ratio(Str1,Str2)
# print(Ratio)
# print(Partial_Ratio)
# print(Token_Sort_Ratio)
# print(Token_Set_Ratio)


### "Putting back" merged clean keywords into the dataframe

In [None]:
list_kw=list(unique_keywords)[0]

In [None]:
df_subset['keywordStrings'][0]

In [None]:
# i=0
# len(df_subset['keywordStrings'][i])

In [None]:
# #n=0
# df_subset['keywordStrings'][i][n]

In [None]:
# 1) loop over each line in dataframe
# 2) loop over each keyword in the line
# 3) find process.extractOne a substitute from deduplicated list
# 4) create a new column in dataframe with merged keywords


def put_clean_kw_into_df(dataframe, choices=None, return_ratios=False):
    """Replace every keyword with its best fuzzy match among the cleaned keywords.

    Parameters
    ----------
    dataframe : iterable
        One entry per article (e.g. the ``keywordStrings`` column), each entry
        being a list of keyword strings. Entries that are ``None`` or NaN are
        treated as articles without keywords. Entries are visited in order, so
        the index labels of a pandas Series play no role.
    choices : collection of str, optional
        Candidate keywords to match against. Defaults to the notebook-level
        ``deduplicated`` set.
    return_ratios : bool, optional
        When true, also return the similarity ratio of every match.

    Returns
    -------
    list of list
        For each article, the matched keyword of every original keyword.
        With ``return_ratios=True`` a tuple ``(matches, ratios)`` is returned,
        where ``ratios`` has the same nested shape as ``matches``.
    """
    if choices is None:
        choices = deduplicated

    # The same keyword occurs in many articles and process.extractOne scans
    # all choices on every call, so each distinct keyword is looked up once.
    best_match = {}

    sample_line = []
    sample_ratio = []
    # Iterate over the values directly: the previous `i <= len(dataframe)`
    # counter ran one step past the end and used `i` as an index label.
    for keywords in dataframe:
        if keywords is None or (isinstance(keywords, float) and keywords != keywords):
            keywords = []  # article without keywords (None / NaN)

        line_wr = []
        ratio_line_wr = []
        for keyword in keywords:
            if keyword not in best_match:
                best_match[keyword] = process.extractOne(keyword, choices)
            match = best_match[keyword]
            line_wr.append(match[0])        # matched keyword
            ratio_line_wr.append(match[1])  # similarity ratio of the match

        sample_line.append(line_wr)
        sample_ratio.append(ratio_line_wr)

    if return_ratios:
        return sample_line, sample_ratio
    return sample_line
        
    
df_lines=put_clean_kw_into_df(df_subset['keywordStrings'])



In [None]:
# `lines` is never defined in this notebook; the matched keywords returned by
# put_clean_kw_into_df are stored in `df_lines`.
print(df_lines)

In [None]:
df_subset['keywordStrings'][i]


In [None]:
#figure out list comprehension later
# i=0
# def put_clean_kw_into_df(dataframe):
#     return [process.extractOne(dataframe[n], deduplicated)[0] for dataframe[n] in dataframe]

# put_clean_kw_into_df(df_subset['keywordStrings'][i])

In [None]:
# line_wr.append(str(word_ratio_line[0]))

In [None]:
i

In [None]:
print(line)

In [None]:
word_ratio_line

In [None]:
word_ratio

In [None]:
df_subset['cleaned_keywordStrings']=word_ratio[0]
df_subset['cleaned_sim_ratio_keywordStrings']=word_ratio[1]

In [None]:
df_subset['cleaned_keywordStrings'][i][n]=word_ratio[0]
df_subset['cleaned_sim_ratio_keywordStrings'][i][n]=word_ratio[1]

In [None]:
df_subset['cleaned_keywordStrings'][0][0]='lalal'

In [None]:
n