In [None]:
import pandas as pd

import spacy
from spaczz.matcher import FuzzyMatcher

In [None]:
ls s3-data

In [None]:
# Load the two input CSVs: committee rows into `df`, the candidate list into `cands`.
df = pd.read_csv("committees-extracted-candidates.csv")
cands = pd.read_csv("s3-data/candidate_list_2010_to_2020.csv")
# Print column dtypes and non-null counts for the candidate list.
cands.info()

In [None]:
# Column names of the committees frame.
df.columns

In [None]:
# Column names of the candidate list.
cands.columns

In [None]:
# Keep only the committees that carry a potential candidate name.
has_candidate = df["processed_candidate"].notna()
df_dropped = df[has_candidate]

# Everything filtered out above is set aside for manual review.
for_review = df.drop(index=df_dropped.index)

# All-NaN placeholder that will receive the matched candidate ids, one row per
# committee that has a candidate name.
matched_candidate = pd.DataFrame(columns=["matched_candidate"], index=df_dropped.index)

# Rows fed to the matcher; narrow the slice to a few rows for quick tests.
subset = df_dropped.iloc[:]

In [None]:
# Committees without a processed candidate name (set aside for manual review).
for_review

In [None]:
import numpy as np
import time

start = time.time()

# Committees whose name matched several candidates (or whose city did not confirm
# the match) and committees with no candidate above the threshold, respectively.
multiple_matches = []
no_matches = []

# Create the matcher once, outside of the loop. It is much slower to instantiate it
# on every iteration than to instantiate it here and use `.add` / `.remove`.
nlp = spacy.blank("en")
matcher = FuzzyMatcher(nlp.vocab)

# The ballot names do not change between committees, so tokenise them once instead
# of on every iteration. Missing names are skipped because `nlp` only accepts text.
ballot_names = cands["name_on_ballot"].dropna()
ballot_docs = [nlp(name) for name in ballot_names]


def first_match_ratio(doc):
    """Return the ratio of the first fuzzy match found in `doc`, or NaN if there is none.

    The matcher is called once per document. The ratio is read positionally (fourth
    field of the match tuple), which also tolerates match tuples that carry extra
    trailing fields.
    """
    found = matcher(doc)
    return found[0][3] if found else np.nan


for index, candidate in subset.iterrows():

    # Register this committee's candidate name as the only pattern, score every
    # ballot name against it, then unregister it so the matcher can be reused.
    matcher.add("NAME", [nlp(candidate["processed_candidate"])])
    scores = pd.Series(
        [first_match_ratio(doc) for doc in ballot_docs],
        index=ballot_names.index,
        dtype="float64",
    )
    matcher.remove("NAME")

    # Keep only the ballot names that produced a match.
    match_percentage = scores.dropna()

    # A 100% match wins outright (first one, as `idxmax` would return). Test for
    # presence explicitly: the winning index label may be 0, which is falsy.
    perfect = match_percentage[match_percentage == 100]
    if len(perfect):
        matches_index = [perfect.index[0]]
    else:
        matches_index = match_percentage[match_percentage > 90].index

    matches = cands.loc[matches_index]

    # Matches whose city equals the committee's city.
    matches_filter = matches[matches["city"] == candidate["committee_city"]]

    if len(matches) == 0:
        # NOTE(review): this record uses the key "candidate" while multiple_matches
        # uses "committee" for the same row; kept as-is so the saved output keeps its shape.
        no_matches.append({"candidate": candidate, "matches": []})
        continue

    id_nums = pd.unique(matches["candidate_name"])

    # Several distinct candidates: narrow down by city when any city agrees.
    if len(id_nums) > 1 and len(matches_filter):
        id_nums = pd.unique(matches_filter["candidate_name"])

    if len(id_nums) == 1 and len(matches_filter):
        # Success: exactly one candidate, confirmed by a city match.
        matched_candidate.loc[index] = id_nums[0]
    else:
        multiple_matches.append({"committee": candidate, "matches": matches})

end = time.time()

print(f"Elapsed time: {end-start}")

In [None]:
# Debug view: city-filtered matches left over from the last iteration of the matching loop.
matches_filter

In [None]:
# Committees that received a matched candidate (placeholder rows still NaN are dropped).
matched_candidate.dropna()

In [None]:
# Number of committees that received a matched candidate.
len(subset.reindex(matched_candidate.dropna().index))

In [None]:
# Rename the single column to `candidate_name`, the key used for the merge with `cands` below.
matched_candidate.columns = ["candidate_name"]

In [None]:
# Full placeholder frame, including committees that are still unmatched (NaN).
matched_candidate

In [None]:
# Attach the matched candidate column to the committee rows, aligned on the index.
# NOTE(review): matched_candidate is indexed by all of df_dropped, so if `subset` is
# narrowed for testing this outer-aligned concat adds rows that are NaN in the committee columns.
t2 = pd.concat([subset, matched_candidate], axis=1)

In [None]:
# Number of rows in t2 that have a matched candidate.
len(t2.dropna(subset=["candidate_name"]))

In [None]:
# Join the candidate-list rows to the committees matched to them. Unmatched
# committees carry NaN in `candidate_name`; drop them first because pandas matches
# null merge keys against each other, which would pair them with any candidate-list
# row whose `candidate_name` is missing.
merged = cands.merge(t2.dropna(subset=["candidate_name"]), on=["candidate_name"])

In [None]:
# Keep only merged rows that have a value in `election_dt`.
capture = merged.dropna(subset=["election_dt"])

In [None]:
# Column dtypes and non-null counts before the empty columns are removed.
capture.info()

In [None]:
# Remove columns that are empty in every remaining row. The name is rebound rather
# than mutated with inplace=True: `capture` was derived from `merged`, and in-place
# edits on a derived frame are what pandas flags with SettingWithCopyWarning.
capture = capture.dropna(axis=1, how="all")

In [None]:
# Column dtypes and non-null counts after the all-NaN columns were removed.
capture.info()

In [None]:
# Write the matched rows to CSV (without the index) for manual examination.
capture.to_csv("examine_all_90.csv", index=False)

In [None]:
# Candidate identifiers present in the exported rows.
capture["candidate_name"]

In [None]:
# Records the loop could not resolve to a single city-confirmed candidate.
multiple_matches

In [None]:
# Debug view: candidate-list index labels selected in the last loop iteration.
matches_index

In [None]:
# Debug view: cities of the candidate rows matched in the last loop iteration.
matches["city"]

In [None]:
# Debug view: city of the last committee row processed by the loop.
candidate["committee_city"]

In [None]:
# Debug view: index label of the last committee row processed by the loop.
index

In [None]:
# The committee rows that were fed to the matching loop.
subset

In [None]:
# How many committees ended up in the multiple-matches list.
len(multiple_matches)

In [None]:
# How many committees had no candidate match at all.
len(no_matches)

In [None]:
# Turn the two lists of record dicts into DataFrames (one row per record, one column per dict key).
mm_pd = pd.DataFrame(multiple_matches)
no_pd = pd.DataFrame(no_matches)

In [None]:
# Persist both review frames as pickles; their cells hold pandas objects taken from the loop records.
# NOTE(review): only load these pickle files back from a trusted location.
mm_pd.to_pickle("multiple_matches.pkl")
no_pd.to_pickle("no_matches.pkl")