# Data Parsing (Summaries)

In [5]:
# standard imports, add as needed
import pandas as pd
import numpy as np
import os
import re
from random import sample

In [45]:
# Gets file directory list, removes all duplicates in directory
filedir = os.listdir('Data')
filedir = [file for file in filedir if (
    ('(1)' and 'copy' not in file) and (file[-3:] == 'txt'))
]

In [51]:
# pattern matching all the for arguments
pattern_sum = r'(?i)(?=\b(Measure|Impartial|Analysis|Counsel|Auditor)\b).*?(?=\b(Counsel|Analysis)\b)\w*'

### Make first pass through the data, and extract the summaries

Process:
- For each file in filedir, match the line after the Impartial Analysis heading
- Then take the next 500 "words" and then end the string
- Store for analysis, and continue iteration

In [53]:
# Define function for re-usability in next portions of project
def match_maker(fdir, pattern):
    match = []

    for f in fdir:
        # read file
        fp = os.path.join('Data', f)
        file = open(fp, 'r')
        f_text = file.read()

        # use regex to match the first instance of regex, then read the next 500 words.
        regex_match = re.search(pattern, f_text)

        # if regex matched
        if regex_match:
            f_text = f_text[regex_match.end():]
            # cleaning text
            for char in ['-','\n']:
                f_text = f_text.replace(char,' ')

            f_split = f_text.split()
            # debugger, ignore : print(f + " is this long: " + str(len(f_split)))
            # Some matches won't have 500 characters following
            if len(f_split) <= 500:
                f_processed = " ".join(f_split[:(len(f_split) - 1)])
            else:
                f_processed = " ".join(f_split[:500])

            # add to list
            match.append((f, f_processed))
        else:
            match.append((f, 'NO MATCH'))
        file.close()
        
    return match

Now we can use this function to match all of the Impartial Analyses!

In [55]:
# use pattern for summary defined previously
match_sum = match_maker(filedir, pattern_sum)

pct_matched = len([i for i in match_sum if i[1] != 'NO MATCH'])/len(match_sum)
print('We matched ' + str(pct_matched*100) + "% of the Impartial Analyses!")

We matched 79.10787437414656% of the Impartial Analyses!


### But the Arguments are a bit harder....

Plan of Action:
- Let's filter out the file types that are a majority argumentless
- Design a pattern to match most of the arguments for
- Match arguments for
- Compare to master list

In [56]:
# filter our sample ballot files
filter_list = ["SampleBallot", "Res.", "BQ", 'bq', "Bq", "Ord", "BallotQ"]

# define to use as a filter for other parts of project
def list_filter(fdir, fil):
    return [f for f in fdir if any(l for l in fil if l in f) == False]        
        
filedir_args = list_filter(filedir, filter_list)

len(filedir), len(filedir_args)

(2197, 1218)

Looks like it worked! Now lets match them :)

In [59]:
pattern_for = r'(?i)((?!.*?(REBUTTAL).*?)(?=.*\b(ARGUMENT))(?=.*(FAVOR|FOR)).*)'
match_for = match_maker(filedir_args, pattern_for)

total_for = len([i for i in match_for if i[1] != 'NO MATCH'])
pct_matched_for = total_for/len(match_for)
print('We matched ' + str(pct_matched_for * 100) + "% of the docs!")

We matched 88.17733990147784% of the docs!


In [66]:
# Let's compare our results with the master list
master_df = pd.read_csv('MasterList2019Nov14.csv')
# Fill all NaNs with 0, and replace 5's (Never Existed) with 0's.
df_for = master_df.For_Coll.fillna(0).replace(5, 0)
df_for.value_counts().loc[0], total_for

(1966, 1074)

So, we matched about 50% of the For arguments according to the master list, but about 88% of the For arguments from
our filtered list. 

When we use the entire file directory, we get about 75% matches on that. It is still uncertain whether one with a higher likelihood
to have false positives is better than a smaller, higher accuracy and higher precision data set 

In [68]:
helpme= match_maker(filedir, pattern_for)
helpme_for = len([i for i in helpme if i[1] != 'NO MATCH'])
helpme_matched_for = helpme_for/len(helpme)
print('We matched ' + str(helpme_matched_for * 100) + "% of the docs!")

We matched 74.55621301775149% of the docs!


In [69]:
pattern_against = r'(?i)((?!.*?(REBUTTAL).*?)(?=.*\b(ARGUMENT))(?=.*(AGAINST|OPPOSING)).*)'

match_against = matchmaker(filedir_args, pattern_against)

NameError: name 'matchmaker' is not defined

In [109]:
pct_matched = len([i for i in match_against if i[1] != 'NO MATCH'])/len(match_against)
print('We matched ' + str(pct_matched*100) + "% of the docs!")
len([i for i in match_against if i[1] != 'NO MATCH'])

We matched 79.60159362549801% of the docs!


999