# Data Parsing (Summaries)

In [None]:
# standard imports, add as needed
import pandas as pd
import numpy as np
import os
import re
from random import sample

In [None]:
# Gets file directory list, removes all duplicates in directory.
# A file is treated as a duplicate when its name contains '(1)' or 'copy';
# only names ending in 'txt' are kept.
# Each marker needs its own `not in` test: `'(1)' and 'copy' not in file`
# only ever checks 'copy', because the non-empty string '(1)' is simply truthy.
filedir = [
    file for file in os.listdir('Data')
    if '(1)' not in file and 'copy' not in file and file.endswith('txt')
]

In [None]:
# Pattern used to locate the summary heading (passed to match_maker as the summary pattern).
# (?i) makes it case-insensitive. The match starts where one of the whole words
# Measure / Impartial / Analysis / Counsel / Auditor begins, extends lazily along the
# same line ('.' does not cross newlines here) to the nearest position where the whole
# word Counsel or Analysis begins (possibly the starting word itself), and the trailing
# \w* consumes that word so the match ends right after it.
pattern_sum = r'(?i)(?=\b(Measure|Impartial|Analysis|Counsel|Auditor)\b).*?(?=\b(Counsel|Analysis)\b)\w*'

### Make first pass through the data, and extract the summaries

Process:
- For each file in filedir, match the line after the Impartial Analysis heading
- Then take the next 500 "words" and then end the string
- Store for analysis, and continue iteration

In [None]:
# Define function for re-usability in next portions of project
def match_maker(fdir, pattern, data_dir='Data', n_words=500):
    """Extract the text that follows the first regex match in each file.

    Parameters
    ----------
    fdir : iterable of str
        File names, resolved relative to ``data_dir``.
    pattern : str or compiled pattern
        Regular expression handed to ``re.search``; only the first match in
        each file is used.
    data_dir : str, optional
        Directory that contains the files (default ``'Data'``).
    n_words : int, optional
        Maximum number of whitespace-separated words kept after the match
        (default 500).

    Returns
    -------
    list of (str, str)
        One ``(file name, text)`` tuple per input file, in input order.
        ``text`` is the words following the match joined by single spaces,
        or the literal ``'NO MATCH'`` when the pattern is not found.
    """
    match = []

    for f in fdir:
        fp = os.path.join(data_dir, f)
        # `with` guarantees the handle is closed even if read() raises
        with open(fp, 'r', encoding="utf8") as file:
            f_text = file.read()

        # only the first occurrence of the pattern matters
        regex_match = re.search(pattern, f_text)
        if regex_match is None:
            match.append((f, 'NO MATCH'))
            continue

        # keep what comes after the match; hyphens and newlines become spaces
        f_text = f_text[regex_match.end():]
        for char in ['-', '\n']:
            f_text = f_text.replace(char, ' ')

        # Slicing already copes with texts shorter than n_words, so no special
        # case is needed (the old short-text branch dropped the final word).
        f_processed = " ".join(f_text.split()[:n_words])
        match.append((f, f_processed))

    return match

Now we can use this function to match all of the Impartial Analyses!

In [None]:
# Run the summary pattern (defined earlier) over every file in the directory list
match_sum = match_maker(filedir, pattern_sum)

# Share of files whose second tuple element is real text rather than the 'NO MATCH' placeholder
pct_matched = sum(1 for _fname, text in match_sum if text != 'NO MATCH') / len(match_sum)
print(f"We matched {pct_matched*100}% of the Impartial Analyses!")

### But the Arguments are a bit harder....

Plan of Action:
- Let's filter out the file types that mostly contain no arguments
- Design a pattern to match most of the arguments for
- Match arguments for
- Compare to master list

In [None]:
# Substrings marking file names to drop (sample ballots and the other listed name fragments)
filter_list = [
    "SampleBallot",
    "Res.",
    "BQ",
    'bq',
    "Bq",
    "Ord",
    "BallotQ",
]

# define to use as a filter for other parts of project
def list_filter(fdir, fil):
    """Return the names in ``fdir`` that contain none of the substrings in ``fil``.

    A name is dropped as soon as at least one truthy entry of ``fil`` occurs
    inside it; the order of the remaining names is unchanged.
    """
    kept = []
    for name in fdir:
        hits = [token for token in fil if token in name]
        if not any(hits):
            kept.append(name)
    return kept
        
# Files that remain after removing the filtered name fragments
filedir_args = list_filter(fdir=filedir, fil=filter_list)

# Before / after counts, shown as the cell output
(len(filedir), len(filedir_args))

Looks like it worked! Now let's match them :)

In [None]:
# Case-insensitive pattern for the heading of an "argument for" section: from the
# match position onward, the line must not contain REBUTTAL, must contain a word
# starting with ARGUMENT, and must contain FAVOR or FOR.
# NOTE(review): FAVOR|FOR has no word boundaries, so it also matches inside longer
# words (e.g. "before", "information") — a possible source of false positives.
pattern_for = r'(?i)((?!.*?(REBUTTAL).*?)(?=.*\b(ARGUMENT))(?=.*(FAVOR|FOR)).*)'
match_for = match_maker(filedir_args, pattern_for)

# Count the files that produced real text rather than the 'NO MATCH' placeholder
total_for = sum(1 for _fname, text in match_for if text != 'NO MATCH')
pct_matched_for = total_for / len(match_for)
print(f"We matched {pct_matched_for * 100}% of the docs!")

In [None]:
# Compare our match count against the hand-maintained master list
master_df = pd.read_csv('MasterList2019Nov14.csv')
# For_Coll: NaNs become 0, and 5's (Never Existed) are folded into 0 as well.
df_for = master_df['For_Coll'].fillna(0).replace(5, 0)
# Number of rows coded 0 in the master list, next to our number of matches
(df_for.value_counts().loc[0], total_for)

So, we matched about 50% of the For arguments according to the master list, but about 88% of the For arguments from
our filtered list. 

When we use the entire file directory, we match about 75% of the files. It is still unclear which is better: the larger data set,
which is more likely to contain false positives, or the smaller data set with higher accuracy and precision.

In [None]:
#helpme= match_maker(filedir, pattern_for)
#helpme_for = len([i for i in helpme if i[1] != 'NO MATCH'])
#helpme_matched_for = helpme_for/len(helpme)
#print('We matched ' + str(helpme_matched_for * 100) + "% of the docs!")

In [None]:

#pattern_against = r'(?i)((?!.*?(REBUTTAL).*?)(?=.*\b(ARGUMENT))(?=.*(AGAINST|OPPOSING)).*)'

#match_against = match_maker(filedir_args, pattern_against)

In [None]:
#pct_matched = len([i for i in match_against if i[1] != 'NO MATCH'])/len(match_against)
#print('We matched ' + str(pct_matched*100) + "% of the docs!")
#len([i for i in match_against if i[1] != 'NO MATCH'])

#Monica's attempts start from here:

In [None]:
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Fetch the NLTK resources used further down: word_tokenize is passed to the
# vectorizer as its tokenizer (presumably backed by the 'punkt' models) and
# stopwords.words("english") supplies the stop-word list.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Pull the extracted text (second tuple element) out of every match_for entry.
# A plain loop is kept on purpose: `text_for` stays bound after the loop and the
# following cell asserts on it.
list_for = []
for _fname, text_for in match_for:
    list_for.append(text_for)

In [None]:
# match_for holds one (file, text) entry per file in filedir_args, so the
# extracted list must line up with it. (The original check hard-coded 1218,
# the file count of the data set it was written against.)
assert len(list_for) == len(filedir_args), "list_for is out of step with filedir_args"
# Every entry — not only the last one the loop touched — must be a string
assert all(isinstance(text, str) for text in list_for), "every extracted argument should be a string"

In [None]:
# Development aid: IPython's trailing `?` displays the help for str.maketrans.
# It has no effect on notebook state and can be removed once the cleaning cell is final.
str.maketrans?

In [None]:
# Clean the "for" arguments: stray non-text symbols (presumably scan artefacts)
# become spaces and ASCII punctuation is deleted.
# `intab` used to include the letter 'r' (pasted in with the symbols), which
# blanked every 'r' in the text. The punctuation marks it listed were redundant:
# the third maketrans argument already maps them to None, i.e. deletes them.
intab = "■£►♦❖"
outtab = " " * len(intab)  # maketrans needs from/to strings of equal length
# Build the table once instead of once per document
punct_table = str.maketrans(intab, outtab, string.punctuation)
for_text = [x.translate(punct_table) for x in list_for]

In [None]:
# Lower-case the punctuation-stripped text.
# Start from `for_text` (the output of the cleaning cell) rather than from
# `list_for`; restarting from the raw list discarded the cleaning entirely.
for_text = [text.lower() for text in for_text]

for_text[199]

In [None]:
# TF-IDF over word tokens produced by NLTK's word_tokenize, with English stop
# words removed. sublinear_tf dampens raw term counts; the vocabulary is capped
# at 10000 terms; max_df is an integer, so it is an absolute document count:
# terms appearing in more than 5 documents are ignored.
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=word_tokenize,
    stop_words=stopwords.words("english"),
    sublinear_tf=True,
    max_features=10000,
    max_df=5,
)

In [None]:
# Development aid: IPython's trailing `?` displays the help for TfidfVectorizer.
# It has no effect on notebook state and can be removed from the finished notebook.
TfidfVectorizer?

In [None]:
# Document-term TF-IDF matrix for the "for" arguments: one row per file, one column per term
for_matrix = tfidf.fit_transform(list_for)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back for older installations.
if hasattr(tfidf, "get_feature_names_out"):
    for_terms = tfidf.get_feature_names_out()
else:
    for_terms = tfidf.get_feature_names()
for_tfidf = pd.DataFrame(for_matrix.toarray(), columns=for_terms, index=filedir_args)

In [None]:
# For each file, the term with the highest TF-IDF weight
most_unique = for_tfidf.idxmax(axis=1)
# The Series is labelled by file name, so positional access must go through
# .iloc; a bare integer key relies on a deprecated positional fallback.
most_unique.iloc[511]

In [None]:
for_tfidf.head()


In [None]:
# Pull the extracted summary text (second tuple element) out of every match_sum entry
list_sum = [text for _fname, text in match_sum]

# Clean the summaries: stray non-text symbols (presumably scan artefacts) become
# spaces, ASCII punctuation is deleted, and the result is lower-cased.
# `intab` used to include the letter 'r', which blanked every 'r' in the text,
# and the lower-casing step restarted from `list_sum`, discarding the cleaning.
intab = "■£►♦❖"
outtab = " " * len(intab)  # maketrans needs from/to strings of equal length
sum_table = str.maketrans(intab, outtab, string.punctuation)  # built once, reused per document
sum_text = [x.translate(sum_table).lower() for x in list_sum]

sum_text[1]

In [None]:
# Document-term TF-IDF matrix for the summaries: one row per file, one column per term.
# Note that this refits the shared `tfidf` vectorizer on the summary texts.
sum_matrix = tfidf.fit_transform(list_sum)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back for older installations.
if hasattr(tfidf, "get_feature_names_out"):
    sum_terms = tfidf.get_feature_names_out()
else:
    sum_terms = tfidf.get_feature_names()
sum_tfidf = pd.DataFrame(sum_matrix.toarray(), columns=sum_terms, index=filedir)

In [None]:
# For each file, the summary term with the highest TF-IDF weight
sum_most_unique = sum_tfidf.idxmax(axis=1)
# The Series is labelled by file name, so positional access must go through
# .iloc; a bare integer key relies on a deprecated positional fallback.
sum_most_unique.iloc[899]

In [None]:
# Preview only the first rows; rendering the whole frame bloats the notebook
sum_tfidf.head()

In [None]:
#pivot to readability

In [None]:
from readability import Readability

In [None]:
#Dale Chall Readability for Arguments for
# Documents the library cannot score are skipped (presumably texts that are too
# short, e.g. 'NO MATCH' placeholders — confirm) and counted, instead of being
# dropped silently. `except Exception` lets KeyboardInterrupt through, so the
# loop can be interrupted; the previous bare `except:` swallowed it.
skipped = 0
for title, text in zip(filedir_args, list_for):
    try:
        dc_score = Readability(text).dale_chall()
    except Exception:
        skipped += 1
        continue
    print(title, dc_score)

print(f"Skipped {skipped} documents that could not be scored")

In [None]:
#Gunning Fog Readability for Arguments for
# Documents the library cannot score are skipped (presumably texts that are too
# short, e.g. 'NO MATCH' placeholders — confirm) and counted, instead of being
# dropped silently. `except Exception` lets KeyboardInterrupt through, so the
# loop can be interrupted; the previous bare `except:` swallowed it.
skipped = 0
for title, text in zip(filedir_args, list_for):
    try:
        gf_score = Readability(text).gunning_fog()
    except Exception:
        skipped += 1
        continue
    print(title, gf_score)

print(f"Skipped {skipped} documents that could not be scored")

In [None]:
#Dale Chall Readability for Summaries
# Documents the library cannot score are skipped (presumably texts that are too
# short, e.g. 'NO MATCH' placeholders — confirm) and counted, instead of being
# dropped silently. `except Exception` lets KeyboardInterrupt through, so the
# loop can be interrupted; the previous bare `except:` swallowed it.
skipped = 0
for title, text in zip(filedir, list_sum):
    try:
        dc_score = Readability(text).dale_chall()
    except Exception:
        skipped += 1
        continue
    print(title, dc_score)

print(f"Skipped {skipped} documents that could not be scored")

In [None]:
#Gunning Fog Readability for Summaries
# Documents the library cannot score are skipped (presumably texts that are too
# short, e.g. 'NO MATCH' placeholders — confirm) and counted, instead of being
# dropped silently. `except Exception` lets KeyboardInterrupt through, so the
# loop can be interrupted; the previous bare `except:` swallowed it.
skipped = 0
for title, text in zip(filedir, list_sum):
    try:
        gf_score = Readability(text).gunning_fog()
    except Exception:
        skipped += 1
        continue
    print(title, gf_score)

print(f"Skipped {skipped} documents that could not be scored")