In [None]:
from curses.ascii import isalpha, isdigit
from plistlib import InvalidFileException
import sys
import os
import numpy as np
from scipy import spatial
from Bill import Bill,CongressPerson
import tensorflow as tf
from keras.preprocessing.text import Tokenizer

In [None]:
def make_bills(folder:str):
    """Build a Bill for every ``.txt`` file directly inside ``folder``.

    Files are visited in sorted name order so the resulting list (and anything
    indexed against it, such as labels) is the same on every run;
    ``os.listdir`` itself returns entries in arbitrary order.

    Args:
        folder: Directory containing the bill text files.

    Returns:
        A list of Bill objects, one per file that Bill accepted. Files for
        which Bill raises InvalidFileException are skipped.
    """
    bills = []
    for file_name in sorted(os.listdir(folder)):
        # Require a real ".txt" extension, not just a name that ends in "txt".
        if not file_name.endswith('.txt'):
            continue
        try:
            bills.append(Bill(os.path.join(folder, file_name)))
        except InvalidFileException:
            # Best effort: one rejected file should not abort the whole load.
            continue
    return bills

def to_word_vector(translator:dict, bill:Bill):
    """Turn a bill into a vector of length-normalised word counts.

    Args:
        translator: Maps each known word to its index in the output vector
            (see make_translator).
        bill: Either a Bill (its ``text`` is used) or a plain string.

    Returns:
        A float32 array with one cell per translator entry holding
        ``100000 * count / number_of_characters`` for that word. Words missing
        from ``translator`` are ignored. An empty text yields all zeros.
    """
    if isinstance(bill, str):
        words = get_words(bill)
        text_length = len(bill)
    else:
        words = get_words_bill(bill)
        text_length = len(bill.text)

    # Count in a wide integer type: uint8 wraps around after 255 occurrences.
    counts = np.zeros(len(translator), dtype=np.int64)
    for word in words:
        index = translator.get(word)
        if index is not None:
            counts[index] += 1

    if text_length == 0:
        # No characters means no words; avoid dividing by zero.
        return np.zeros(len(translator), dtype=np.float32)

    # Scaled frequencies routinely exceed 255, so they are kept as floats
    # instead of being squeezed back into uint8.
    return (100000 * counts / text_length).astype(np.float32)

def make_translator(bills:list):
    """Make a dictionary with word as input, index in word count vector as output.

    Args:
        bills: Bills whose words (as produced by get_words_bill) form the
            vocabulary.

    Returns:
        A dict mapping every distinct word to a unique index in
        ``range(len(vocabulary))``. Indices follow sorted word order so the
        same bills always produce the same mapping; iterating a set of strings
        directly is not stable from one interpreter run to the next.
    """
    vocabulary = set()
    for bill in bills:
        vocabulary.update(get_words_bill(bill))
    return {word: index for index, word in enumerate(sorted(vocabulary))}

def get_words_bill(bill:Bill):
    """Split ``bill.text`` into lowercase words.

    A word is a maximal run of ASCII letters and digits; every other character
    acts as a separator and is discarded.

    Args:
        bill: Object whose ``text`` attribute holds the string to tokenise.

    Returns:
        The words in order of appearance, including a word that runs up to the
        very end of the text.
    """
    words = []
    current_word = ''
    for character in bill.text.lower():
        # ASCII-only alphanumerics: accented letters etc. count as separators.
        if character.isascii() and character.isalnum():
            current_word += character
        elif current_word:
            words.append(current_word)
            current_word = ''
    # Flush the final word: text ending on a letter or digit has no trailing
    # separator to trigger the append inside the loop.
    if current_word:
        words.append(current_word)

    return words

def get_words(bill:str):
    """Split a string into lowercase words.

    A word is a maximal run of ASCII letters and digits; every other character
    acts as a separator and is discarded.

    Args:
        bill: The text to tokenise.

    Returns:
        The words in order of appearance, including a word that runs up to the
        very end of the text.
    """
    words = []
    current_word = ''
    for character in bill.lower():
        # ASCII-only alphanumerics: accented letters etc. count as separators.
        if character.isascii() and character.isalnum():
            current_word += character
        elif current_word:
            words.append(current_word)
            current_word = ''
    # Flush the final word: text ending on a letter or digit has no trailing
    # separator to trigger the append inside the loop.
    if current_word:
        words.append(current_word)

    return words


def make_labels(bills):
    """One-hot encode the party of every bill.

    Args:
        bills: Objects exposing a hashable ``party`` attribute.

    Returns:
        ``(labels, party_list)`` where ``labels[i]`` is a uint8 one-hot array
        for ``bills[i]`` and ``party_list[j]`` is the party encoded by column
        ``j``. Columns follow the order in which parties first appear in
        ``bills``.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the column
    # layout is reproducible; iterating a set of strings is not stable from one
    # interpreter run to the next.
    party_list = list(dict.fromkeys(bill.party for bill in bills))
    party_to_column = {party: column for column, party in enumerate(party_list)}
    labels = []
    for bill in bills:
        label = np.zeros(len(party_list), dtype='uint8')
        label[party_to_column[bill.party]] = 1
        labels.append(label)
    return labels,party_list

def max_index(list_of_stuff:list) -> int:
    """Return the index of the largest element.

    Works for any values, including sequences whose entries are all below -1.
    When several elements tie for the maximum, the first one wins.

    Args:
        list_of_stuff: An indexable sequence of mutually comparable values.

    Returns:
        The index of the maximum, or 0 when the sequence is empty.
    """
    if len(list_of_stuff) == 0:
        return 0
    # max() returns the first maximal item, which keeps the earliest index on ties.
    return max(range(len(list_of_stuff)), key=lambda position: list_of_stuff[position])

In [None]:
# Load the pre-trained GloVe vectors into a word -> float32 vector dictionary.
GloVe_file = '/Users/lucasgover/Downloads/glove.6B/glove.6B.50d.txt'
emmbed_dict = {}
# GloVe text files are UTF-8; name the encoding so loading does not depend on
# the platform's default.
with open(GloVe_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is a token followed by whitespace-separated vector components.
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], 'float32')
        emmbed_dict[word] = vector

In [None]:
# Load every bill that make_bills can build from the folder below.
BillsLocation = '/Users/lucasgover/Desktop/AI431_Projects/AIFinal/Bills_2021-2022'
bills = make_bills(folder=BillsLocation)

In [None]:
# NOTE(review): this cell re-defines get_words, which is already defined in the
# helper cell near the top of the notebook; the later definition is the one in
# effect after Run All. Keep a single definition.
def get_words(bill:str):
    """Split a string into lowercase words.

    A word is a maximal run of ASCII letters and digits; every other character
    acts as a separator and is discarded.

    Args:
        bill: The text to tokenise.

    Returns:
        The words in order of appearance, including a word that runs up to the
        very end of the text.
    """
    words = []
    current_word = ''
    for character in bill.lower():
        # ASCII-only alphanumerics: accented letters etc. count as separators.
        if character.isascii() and character.isalnum():
            current_word += character
        elif current_word:
            words.append(current_word)
            current_word = ''
    # Flush the final word: text ending on a letter or digit has no trailing
    # separator to trigger the append inside the loop.
    if current_word:
        words.append(current_word)

    return words

In [None]:
# Import ideology tags: build a "<state> <name>" -> ideology score lookup.
# NOTE(review): SenatorsFile previously pointed at the *house* CSV, so senators
# were never loaded. The senate filename below mirrors the house file's naming
# pattern — confirm it matches the file on disk.
SenatorsFile = '/Users/lucasgover/Desktop/Political-Sentiment-Analysis-PG-Capstone/Ideological_Tags/govtrack-stats-2020-senate-ideology.csv'
HouseFile = '/Users/lucasgover/Desktop/Political-Sentiment-Analysis-PG-Capstone/Ideological_Tags/govtrack-stats-2020-house-ideology.csv'


def _read_ideology_scores(csv_path: str) -> dict:
    """Read one ideology CSV into a ``"<state> <name>" -> score`` dict.

    The first line is treated as a header and skipped. For every other
    non-blank line the key is built from the third-from-last column and the
    last column, and the value is the fourth column parsed as a float.
    """
    scores = dict()
    with open(csv_path, "r") as ideology_csv:
        rows = ideology_csv.read().split('\n')[1:]
    for row in rows:
        if not row.strip():
            # A trailing newline leaves an empty row; indexing it would fail.
            continue
        fields = row.split(',')
        # [2:-1] drops the first two characters and the last one of the name
        # column (presumably a b'...' wrapper around the name — TODO confirm).
        scores[fields[-3] + " " + fields[-1][2:-1]] = float(fields[3])
    return scores


StateAndNameToIdeology = dict()
StateAndNameToIdeology.update(_read_ideology_scores(SenatorsFile))
StateAndNameToIdeology.update(_read_ideology_scores(HouseFile))

In [None]:
def bill_to_ideology(bill:Bill, ideologiesDict:dict):
    """Average the ideology scores of a bill's sponsor and cosponsors.

    Each legislator is looked up under the key
    ``"<state> <full_name up to its first comma>"``. Legislators without an
    entry in ``ideologiesDict`` are left out of the average entirely, so a
    missing sponsor no longer drags the result toward zero.

    Args:
        bill: Object with a ``sponsor`` and an iterable ``cosponsors``; each
            legislator exposes ``state`` and ``full_name``.
        ideologiesDict: Maps lookup keys to numeric ideology scores.

    Returns:
        The mean score of the legislators that were found, or 0.0 when none of
        them were.
    """
    scores = []
    for congress_person in [bill.sponsor, *bill.cosponsors]:
        key = congress_person.state + " " + congress_person.full_name.split(',')[0]
        if key in ideologiesDict:
            scores.append(ideologiesDict[key])
    if not scores:
        # Nothing to average; 0.0 is what this case has always produced.
        return 0.0
    # Divide by the number of matched legislators only.
    return sum(scores) / len(scores)

In [None]:
def bills_to_dataset(bills_list:list, chunk_size:int = 100):
    """Cut every sponsored bill into fixed-size word chunks labelled by ideology.

    Args:
        bills_list: Bills exposing ``sponsor``, ``title`` and ``text``.
        chunk_size: Number of words per chunk (100 by default).

    Returns:
        ``(texts, labels)`` as parallel lists. ``texts[i]`` is whatever
        dataset_to_embedding_dataset returns for one chunk of words and
        ``labels[i]`` is the ideology score of the bill that chunk came from.
        Bills without a sponsor are skipped, and words left over after the
        last full chunk are dropped.
    """
    texts = []
    labels = []
    for bill in bills_list:
        # Skip before tokenising: a bill without a sponsor cannot be labelled.
        if bill.sponsor is None:
            continue
        words = get_words(bill.title + bill.text)
        bill_ideology = bill_to_ideology(bill, StateAndNameToIdeology)
        # Walk the word list by offset instead of re-slicing it on every chunk.
        full_chunks_end = (len(words) // chunk_size) * chunk_size
        for start in range(0, full_chunks_end, chunk_size):
            texts.append(dataset_to_embedding_dataset(words[start:start + chunk_size]))
            labels.append(bill_ideology)
    return texts,labels

In [None]:
# Every GloVe vector has the same length, so read it from any one entry.
# dict.keys is a method and cannot be subscripted, hence the iterator.
emmbed_dims = len(next(iter(emmbed_dict.values())))


embedding_matrix = np.zeros((len(emmbed_dict), emmbed_dims))
for word, i in emmbed_dict