In [2]:
import pandas as pd
import csv
import os
import collections
import re

In [3]:
def wordCount(file):
    """Return the number of whitespace-separated tokens left to read in ``file``.

    ``file`` is any object with a ``read()`` method returning text. The whole
    remaining content is consumed, so the caller must rewind or reopen the
    file before reading it again.
    """
    return len(file.read().split())

In [4]:
def sentenceCount(file):
    """Return the number of non-blank, period-delimited fragments in ``file``.

    ``file`` is any object with a ``read()`` method returning text; its
    remaining content is consumed.

    Splitting on "." always yields one fragment after the final period (and a
    single empty fragment for empty text), so whitespace-only fragments are
    skipped to avoid over-counting. Only "." is treated as a sentence
    terminator; "?" and "!" are not.
    """
    data = file.read()
    return sum(1 for fragment in data.split(".") if fragment.strip())

In [5]:
def getTimePeriod(file):
    """Return the index of the 20-year bucket containing the speech's year.

    Buckets are [1780, 1800), [1800, 1820), ..., [2000, 2020), numbered from 0.

    ``file`` is any object with a ``read()`` method returning text; its
    remaining content is consumed. The year is read from a line starting with
    ``<date=``; if several such lines exist, the last one wins.

    Returns None when there is no ``<date=`` line or the year falls outside
    1780-2019. Raises ValueError if the sliced characters are not an integer.
    """
    markers = [1780, 1800, 1820, 1840, 1860, 1880, 1900, 1920, 1940, 1960, 1980, 2000, 2020]
    date = None
    for line in file.read().split("\n"):
        if line.startswith("<date="):
            # The year is the four characters before the line's last two
            # characters (presumably a closing `">` -- confirm against corpus).
            date = int(line[len(line)-6:len(line)-2])
    if date is None:
        # No date line: report "unknown" instead of failing on an unbound name.
        return None
    for x in range(len(markers) - 1):
        if markers[x] <= date < markers[x + 1]:
            return x
    return None

In [6]:
def avgSentLength(file):
    """Return the mean number of words per period-delimited sentence.

    ``file`` is any object with a ``read()`` method returning text; its
    remaining content is consumed.

    Fragments with no words (such as the text after the final period) are
    ignored so they do not drag the average down. Returns 0.0 when the text
    contains no words at all.
    """
    data = file.read()
    # split() with no argument splits on any run of whitespace, so newlines and
    # repeated spaces neither merge words nor create empty ones.
    lengths = [
        len(words)
        for words in (sentence.split() for sentence in data.split("."))
        if words
    ]
    if not lengths:
        return 0.0
    return sum(lengths) / len(lengths)

In [7]:
def syllable_count(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each run of consecutive vowels (a, e, i, o, u, y) counts as one syllable,
    and one is subtracted when the word ends in "e". A non-empty word always
    scores at least 1. An empty string scores 0 rather than raising
    IndexError.

    Punctuation is not stripped: any non-vowel character is treated like a
    consonant.
    """
    word = word.lower()
    if not word:
        return 0
    vowels = "aeiouy"
    count = 1 if word[0] in vowels else 0
    # A vowel starts a new group only when the preceding character is not a vowel.
    for previous, current in zip(word, word[1:]):
        if current in vowels and previous not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    return max(count, 1)

In [8]:
def avgSyllablePerWord(file):
    """Return the mean ``syllable_count`` over the whitespace-separated words.

    ``file`` is any object with a ``read()`` method returning text; its
    remaining content is consumed. Returns 0.0 for text with no words instead
    of raising ZeroDivisionError.
    """
    words = file.read().split()
    if not words:
        return 0.0
    return sum(syllable_count(word) for word in words) / len(words)

In [9]:
# Lower-case term lists passed to countTerms, one list per topic.
femaleGendered = ["she", "her", "hers", "woman", "women", "girl", "girls", "female"]
maleGendered = ["he", "him", "his", "man", "men", "boy", "boys", "male"]
equality = ["equality", "equal", "fairness", "equal rights", "equal opportunities", "egalitarian", "egalitarianism", "equity",
           "equitability"]
# A comma was missing after "african american", which silently fused it with
# "whites" into one unmatched term. Misspelled terms are corrected so they can
# match correctly spelled text.
race = ["white", "black", "asian", "indian", "african", "caucasian", "ethnicity", "european", "hispanic", "indigenous",
       "racism", "minority", "race", "prejudice", "racial", "latin", "latino", "latina", "jewish", "jew", "african american",
       "whites", "blacks", "people of color", "native american", "native"]
religion = ["religion", "jewish", "christian", "hindu", "judaism", "hinduism", "christianity", "lutheran", "catholic",
           "catholicism", "protestant", "protestantism", "god", "buddhist", "buddhism", "religious"]

In [10]:
def countTerms(file, terms):
    """Count case-insensitive occurrences of ``terms`` in the text of ``file``.

    ``file`` is any object with a ``read()`` method returning text; its
    remaining content is consumed. ``terms`` is an iterable of strings, each a
    single word or a multi-word phrase.

    Text and terms are both reduced to runs of letters and digits, so newlines
    and punctuation next to a word no longer hide it ("she," matches "she").
    Apostrophes split words, so "he's" matches "he".

    Matching scans left to right and prefers the longest term at each
    position. Matched words are consumed, so "african american" counts once
    even when "african" is also listed.
    """
    word_pattern = r"[^\W_]+"
    tokens = re.findall(word_pattern, file.read().lower())
    phrases = {tuple(re.findall(word_pattern, term.lower())) for term in terms}
    phrases.discard(())  # a term with no letters or digits can never match
    sizes = sorted({len(phrase) for phrase in phrases}, reverse=True)

    cnt = 0
    pos = 0
    while pos < len(tokens):
        for size in sizes:
            if pos + size > len(tokens):
                continue
            if tuple(tokens[pos:pos + size]) in phrases:
                cnt += 1
                pos += size
                break
        else:
            # Nothing starts at this word; move on to the next one.
            pos += 1
    return cnt

In [11]:
  
from lxml import html
import requests

# Download the Guardian datablog page whose table is parsed below.
# A timeout is set because requests otherwise waits on a stalled connection
# indefinitely, which would hang the notebook.
r = requests.get("https://www.theguardian.com/news/datablog/2012/oct/15/us-presidents-listed", timeout=30)
# Fail here with a clear HTTP error rather than later with an IndexError when
# the expected table is missing from an error page.
r.raise_for_status()
r.encoding = 'utf-8'
tree = html.fromstring(r.content)

# First table with the exact class "in-article sortable", then its body rows.
table = tree.xpath('//table[@class="in-article sortable"]')[0]
table = table.xpath('./tbody')[0]
rows = table.xpath('./tr')

# Maps the integer in each row's first cell to the text of its third cell
# (presumably year -> party of the sitting president; confirm against the page).
partyDict = {}

for row in rows:
    cols = row.xpath('./td')
    year = cols[0].xpath('./text()')[0].strip()
    party = cols[2].xpath('./text()')[0].strip()
    #print(year, party)
    partyDict[int(year)] = party

    
#f = open("year_party_data.txt", "w+", encoding='utf-8')
#f.write('\n'.join(data))
#f.close()

In [12]:
def getPoliticalParty(file):
    """Return a numeric party code for the year in the speech's ``<date=`` line.

    ``file`` is any object with a ``read()`` method returning text; its
    remaining content is consumed. If several ``<date=`` lines exist, the last
    one wins. The year is looked up in the module-level ``partyDict``.

    Codes:
        0 "None", 1 "Federalist", 2 "Democratic-Republican", 3 "Whig",
        6 "National Union",
        "Republican": 4 before 1865, 8 for 1865-1960, 10 from 1961,
        "Democrat":   5 before 1865, 7 for 1865-1960,  9 from 1961.

    Returns None when there is no ``<date=`` line or the year is not a key of
    ``partyDict``. An unrecognised party string is returned unchanged.
    Raises ValueError if the sliced characters are not an integer.
    """
    year = None
    for line in file.read().split("\n"):
        if line.startswith("<date="):
            # The year is the four characters before the line's last two
            # characters (presumably a closing `">` -- confirm against corpus).
            year = int(line[len(line)-6:len(line)-2])
    if year is None:
        # No date line: report "unknown" instead of failing on an unbound name.
        return None

    partyRaw = partyDict.get(year)

    fixed_codes = {
        "None": 0,
        "Federalist": 1,
        "Democratic-Republican": 2,
        "Whig": 3,
        "National Union": 6,
    }
    # Codes per era: (before 1865, 1865-1960, 1961 onwards).
    era_codes = {
        "Republican": (4, 8, 10),
        "Democrat": (5, 7, 9),
    }

    if partyRaw in fixed_codes:
        return fixed_codes[partyRaw]
    if partyRaw in era_codes:
        if year < 1865:
            era = 0
        elif year < 1961:
            era = 1
        else:
            era = 2
        return era_codes[partyRaw][era]
    return partyRaw
    

In [None]:
# Build one row of readability and term-count features per speech file found
# under <corpus>/<president>/<speech file>.
column_names = ["speech", "word_count", "sentence_count", "average_words",
               "average_syl_per_word", "flesch_kincaid_score",  "flesch_kincaid_grade_level",
               "total_gendered_terms", "female_gendered_terms",
                "male_gendered_terms",
               "terms_of_equality", "terms_for_race",
                "terms_for_religion", "political_party", "president",
               "time_period"]

# Per-speech accumulators; index i of every list describes the same speech.
speeches = []
wordCounts = []
sentenceCounts = []
avgSentCounts = []
avgSylCounts = []
femaleCounts = []
maleCounts = []
equalityCounts = []
raceCounts = []
religionCounts = []
presidents = []
timePeriods = []
parties = []
gradeLevels = []

# Path is different depending on where the notebook is run from.
corpus_path = "corpus/" if os.path.isdir("corpus/") else "../corpus/"

# Each metric function consumes the whole file, so the handle is rewound
# before every call instead of reopening the file once per metric.
metric_sinks = [
    (sentenceCounts, sentenceCount),
    (wordCounts, wordCount),
    (avgSentCounts, avgSentLength),
    (avgSylCounts, avgSyllablePerWord),
    (femaleCounts, lambda fh: countTerms(fh, femaleGendered)),
    (maleCounts, lambda fh: countTerms(fh, maleGendered)),
    (equalityCounts, lambda fh: countTerms(fh, equality)),
    (raceCounts, lambda fh: countTerms(fh, race)),
    (religionCounts, lambda fh: countTerms(fh, religion)),
    (timePeriods, getTimePeriod),
    (parties, getPoliticalParty),
]

# sorted() makes the row order reproducible; os.listdir order is arbitrary.
for higherLevelFilename in sorted(os.listdir(corpus_path)):
    president_dir = os.path.join(corpus_path, higherLevelFilename)
    if not os.path.isdir(president_dir):
        # Skip stray files (e.g. OS metadata) sitting next to the president folders.
        continue
    for filename in sorted(os.listdir(president_dir)):
        speech_path = os.path.join(president_dir, filename)
        if not os.path.isfile(speech_path):
            continue
        with open(speech_path, 'rt', errors='ignore') as f:
            for sink, metric in metric_sinks:
                f.seek(0)
                sink.append(metric(f))
        # Record the file name, not the (now closed) file object, so the
        # "speech" column holds a usable identifier.
        speeches.append(filename)
        presidents.append(higherLevelFilename)

df = pd.DataFrame({
    "speech": speeches,
    "word_count": wordCounts,
    "sentence_count": sentenceCounts,
    "average_words": avgSentCounts,
    "average_syl_per_word": avgSylCounts,
    "female_gendered_terms": femaleCounts,
    "male_gendered_terms": maleCounts,
    "terms_of_equality": equalityCounts,
    "terms_for_race": raceCounts,
    "terms_for_religion": religionCounts,
    "president": presidents,
    "time_period": timePeriods,
    "political_party": parties,
})

# Derived columns.
df["total_gendered_terms"] = df["male_gendered_terms"] + df["female_gendered_terms"]
# Flesch-Kincaid grade level: 0.39 * words/sentence + 11.8 * syllables/word - 15.59
df["flesch_kincaid_grade_level"] = (df["average_words"] * 0.39 + df["average_syl_per_word"] * 11.8) - 15.59
# Flesch reading-ease score: 206.835 - 1.015 * words/sentence - 84.6 * syllables/word
df["flesch_kincaid_score"] = 206.835 - (1.015*df["average_words"]) - (84.6 * df["average_syl_per_word"])

# Restore the declared column order. The default integer index is kept: the
# earlier set_index("speech") call discarded its result and never took effect.
df = df[column_names]



In [None]:
# Show the assembled per-speech feature table as this cell's output.
df

In [193]:
# Write the feature table to final_proj_dataset.csv; the path is relative, so
# the file is created in the current working directory.
df.to_csv(r'final_proj_dataset.csv')