In [1]:
import sqlite3
import requests
from bs4 import BeautifulSoup
import re
from collections import Counter
from nltk.corpus import stopwords

In [2]:
# create SQL DB
conn = sqlite3.connect('texas.sqlite')
cur = conn.cursor()

cur.execute('''CREATE TABLE IF NOT EXISTS TexasInmate
    (execution INTEGER UNIQUE, last_statement TEXT, age INTEGER,
     execution_date TEXT, race TEXT, county TEXT)''')

<sqlite3.Cursor at 0x1a12074420>

In [3]:
# BeautifulSoup to pull out table and content rows
url = 'http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
rows = table.find_all('tr')

In [4]:
# helper function

base_url = 'http://www.tdcj.state.tx.us/death_row/dr_info/'

def get_lastwords(link):
    inmate = link.split('/')
    inmate = inmate[-1]
    if inmate != 'no_last_statement.html':
        url = base_url + inmate
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        paragraphs = soup.find_all('p')
        words_list = [x.get_text() for x in paragraphs]
        words = ' '.join(words_list)
        target = 'Last Statement: '
        statement_index = words.find(target) + len(target)
        last_statement = words[statement_index:]
        stripped_words = re.sub('[^A-Za-z 0-9]+', '', last_statement)
        list_of_words = stripped_words.split()

        return ' '.join(list_of_words)
    else:
        return 'This offender declined to make a last statement.'

In [5]:
for row in rows:
    if row.find_all('td'):
        elements = row.find_all('td')
        last_words = get_lastwords(elements[2].a['href'])
        data = (elements[0].get_text(), last_words,
                elements[6].get_text(),elements[7].get_text(),
                elements[8].get_text(), elements[9].get_text())
        cur.execute('''INSERT OR IGNORE INTO TexasInmate (execution,
            last_statement, age, execution_date, race, county)
            VALUES ( ?, ?, ?, ?, ?, ? )''', data)


conn.commit()
cur.close()

In [6]:
stop_words = set(stopwords.words('english'))

conn = sqlite3.connect('texas.sqlite')
cur = conn.cursor()

cur.execute('SELECT last_statement FROM TexasInmate')

<sqlite3.Cursor at 0x1a12bac6c0>

In [7]:
bag_of_words = Counter()

for statement in cur:
    if statement[0] == 'This offender declined to make a last statement.':
        continue
    else:
        words = statement[0].split()
        new_word_list = []
        for word in words:
            if word in stop_words:
                continue
            else:
                new_word_list.append(word)
        bag_of_words.update(new_word_list)

In [8]:
common_words = ['I', 'Im', 'to','yall','going','To','You','dont','Yes','And','The']
for word in common_words:
    stop_words.add(word)


for word in stop_words:
    if word in bag_of_words:
        del bag_of_words[word]
print(bag_of_words)

Counter({'love': 761, 'family': 352, 'know': 337, 'sorry': 272, 'want': 247, 'would': 243, 'like': 231, 'God': 229, 'say': 195, 'thank': 176, 'hope': 160, 'life': 151, 'Lord': 149, 'forgive': 146, 'Thank': 129, 'people': 128, 'peace': 122, 'see': 117, 'one': 117, 'Jesus': 114, 'Warden': 104, 'done': 101, 'take': 95, 'tell': 93, 'strong': 92, 'pain': 91, 'go': 89, 'ready': 88, 'heart': 87, 'ask': 86, 'friends': 80, 'death': 80, 'get': 80, 'years': 79, 'find': 78, 'give': 78, 'right': 76, 'man': 75, 'everybody': 73, 'us': 70, 'time': 69, 'come': 69, 'This': 68, 'home': 67, 'keep': 66, 'He': 66, 'back': 65, 'care': 65, 'We': 65, 'could': 62, 'pray': 61, 'never': 61, 'always': 61, 'caused': 60, 'Christ': 59, 'forgiveness': 59, 'My': 57, 'way': 57, 'got': 57, 'Father': 57, 'good': 56, 'Thats': 56, 'didnt': 55, 'Ill': 54, 'loved': 53, 'It': 52, 'much': 52, 'let': 52, 'If': 52, 'everyone': 51, 'make': 51, 'innocent': 51, 'everything': 50, 'nothing': 49, 'lot': 48, 'day': 48, 'cant': 48, 'Tell

In [9]:
most_common = bag_of_words.most_common(30)

In [10]:
words = []
counts = []

for word in most_common:
    words.append(word[0])
    counts.append(word[1])
    
    

In [11]:
words

['love',
 'family',
 'know',
 'sorry',
 'want',
 'would',
 'like',
 'God',
 'say',
 'thank',
 'hope',
 'life',
 'Lord',
 'forgive',
 'Thank',
 'people',
 'peace',
 'see',
 'one',
 'Jesus',
 'Warden',
 'done',
 'take',
 'tell',
 'strong',
 'pain',
 'go',
 'ready',
 'heart',
 'ask']