# Importing libraries

In [1]:
import csv
import os
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))

csv.field_size_limit(1000000000)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\fazal\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fazal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fazal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fazal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


131072

# Reading the file and performing basic cleanup

In [2]:
# read csv file from data folder
df = pd.read_csv(os.path.join('data', 'state-of-the-union.csv'), names=['year', 'speech'], skiprows=1)

df['speech'] = df['speech'].str.replace('\nState of the Union Address\n', '')
df['speech'] = df['speech'].str.replace('\nAddress to Joint Session of Congress \n', '')
df['speech'] = df['speech'].str.replace('\nAddress on Administration Goals (Budget Message)\n', '')
df['speech'] = df['speech'].str.replace('\nAddress on Administration Goals\n', '')
df['speech'] = df['speech'].str.replace('\nAddress to Congress \n', '')

df['president'] = df['speech']

df['president'] = df['president'].str.split('\n').str[0]
df['date'] = df['speech'].str.split('\n').str[1]

temp_date = df[df['date'] == 'Address on Administration Goals (Budget Message)']['speech'].str.split('\n').str[3]
df['date'][df['date'] == 'Address on Administration Goals (Budget Message)'] = temp_date.values[0]

# delete first 3 lines of speech
df['speech'] = df['speech'].str.split('\n').str[3:]
# make a string list
df['speech'] = df['speech'].str.join(' ')
# replace \ with ''
df['speech'] = df['speech'].str.replace('\\\'', '')

  df['speech'] = df['speech'].str.replace('\nAddress on Administration Goals (Budget Message)\n', '')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'][df['date'] == 'Address on Administration Goals (Budget Message)'] = temp_date.values[0]
  df['speech'] = df['speech'].str.replace('\\\'', '')


# Performing Lemmatization

In [3]:
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in word_tokenize(text)]


df['speech'] = df['speech'].apply(lemmatize_text)
df['speech'] = df['speech'].apply(lambda x: [item for item in x if item not in stop_words])
df['speech'] = df['speech'].apply(lambda x: ' '.join(x))

print(df['speech'].head(10))

# Performing Stemming

In [8]:
# perform stemming
stemmer = PorterStemmer()


def stem_text(text):
    return [stemmer.stem(w) for w in word_tokenize(text)]


df['speech'] = df['speech'].apply(stem_text)
df['speech'] = df['speech'].apply(lambda x: [item for item in x if item not in stop_words])
df['speech'] = df['speech'].apply(lambda x: ' '.join(x))

# remove punctuation
# import string
#
# df['speech'] = df['speech'].str.replace('[{}]'.format(string.punctuation), '')


print(df['speech'].head(10))

0    fellowcitizen senat hou repr meet feel much sa...
1    fellowcitizen senat hou repr vain may expect p...
2    fellowcitizen senat hou repr abat satisfact me...
3    fellowcitizen senat hou repr sinc commenc term...
4    fellowcitizen senat hou repr call mind graciou...
5    fellowcitizen senat hou repr trust deceiv indu...
6    fellowcitizen senat hou repr recur intern situ...
7    gentlemen senat gentlemen hou repr wa time app...
8    gentlemen senat gentlemen hou repr rever resig...
9    gentlemen senat gentlemen hou repr peculiar sa...
Name: speech, dtype: object


In [6]:
# remove punctuation
import string

df['speech'] = df['speech'].str.replace('[{}]'.format(string.punctuation), '')
print(df['speech'].head(10))

0    fellowcitizen senat hous repres  meet feel muc...
1    fellowcitizen senat hous repres   vain may exp...
2    fellowcitizen senat hous repres  abat satisfac...
3    fellowcitizen senat hous repres  sinc commenc ...
4    fellowcitizen senat hous repres  call mind gra...
5    fellowcitizen senat hous repres  trust deceiv ...
6    fellowcitizen senat hous repres  recur intern ...
7    gentlemen senat gentlemen hous repres  wa time...
8    gentlemen senat gentlemen hous repres  rever r...
9    gentlemen senat gentlemen hous repres  peculia...
Name: speech, dtype: object


  df['speech'] = df['speech'].str.replace('[{}]'.format(string.punctuation), '')


In [7]:
from gensim import corpora

# create a dictionary from a list of speeches
dictionary = corpora.Dictionary(df['speech'])

# convert the dictionary to a bag of words
corpus = [dictionary.doc2bow(speech) for speech in df['speech']]

print(corpus[0])

TypeError: doc2bow expects an array of unicode tokens on input, not a single string

In [9]:
import gensim

tokens = gensim.utils.simple_preprocess("A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation", min_len=4)
print(tokens)

['value', 'trying', 'copy', 'slice', 'from', 'dataframe', 'caveats', 'documentation']
