Assignment 2.2 - Build Your Text Classifiers
Zach Hill
DSC-550-T302
08DEC2019

In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# First-time setup only: uncomment to fetch the NLTK resources used below
# (the stop-word corpus and the 'punkt' tokenizer models).
# nltk.download('stopwords')
# nltk.download('punkt')

# Text helpers shared by later cells: a Porter stemmer and the English
# stop-word list, held as a set for fast membership tests.
ps = PorterStemmer()
stop_words = {*stopwords.words('english')}

# Vectorizers shared by later cells. DictVectorizer is asked for a dense
# array (sparse=False); TfidfVectorizer keeps its defaults.
dvect = DictVectorizer(sparse=False)
tvect = TfidfVectorizer()

In [3]:
# Load the income data from a JSON file located next to the notebook.
df = pd.read_json('./Income.json')

In [4]:
# Display the loaded DataFrame as the cell result.
df

Unnamed: 0,# of kids,Income,State
0,5,25000,CA
1,5,122500,NY
2,2,142007,TX
3,2,42007,TX
4,0,14704,TX
5,1,200704,TX
6,1,120070,CA
7,3,207040,NY
8,3,48000,NY
9,3,79000,NY


In [5]:
# Show the column labels exactly as they were loaded.
df.columns

Index(['# of kids', 'Income', 'State'], dtype='object')

In [6]:
def clean_header(header):
    """Normalise one column header.

    The header is lower-cased, every character in ``string.punctuation`` is
    deleted, surrounding whitespace is trimmed, and English stop words are
    dropped. The remaining tokens are concatenated without a separator.

    If every token is a stop word (for example a single-word header that is
    itself a stop word), the unfiltered tokens are used instead so the column
    never ends up with an empty name.

    Parameters
    ----------
    header : str
        Original column label.

    Returns
    -------
    str
        Cleaned column label.
    """
    text = header.lower().translate(str.maketrans('', '', string.punctuation)).strip()
    tokens = word_tokenize(text)
    kept = [word for word in tokens if word not in stop_words]
    # Fall back to the unfiltered tokens rather than produce an empty label.
    return ''.join(kept or tokens)


# Build the complete old -> new mapping first and rename once, instead of
# mutating the frame column by column while looping over its labels.
df = df.rename(columns={col: clean_header(col) for col in df.columns})

In [7]:
# Show the column labels after the header clean-up step.
df.columns

Index(['kids', 'income', 'state'], dtype='object')

In [8]:
# Stem every header with the Porter stemmer in a single rename.
# Computing the whole mapping up front avoids cascading renames: with one
# in-place rename per column, a freshly stemmed label that happens to equal
# another existing header would be renamed a second time later in the loop.
df = df.rename(columns={col: ps.stem(col) for col in df.columns})

In [9]:
# Show the column labels after stemming.
df.columns

Index(['kid', 'incom', 'state'], dtype='object')

In [10]:
def _lower_if_text(value):
    """Return ``value`` lower-cased when it is a str; otherwise return it unchanged."""
    if isinstance(value, str):
        return value.lower()
    return value


# Iterating a DataFrame yields its column labels. Each column is replaced by
# its lower-cased version (non-string values pass through) and then printed.
for label in df:
    df[label] = df[label].apply(_lower_if_text)
    print(df[label])


0    5
1    5
2    2
3    2
4    0
5    1
6    1
7    3
8    3
9    3
Name: kid, dtype: int64
0     25000
1    122500
2    142007
3     42007
4     14704
5    200704
6    120070
7    207040
8     48000
9     79000
Name: incom, dtype: int64
0    ca
1    ny
2    tx
3    tx
4    tx
5    tx
6    ca
7    ny
8    ny
9    ny
Name: state, dtype: object


In [11]:
# Fit the shared TF-IDF vectorizer on the state column; the resulting matrix
# is shown as the cell result.
tvect.fit_transform(df['state'])

<10x3 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [12]:
# List the learned terms in column order.
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Ordering the fitted vocabulary_ mapping (term -> column
# index) by index yields the same list of plain strings on old and new versions.
sorted(tvect.vocabulary_, key=tvect.vocabulary_.get)

['ca', 'ny', 'tx']

In [13]:
# Toy corpus: one dict of term -> count per document.
data_dict = [
    dict(Red=2, Blue=4),
    dict(Red=4, Blue=3),
    dict(Red=1, Yellow=2),
    dict(Red=2, Yellow=2),
]

In [14]:
# Vectorize the list of term-count dicts with the shared DictVectorizer
# (created with sparse=False) and keep the result for inspection.
features = dvect.fit_transform(data_dict)

In [15]:
# Display the vectorized term counts as the cell result.
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [16]:
# Term counts for four example documents, one dict per document.
doc_1_word_count = dict(Red=2, Blue=4)
doc_2_word_count = dict(Red=4, Blue=3)
doc_3_word_count = dict(Red=1, Yellow=2)
doc_4_word_count = dict(Red=2, Yellow=2)

In [17]:
# Gather the four per-document term-count dicts into one list, in document order.
doc_word_counts = [doc_1_word_count, doc_2_word_count, doc_3_word_count, doc_4_word_count]

In [18]:
# Vectorize the per-document term counts with the shared DictVectorizer; the
# resulting array is shown as the cell result.
dvect.fit_transform(doc_word_counts)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [19]:
# Read the positive and negative review files the same way: no header row,
# Latin-1 encoded. `p` holds the positive reviews and `n` the negative ones.
p, n = (
    pd.read_table(review_path, header=None, encoding='latin-1')
    for review_path in ('./positive.txt', './negative.txt')
)

  """Entry point for launching an IPython kernel.
  


The code below builds tokenized word lists for sentiment analysis from roughly 10,000 reviews. To avoid imbalanced samples, the positive and negative sets contain the same number of reviews. These tokenized lists can then be used in text analysis to predict whether other reviews would be classified as positive or negative.

Source: https://pythonprogramming.net/new-data-set-training-nltk-tutorial/

In [20]:
# Raw string so that \w reaches the regex engine untouched; the non-raw
# '\w+' form relies on an invalid string escape that Python warns about.
word_tokenizer = RegexpTokenizer(r'\w+')


def build_review_features(reviews):
    """Tokenize reviews, drop stop words, and vectorize the remaining words.

    Parameters
    ----------
    reviews : pandas.Series
        One review text per element.

    Returns
    -------
    tuple
        ``(tokens, clean, feat, feat_names)`` where ``tokens`` is a Series of
        per-review token lists (punctuation removed by the ``\\w+`` pattern),
        ``clean`` is one flat list of every non-stop-word token across all
        reviews, ``feat`` is the matrix returned by ``tvect.fit_transform``
        on ``clean``, and ``feat_names`` lists the learned terms in column
        order.
    """
    tokens = reviews.apply(word_tokenizer.tokenize)
    # Compare case-insensitively: the stop-word list is lower-case, so a
    # capitalised stop word such as "The" would otherwise slip through.
    clean = [
        word
        for review_tokens in tokens
        for word in review_tokens
        if word.lower() not in stop_words
    ]
    # NOTE(review): `clean` is a flat word list, so the vectorizer receives
    # each word as its own "document" (one matrix row per word). Confirm this
    # is intended rather than one row per review.
    feat = tvect.fit_transform(clean)
    # get_feature_names() was removed in scikit-learn 1.2; ordering the
    # fitted vocabulary_ (term -> column index) by index gives the same list.
    vocabulary = tvect.vocabulary_
    feat_names = sorted(vocabulary, key=vocabulary.get)
    return tokens, clean, feat, feat_names


# Positive first, then negative: the shared `tvect` is refit on each call, so
# after this cell it holds the negative-review vocabulary, as before.
p_tokens, p_clean, p_feat, p_feat_names = build_review_features(p[0])
n_tokens, n_clean, n_feat, n_feat_names = build_review_features(n[0])

In [21]:
# Display a summary of the positive-review feature matrix.
p_feat

<57821x12417 sparse matrix of type '<class 'numpy.float64'>'
	with 57652 stored elements in Compressed Sparse Row format>

In [22]:
# Display a summary of the negative-review feature matrix.
n_feat

<56729x12758 sparse matrix of type '<class 'numpy.float64'>'
	with 56543 stored elements in Compressed Sparse Row format>