In [7]:
## Import Libraries ##

import json
from pprint import pprint
from pandas import *
from pandas.io.json import json_normalize


# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *


In [9]:
## Get Data ##

#reference on data: https://www.kaggle.com/c/random-acts-of-pizza/data
# pull in the training and test data
with open('/Users/erikaananda/Documents/MIDS/W207/Final Project/data/train.json', encoding='utf-8') as data_file:
    trainData = json.loads(data_file.read())   
with open('/Users/erikaananda/Documents/MIDS/W207/Final Project/data/test.json', encoding='utf-8') as data_file:
    testData = json.loads(data_file.read())    

# create a dev data set 
devData = trainData[0:1000]
trainData = trainData[1000:]

# show how the data looks in its original format
pprint("data in json format:")
pprint(trainData[1])

# create a normalized view
allTData = json_normalize(trainData)
print("\nSize of the normalized Data:", allTData.shape)
print("\nnormalized data columns:", list(allTData))

allDData = json_normalize(devData)


'data in json format:'
{'giver_username_if_known': 'N/A',
 'number_of_downvotes_of_request_at_retrieval': 2,
 'number_of_upvotes_of_request_at_retrieval': 2,
 'post_was_edited': False,
 'request_id': 't3_yemx8',
 'request_number_of_comments_at_retrieval': 1,
 'request_text': 'My boyfriend and I live in Saint Augustine, Florida and have '
                 'been having a rough time financially the past few months.  '
                 "In and out of various jobs, we've had to survive off of "
                 'coscto sized ramen packs, and pasta and olive oil.  I '
                 'applied for food stamps a couple days ago, and am waiting to '
                 "hear back from them.  It's getting a little trite, and we're "
                 "quite hungry tonight, a hot pizza would be a delight.  We'll "
                 'happily pay it forward in the future.  Much love.',
 'request_text_edit_aware': 'My boyfriend and I live in Saint Augustine, '
                            'Florida and ha

In [16]:
## Create subsets of data for analysis ###

# create a flat dataset without the subreddits list
flatData = allTData.drop('requester_subreddits_at_request', 1)
# create a separate dataset with just subreddits, indexed on request id
# we can creata a count vector on the words, run Naive Bayes against it, 
# and add the probabilities to our flat dataset
subredTData = allTData[['request_id','requester_subreddits_at_request']]
subredTData.set_index('request_id', inplace=True)

subredDData= allDData[['request_id','requester_subreddits_at_request']]
subredDData.set_index('request_id', inplace=True)

# our training labels
trainLabel = allTData['requester_received_pizza']

devLabel = allDData['requester_received_pizza']

# what do these look like?
print(list(flatData))
print(subredTData.shape)
print(subredTData['requester_subreddits_at_request'][1])

# create a corpus of subreddits to vectorize
trainCorpus = []
for index in range(len(subredTData)):
    trainCorpus.append(' '.join(subredTData['requester_subreddits_at_request'][index]))
    
devCorpus = []
for index in range(len(subredDData)):
    devCorpus.append(' '.join(subredDData['requester_subreddits_at_request'][index]))


['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_minus_downvotes_at_retrieval', '

In [76]:
## Analyze ##

# create a train vector
vectorizer = CountVectorizer(min_df=1)
trainVector = vectorizer.fit_transform(trainCorpus)
devVector = vectorizer.transform(devCorpus)
print ("\nThe size of the vocabulary for the training data is", trainVector.shape[1])
print("\nFirst 5 feature Names:", vectorizer.get_feature_names()[1:6])

# get the best regularization
regStrength = [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 6.0, 10.0]
catCoef = []

for c in regStrength:
    modelLogit = LogisticRegression(penalty='l2', C=c)
    modelLogit.fit(trainVector, trainLabel)
    logitScore = round(modelLogit.score(devVector, devLabel), 3)
    print("For C = ", c, "Logistic regression accuracy:", logitScore)



numWeights = 5


# Fit a logistic regression on training data
modelLogit = LogisticRegression(penalty='l2', C=.1)

modelLogit.fit(trainVector, trainLabel)

indexArray = np.zeros(numWeights)
sortIndex = np.argsort(modelLogit.coef_)
iLen = len(sortIndex[0])
print("\nTop 5 Weighted Features:")

for index in range((iLen - numWeights) , iLen):
    lookup = sortIndex[0][index]
    weight =  round(modelLogit.coef_[0][lookup], 3)
    print(vectorizer.get_feature_names()[sortIndex[0][index]], weight)


The size of the vocabulary for the training data is 6450

First 5 feature Names: ['1000words', '100movies365days', '100pushups', '100sets', '1558']
For C =  0.0001 Logistic regression accuracy: 0.74
For C =  0.001 Logistic regression accuracy: 0.74
For C =  0.01 Logistic regression accuracy: 0.74
For C =  0.1 Logistic regression accuracy: 0.753
For C =  0.5 Logistic regression accuracy: 0.742
For C =  1.0 Logistic regression accuracy: 0.734
For C =  2.0 Logistic regression accuracy: 0.727
For C =  6.0 Logistic regression accuracy: 0.715
For C =  10.0 Logistic regression accuracy: 0.704

Top 5 Weighted Features:
sopa 0.325
nfl 0.325
bioshock 0.339
offbeat 0.347
assistance 0.407


In [81]:
# vectorize the text
reqTText = allTData[['request_id','request_text']]
reqTText.set_index('request_id', inplace=True)
#print(reqTText[:5])
reqDText = allDData[['request_id','request_text']]
reqDText.set_index('request_id', inplace=True)

vectorizer = CountVectorizer(min_df=1)
reqTVector = vectorizer.fit_transform(reqTText['request_text'])
reqDVector = vectorizer.transform(reqDText['request_text'])

print ("\nThe size of the vocabulary for the training text data is", reqTVector.shape[1])
print("\nFirst 5 feature Names:", vectorizer.get_feature_names()[1:6])

# get the best regularization
regStrength = [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 6.0, 10.0]
catCoef = []

for c in regStrength:
    modelLogit = LogisticRegression(penalty='l2', C=c)
    modelLogit.fit(reqTVector, trainLabel)
    logitScore = round(modelLogit.score(reqDVector, devLabel), 4)
    print("For C = ", c, "Logistic regression accuracy:", logitScore)

numWeights = 25


# Fit a logistic regression on training data
modelLogit = LogisticRegression(penalty='l2', C=.0001)

modelLogit.fit(reqTVector, trainLabel)

indexArray = np.zeros(numWeights)
sortIndex = np.argsort(modelLogit.coef_)
iLen = len(sortIndex[0])
print("\nTop", numWeights, "Weighted Features:")

for index in range((iLen - numWeights) , iLen):
    lookup = sortIndex[0][index]
    weight =  round(modelLogit.coef_[0][lookup], 3)
    print(vectorizer.get_feature_names()[sortIndex[0][index]], weight)



The size of the vocabulary for the training text data is 10833

First 5 feature Names: ['000', '0000', '0011011001111000', '00pm', '012468']
For C =  0.0001 Logistic regression accuracy: 0.74
For C =  0.001 Logistic regression accuracy: 0.74
For C =  0.01 Logistic regression accuracy: 0.74
For C =  0.1 Logistic regression accuracy: 0.722
For C =  0.5 Logistic regression accuracy: 0.698
For C =  1.0 Logistic regression accuracy: 0.694
For C =  2.0 Logistic regression accuracy: 0.689
For C =  6.0 Logistic regression accuracy: 0.675
For C =  10.0 Logistic regression accuracy: 0.666

Top  25 Weighted Features:
paying 0.001
check 0.001
thursday 0.001
http 0.001
went 0.001
ve 0.001
thank 0.001
he 0.001
tough 0.001
kindness 0.001
back 0.001
exchange 0.001
bucks 0.001
paycheck 0.001
sunday 0.001
com 0.001
rice 0.001
bills 0.001
request 0.001
wife 0.001
her 0.001
imgur 0.001
jpg 0.002
edit 0.002
she 0.002


In [82]:
# vectorize the edited text
reqTText = allTData[['request_id','request_text_edit_aware']]
reqTText.set_index('request_id', inplace=True)
#print(reqTText[:5])
reqDText = allDData[['request_id','request_text_edit_aware']]
reqDText.set_index('request_id', inplace=True)

vectorizer = CountVectorizer(min_df=1)
reqTVector = vectorizer.fit_transform(reqTText['request_text_edit_aware'])
reqDVector = vectorizer.transform(reqDText['request_text_edit_aware'])

print ("\nThe size of the vocabulary for the training text data is", reqTVector.shape[1])
print("\nFirst 5 feature Names:", vectorizer.get_feature_names()[1:6])

# get the best regularization
regStrength = [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 6.0, 10.0]
catCoef = []

for c in regStrength:
    modelLogit = LogisticRegression(penalty='l2', C=c)
    modelLogit.fit(reqTVector, trainLabel)
    logitScore = round(modelLogit.score(reqDVector, devLabel), 4)
    print("For C = ", c, "Logistic regression accuracy:", logitScore)

numWeights = 25


# Fit a logistic regression on training data
modelLogit = LogisticRegression(penalty='l2', C=.01)

modelLogit.fit(reqTVector, trainLabel)

indexArray = np.zeros(numWeights)
sortIndex = np.argsort(modelLogit.coef_)
iLen = len(sortIndex[0])
print("\nTop", numWeights, "Weighted Features:")

for index in range((iLen - numWeights) , iLen):
    lookup = sortIndex[0][index]
    weight =  round(modelLogit.coef_[0][lookup], 3)
    print(vectorizer.get_feature_names()[sortIndex[0][index]], weight)


The size of the vocabulary for the training text data is 10569

First 5 feature Names: ['000', '0000', '0011011001111000', '00pm', '012468']
For C =  0.0001 Logistic regression accuracy: 0.74
For C =  0.001 Logistic regression accuracy: 0.74
For C =  0.01 Logistic regression accuracy: 0.741
For C =  0.1 Logistic regression accuracy: 0.723
For C =  0.5 Logistic regression accuracy: 0.702
For C =  1.0 Logistic regression accuracy: 0.686
For C =  2.0 Logistic regression accuracy: 0.684
For C =  6.0 Logistic regression accuracy: 0.671
For C =  10.0 Logistic regression accuracy: 0.662

Top 25 Weighted Features:
paying 0.001
date 0.001
father 0.001
almost 0.001
call 0.001
thursday 0.001
ve 0.001
again 0.001
check 0.001
went 0.001
last 0.001
exchange 0.001
tough 0.001
request 0.001
paycheck 0.001
back 0.001
bucks 0.001
jpg 0.001
sunday 0.001
he 0.001
rice 0.001
bills 0.001
wife 0.001
her 0.002
she 0.002


In [83]:
# vectorize the request title
reqTText = allTData[['request_id','request_title']]
reqTText.set_index('request_id', inplace=True)
#print(reqTText[:5])
reqDText = allDData[['request_id','request_title']]
reqDText.set_index('request_id', inplace=True)

vectorizer = CountVectorizer(min_df=1)
reqTVector = vectorizer.fit_transform(reqTText['request_title'])
reqDVector = vectorizer.transform(reqDText['request_title'])

print ("\nThe size of the vocabulary for the training text data is", reqTVector.shape[1])
print("\nFirst 5 feature Names:", vectorizer.get_feature_names()[1:6])

# get the best regularization
regStrength = [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 6.0, 10.0]
catCoef = []

for c in regStrength:
    modelLogit = LogisticRegression(penalty='l2', C=c)
    modelLogit.fit(reqTVector, trainLabel)
    logitScore = round(modelLogit.score(reqDVector, devLabel), 4)
    print("For C = ", c, "Logistic regression accuracy:", logitScore)

numWeights = 25


# Fit a logistic regression on training data
modelLogit = LogisticRegression(penalty='l2', C=.01)

modelLogit.fit(reqTVector, trainLabel)

indexArray = np.zeros(numWeights)
sortIndex = np.argsort(modelLogit.coef_)
iLen = len(sortIndex[0])
print("\nTop", numWeights, "Weighted Features:")

for index in range((iLen - numWeights) , iLen):
    lookup = sortIndex[0][index]
    weight =  round(modelLogit.coef_[0][lookup], 3)
    print(vectorizer.get_feature_names()[sortIndex[0][index]], weight)


The size of the vocabulary for the training text data is 3839

First 5 feature Names: ['00243364', '02', '08', '09', '10']
For C =  0.0001 Logistic regression accuracy: 0.74
For C =  0.001 Logistic regression accuracy: 0.74
For C =  0.01 Logistic regression accuracy: 0.74
For C =  0.1 Logistic regression accuracy: 0.735
For C =  0.5 Logistic regression accuracy: 0.718
For C =  1.0 Logistic regression accuracy: 0.711
For C =  2.0 Logistic regression accuracy: 0.692
For C =  6.0 Logistic regression accuracy: 0.672
For C =  10.0 Logistic regression accuracy: 0.659

Top 25 Weighted Features:
mother 0.039
ramen 0.04
days 0.04
kids 0.041
was 0.041
daughter 0.041
couple 0.042
rough 0.042
city 0.042
wife 0.043
texas 0.045
single 0.045
will 0.05
get 0.051
forward 0.051
help 0.053
having 0.057
friday 0.061
make 0.063
last 0.07
tonight 0.072
or 0.073
until 0.074
night 0.085
little 0.096
