In [33]:
## Import Libraries ##

import json
from pprint import pprint
from pandas import *
from pandas.io.json import json_normalize


# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *


In [36]:
## Get Data ##

#reference on data: https://www.kaggle.com/c/random-acts-of-pizza/data
# pull in the training and test data
with open('/Users/erikaananda/Documents/MIDS/W207/Final Project/data/train.json', encoding='utf-8') as data_file:
    trainData = json.loads(data_file.read())   
with open('/Users/erikaananda/Documents/MIDS/W207/Final Project/data/test.json', encoding='utf-8') as data_file:
    testData = json.loads(data_file.read())    

# create a dev data set 
devData = trainData[0:1000]
trainData = trainData[1000:]

# show how the data looks in its original format
#pprint("data in json format:")
#pprint(trainData[1])

# create a normalized view
allTData = json_normalize(trainData)
print("\nSize of the normalized Data:", allTData.shape)
#print("\nnormalized data columns:", list(allTData))

allDData = json_normalize(devData)



Size of the normalized Data: (3040, 32)


In [39]:
## Create subsets of data for analysis ###

# create a flat dataset without the subreddits list
flatData = allTData.drop('requester_subreddits_at_request', 1)
# create a separate dataset with just subreddits, indexed on request id
# we can creata a count vector on the words, run Naive Bayes against it, 
# and add the probabilities to our flat dataset
subredTData = allTData[['request_id','requester_subreddits_at_request']]
subredTData.set_index('request_id', inplace=True)

subredDData= allDData[['request_id','requester_subreddits_at_request']]
subredDData.set_index('request_id', inplace=True)

# our training labels
trainLabel = allTData['requester_received_pizza']

devLabel = allDData['requester_received_pizza']

# what do these look like?
#print(list(flatData))
print(subredTData.shape)
#print(subredTData['requester_subreddits_at_request'][1])

# create a corpus of subreddits to vectorize
trainCorpus = []
for index in range(len(subredTData)):
    trainCorpus.append(' '.join(subredTData['requester_subreddits_at_request'][index]))

devCorpus = []
for index in range(len(subredDData)):
    devCorpus.append(' '.join(subredDData['requester_subreddits_at_request'][index]))
    


(3040, 1)


In [32]:
# combine all text sources into a single corpus
fldTText = allTData[['request_id','request_text', 'request_text_edit_aware', 'request_title']]
fldDText = allDData[['request_id','request_text', 'request_text_edit_aware', 'request_title']]

trainCorpus = []
for index in range(len(subredTData)):
    a = ' '.join(subredTData['requester_subreddits_at_request'][index])
    b = (a, fldTText['request_text'][index], fldTText['request_text_edit_aware'][index],
        fldTText['request_title'][index])
    trainCorpus.append(' '.join(b))

devCorpus = []
for index in range(len(subredDData)):
    a = ' '.join(subredDData['requester_subreddits_at_request'][index])
    b = (a, fldDText['request_text'][index], fldDText['request_text_edit_aware'][index],
         fldDText['request_title'][index])
    devCorpus.append(' '.join(b))


    
vectorizer = CountVectorizer(min_df=1)
tVector = vectorizer.fit_transform(trainCorpus)
dVector = vectorizer.transform(devCorpus)

print ("\nThe size of the vocabulary for the training text data is", tVector.shape[1])
print("\nFirst 5 feature Names:", vectorizer.get_feature_names()[1:6], "\n")

# get the best regularization
regStrength = [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 6.0, 10.0]
catCoef = []

for c in regStrength:
    modelLogit = LogisticRegression(penalty='l2', C=c)
    modelLogit.fit(tVector, trainLabel)
    logitScore = round(modelLogit.score(dVector, devLabel), 4)
    print("For C = ", c, "Logistic regression accuracy:", logitScore)

modelLogit = LogisticRegression(penalty='l2', C=.001)
modelLogit.fit(tVector, trainLabel)

print(max(modelLogit.coef_[0]))
numWeights = 25

sortIndex = np.argsort(modelLogit.coef_)
iLen = len(sortIndex[0])
print("\nTop", numWeights, "Weighted Features:")

for index in range((iLen - numWeights) , iLen):
    lookup = sortIndex[0][index]
    weight =  round(modelLogit.coef_[0][lookup], 5)
    print(vectorizer.get_feature_names()[sortIndex[0][index]], weight)


The size of the vocabulary for the training text data is 17213

First 5 feature Names: ['000', '0000', '0011011001111000', '00243364', '00pm'] 

For C =  0.0001 Logistic regression accuracy: 0.74
For C =  0.001 Logistic regression accuracy: 0.741
For C =  0.01 Logistic regression accuracy: 0.736
For C =  0.1 Logistic regression accuracy: 0.703
For C =  0.5 Logistic regression accuracy: 0.689
For C =  1.0 Logistic regression accuracy: 0.689
For C =  2.0 Logistic regression accuracy: 0.683
For C =  6.0 Logistic regression accuracy: 0.674
For C =  10.0 Logistic regression accuracy: 0.671
0.0381825219695

Top 25 Weighted Features:
her 0.0206
currently 0.02085
assistance 0.02103
imgur 0.0211
back 0.02125
bills 0.02141
sunday 0.02275
ask 0.02278
well 0.02286
people 0.02291
until 0.02301
paycheck 0.02332
jpg 0.02429
rice 0.02451
com 0.02458
forward 0.02499
about 0.02562
check 0.02601
ve 0.0274
days 0.02786
from 0.0286
thank 0.03243
wife 0.03335
edit 0.03378
last 0.03818
