In [24]:
## Import Libraries ##
from nltk.stem import *
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize


import json
from pprint import pprint
from pandas import *
from pandas.io.json import json_normalize
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *


In [25]:
## Get Data ##

#reference on data: https://www.kaggle.com/c/random-acts-of-pizza/data
# pull in the training and test data
with open('/Users/erikaananda/Documents/MIDS/W207/Final Project/data/train.json', encoding='utf-8') as data_file:
    trainData = json.loads(data_file.read())   
with open('/Users/erikaananda/Documents/MIDS/W207/Final Project/data/test.json', encoding='utf-8') as data_file:
    testData = json.loads(data_file.read())    

# create a dev data set 
devData = trainData[0:1000]
trainData = trainData[1000:]

# show how the data looks in its original format
#pprint("data in json format:")
#pprint(trainData[1])

# create a normalized view
allTrainData = json_normalize(trainData)
print("\nSize of the normalized Data:", allTrainData.shape)
#print("\nnormalized data columns:", list(allTData))

allDevData = json_normalize(devData)
allTestData = json_normalize(testData)


Size of the normalized Data: (3040, 32)


In [26]:
## Create subsets of data for analysis ###

# create a flat dataset without the subreddits list
flatData = allTrainData.drop('requester_subreddits_at_request', 1)
# create a separate dataset with just subreddits, indexed on request id
# we can creata a count vector on the words, run Naive Bayes against it, 
# and add the probabilities to our flat dataset
subredTData = allTrainData[['request_id','requester_subreddits_at_request']]
subredTData.set_index('request_id', inplace=True)

subredDData= allDevData[['request_id','requester_subreddits_at_request']]
subredDData.set_index('request_id', inplace=True)

# our training labels
trainLabel = allTrainData['requester_received_pizza']

devLabel = allDevData['requester_received_pizza']

# what do these look like?
print(subredTData.shape)


(3040, 1)


In [27]:
## Clean Data ##
def cleanData(dataSet, trimInd, tokenInd):
    stemmer = SnowballStemmer("english")
    returnData = []
        
    for line in dataSet:
        # if the line is already tokenized, stem the tokens, and join the line 
        # to continue data cleanup
        if tokenInd:
            line = ([stemmer.stem(word) for word in line])
            line = ' '.join(line)
        else:
        #tokenize so nltk stemmer can stem each word
            tokens = "".join([i for i in line]).split(" ")
            # stem and return string format
            line = ' '.join([stemmer.stem(token) for token in tokens])
        #lower case
        line = line.lower()
        # replace digits with the number 1
        line = re.sub('\d+', '1', line)
        #remove non alphanumeric text
        line = re.sub('\W', ' ', line)
        # remove underscores
        line = re.sub('_', '', line)
        # truncate words longer than 15 characters
        if trimInd:
            regex_long_words = re.compile(r"(\w{1,15})\w*")   
            listMatches = re.findall(regex_long_words, line)
            line = ' '.join(listMatches)
        
        
        returnData.append(line)

    return returnData



In [30]:
# Clean All the Text
pandas.options.mode.chained_assignment = None  # default='warn'
# clean data
cleanSubRedCol = cleanData(subredTData['requester_subreddits_at_request'], trimInd = False, tokenInd = True)
cleanTrainSubRed = subredTData
cleanTrainSubRed['requester_subreddits_at_request'] = cleanSubRedCol

cleanTrainReqText = cleanData(allTrainData['request_text'], trimInd = True, tokenInd = False)
cleanTrainReqTextEdit = cleanData(allTrainData['request_text_edit_aware'], trimInd = True, tokenInd = False)
cleanTrainReqTitle = cleanData(allTrainData['request_title'], trimInd = False, tokenInd = False)

allCleanTrainData = allTrainData[['request_id', 'request_text', 'request_text_edit_aware', 'request_title']]
allCleanTrainData['request_text'] = cleanTrainReqText
allCleanTrainData['request_text_edit_aware'] = cleanTrainReqTextEdit                                                                            
allCleanTrainData['request_title'] = cleanTrainReqTitle 

cleanSubRedCol = cleanData(subredDData['requester_subreddits_at_request'], trimInd = False, tokenInd = True)
cleanDevSubRed = subredDData
cleanDevSubRed['requester_subreddits_at_request'] = cleanSubRedCol

cleanDevReqText = cleanData(allDevData['request_text'], trimInd = True, tokenInd = False)
cleanDevReqTextEdit = cleanData(allDevData['request_text_edit_aware'], trimInd = True, tokenInd = False)
cleanDevReqTitle = cleanData(allDevData['request_title'], trimInd = False, tokenInd = False)

allCleanDevData = allDevData[['request_id', 'request_text', 'request_text_edit_aware', 'request_title']]
allCleanDevData['request_text'] = cleanDevReqText
allCleanDevData['request_text_edit_aware'] = cleanDevReqTextEdit                                                                            
allCleanDevData['request_title'] = cleanDevReqTitle 



In [33]:


trainCorpus = []
for index in range(len(cleanTrainSubRed)):
    a = ' '.join(cleanTrainSubRed['requester_subreddits_at_request'][index])
    b = (a, allCleanTrainData['request_text'][index], allCleanTrainData['request_text_edit_aware'][index],
         allCleanTrainData['request_title'][index])
    trainCorpus.append(' '.join(b))

devCorpus = []
for index in range(len(subredDData)):
    a = ' '.join(cleanDevSubRed['requester_subreddits_at_request'][index])
    b = (a, allCleanDevData['request_text'][index], allCleanDevData['request_text_edit_aware'][index],
         allCleanDevData['request_title'][index])
    devCorpus.append(' '.join(b))
    


vectorizer = CountVectorizer(min_df=1)
tVector = vectorizer.fit_transform(trainCorpus)
dVector = vectorizer.transform(devCorpus)

print ("\nThe size of the vocabulary for the training text data is", tVector.shape[1])
print("\nFirst 5 feature Names:", vectorizer.get_feature_names()[1:6], "\n")

# get the best regularization
regStrength = [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 6.0, 10.0]
catCoef = []

for c in regStrength:
    modelLogit = LogisticRegression(penalty='l1', C=c)
    modelLogit.fit(tVector, trainLabel)
    logitScore = round(modelLogit.score(dVector, devLabel), 4)
    print("For C = ", c, "Logistic regression accuracy:", logitScore)

modelLogit = LogisticRegression(penalty='l1', C=.001)
modelLogit.fit(tVector, trainLabel)

print(max(modelLogit.coef_[0]))
numWeights = 25

sortIndex = np.argsort(modelLogit.coef_)
iLen = len(sortIndex[0])
print("\nTop", numWeights, "Weighted Features:")

for index in range((iLen - numWeights) , iLen):
    lookup = sortIndex[0][index]
    weight =  round(modelLogit.coef_[0][lookup], 5)
    print(vectorizer.get_feature_names()[sortIndex[0][index]], weight)


The size of the vocabulary for the training text data is 10374

First 5 feature Names: ['11111n', '11e1cfb', '1a', '1a1', '1a1a1fcafd1'] 

For C =  0.0001 Logistic regression accuracy: 0.74
For C =  0.001 Logistic regression accuracy: 0.74
For C =  0.01 Logistic regression accuracy: 0.74
For C =  0.1 Logistic regression accuracy: 0.739
For C =  0.5 Logistic regression accuracy: 0.69
For C =  1.0 Logistic regression accuracy: 0.669
For C =  2.0 Logistic regression accuracy: 0.661
For C =  6.0 Logistic regression accuracy: 0.649
For C =  10.0 Logistic regression accuracy: 0.639
0.0

Top 25 Weighted Features:
followings 0.0
follows 0.0
fold 0.0
fluid 0.0
fluently 0.0
flu 0.0
flip 0.0
flippin 0.0
flkmm 0.0
flma 0.0
float 0.0
floating 0.0
flood 0.0
flooded 0.0
floor 0.0
floored 0.0
florida 0.0
floridian 0.0
flour 0.0
flourish 0.0
flow 0.0
flowing 0.0
floxacin 0.0
flight 0.0
ಠಠ 0.0
