## What is this file?

This file is a storehouse for our work that did not make it into our final submitted model. We approached the Random Acts of Pizza challenge from many angles and you'll find several of those approaches below. We hope it will provide additional context around how we thought through the challenge and some of the things we learned through this process.

-------------------------------------------------------

In [1]:
## Import Libraries ##

import json
from pprint import pprint
from pandas import *
from pandas.io.json import json_normalize
from vaderSentiment import vaderSentiment

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import mlxtend
import scipy

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import recall_score

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.metrics import roc_curve, roc_auc_score

# SK-learn libraries for pre/processing data
from sklearn import preprocessing
from sklearn.decomposition import LatentDirichletAllocation as LDA

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

# Miscellaneous libraries
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import datetime as dt



In [2]:
## Get Data ##

# Data source: https://www.kaggle.com/c/random-acts-of-pizza/data
# Parse the training and test JSON files directly from their file handles.
with open('data/train.json', encoding='utf-8') as data_file:
    trainData = json.load(data_file)

with open('data/test.json', encoding='utf-8') as data_file:
    testData = json.load(data_file)

# Hold out the first 1000 training records as the dev set;
# the remaining records stay as the training set.
devData, trainData = trainData[:1000], trainData[1000:]

# To inspect a raw record before it is flattened: pprint(trainData[1])

# Flatten the training records into a DataFrame and report its shape and columns.
allTData = json_normalize(trainData)
print("\nSize of the normalized Data:", allTData.shape)
print("\nnormalized data columns:", list(allTData))

# Same flattening for the dev records.
allDData = json_normalize(devData)


Size of the normalized Data: (3040, 32)

normalized data columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request

### Section 1: Setting Up & Processing Data

In [3]:
## Import Libraries ##
import json
from pprint import pprint
from pandas import *
from pandas.io.json import json_normalize
from vaderSentiment import vaderSentiment

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import recall_score

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [15]:
# Useful functions for analysis
def roc_curve1(y_true, y_pred_prob):
    """Draw the ROC curve for a set of predictions.

    Inputs: y_true, the correct labels
            y_pred_prob, the predicted probabilities
    Output: none; the figure is displayed with matplotlib
    """
    # roc_curve also returns the thresholds, which the plot does not need.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred_prob)

    fig, ax = plt.subplots()
    ax.plot(false_pos_rate, true_pos_rate)
    ax.set_xlabel("False positive rate")
    ax.set_ylabel("True positive rate")
    ax.set_title("ROC Curve")
    plt.show()
    

def score_rep(y_true, y_pred, desc, y_score=None):
    """Print a comprehensive report for a classification test.

    Inputs: y_true, correct label
            y_pred, predicted label from model
            desc, description of model
            y_score, optional continuous scores for the positive class
                     (e.g. model.predict_proba(X)[:, 1]); when given, the
                     ROC AUC is computed from these instead of y_pred
    Output: none; the description, accuracy, ROC AUC and the
            classification report are printed
    """
    # ROC AUC measures how well scores rank positives above negatives, so it
    # should be fed probabilities/decision values. Hard labels reduce the curve
    # to a single threshold. Fall back to y_pred when no scores are supplied so
    # existing three-argument calls keep printing exactly what they did before.
    auc_input = y_pred if y_score is None else y_score

    print(desc)
    print("-"*75)
    print("Accuracy: ", metrics.accuracy_score(y_true, y_pred))
    print("Area under curve of ROC: ", metrics.roc_auc_score(y_true, auc_input))
    print("Classification report:\n")
    print(metrics.classification_report(y_true, y_pred))
    print("-"*75)
    


### vaderSentiment Analysis

In [6]:
# Quick learning exercise to figure out how
# to get vaderSentiment to work

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()
test = "It was one of the worst movies I've seen, despite good reviews."

def print_sentiment_scores(sentence):
    """Print the VADER polarity scores for a sentence.

    Inputs: sentence, the text to score
    Output: none; prints the sentence (right-padded with '-' to 40 characters)
            followed by the full scores dict, then the compound score on its
            own line
    """
    snt = analyser.polarity_scores(sentence)
    print("{:-<40} {}".format(sentence, str(snt)))
    print(snt['compound'])

# Score the sample sentence defined above instead of repeating the literal.
print_sentiment_scores(test)

sentences = ["VADER is smart, handsome, and funny.", "VADER is silly, ugly, and rude!"]

for sentence in sentences:
    print("\n")
    print(sentence)
    # The helper only prints and returns nothing, so its result is not stored.
    print_sentiment_scores(sentence)


It was one of the worst movies I've seen, despite good reviews. {'neg': 0.394, 'neu': 0.606, 'pos': 0.0, 'compound': -0.7584}
-0.7584


VADER is smart, handsome, and funny.
VADER is smart, handsome, and funny.---- {'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8316}
0.8316


VADER is silly, ugly, and rude!
VADER is silly, ugly, and rude!--------- {'neg': 0.617, 'neu': 0.281, 'pos': 0.103, 'compound': -0.7574}
-0.7574


In [5]:
# Setting up for Titles

# Pair each request title with its outcome, under shorter column names.
title = allTData[['request_title', 'requester_received_pizza']].rename(
    columns={'request_title': 'Title', 'requester_received_pizza': 'Got Pizza'}
)
# Group the titles by outcome.
pizza_title = title.groupby(['Got Pizza'])

print("\n")
print("Titles and Pizza Success\n")
print(title.head(10))
print("\n")





Titles and Pizza Success

                                               Title Got Pizza
0  [Request] Just got dumped, no food in the free...     False
1  [Request]  Saint Augustine, US.  Boyfriend and...     False
2        [Request] I'd love a Buffalo Chicken Puzza!      True
3  [REQUEST]- I start class next week and i am st...     False
4        [Request] Pizza for finals in Northern Iowa      True
5      [request]lovepark,il. Preggers and very sad..     False
6  [Request] My friend is letting me stay with hi...     False
7  [Request] Painting our apartment today, would ...     False
8  [REQUEST] Pennsylvania, USA living off PB&amp;...     False
9          [Request] UK - Broke Student Exam Special      True




In [7]:
import pandas as pd

def print_sentiment_scores(sentence):
    """Return the VADER compound score for a piece of text.

    Inputs: sentence, the text to score
    Output: the 'compound' entry of the analyser's polarity scores

    The name is kept from the earlier exploratory cell; this version prints
    nothing and only returns the score.
    """
    snt = analyser.polarity_scores(sentence)
    return snt['compound']

titles = allTData['request_title']

# Work on an explicit copy so that adding a column below never writes into
# (or warns about) a slice of allTData.
df = allTData[['request_title', 'requester_received_pizza']].copy()

# One compound score per training title. A comprehension keeps the loop
# variable local, so it does not overwrite any notebook-level name.
scores = [print_sentiment_scores(request_title) for request_title in titles]

df["Vader Scores"] = scores

# Only the label and the score are needed from here on.
df = df.drop('request_title', axis = 1)

print(df)

     requester_received_pizza  Vader Scores
0                       False       -0.5994
1                       False        0.2263
2                        True        0.6696
3                       False        0.2960
4                        True        0.0000
5                       False        0.0000
6                       False        0.4588
7                       False        0.6369
8                       False        0.0000
9                        True       -0.0258
10                      False        0.5106
11                      False       -0.3400
12                      False       -0.7650
13                      False        0.0000
14                      False        0.0000
15                      False       -0.4926
16                       True        0.0000
17                      False        0.5562
18                      False       -0.4998
19                      False        0.5859
20                       True        0.6757
21                      False   

In [8]:
import pandas as pd

def print_sentiment_scores(sentence):
    """Return the VADER compound score for a piece of text.

    Inputs: sentence, the text to score
    Output: the 'compound' entry of the analyser's polarity scores

    The name is kept from the earlier exploratory cell; this version prints
    nothing and only returns the score.
    """
    snt = analyser.polarity_scores(sentence)
    return snt['compound']

# NOTE: from this point on `titles` holds the dev-set titles.
titles = allDData['request_title']

# Work on an explicit copy so that adding a column below never writes into
# (or warns about) a slice of allDData.
df_d = allDData[['request_title', 'requester_received_pizza']].copy()

# One compound score per dev title. A comprehension keeps the loop variable
# local, so it does not overwrite any notebook-level name.
scores_d = [print_sentiment_scores(request_title) for request_title in titles]

df_d["Vader Scores"] = scores_d

# Only the label and the score are needed from here on.
df_d = df_d.drop('request_title', axis = 1)

print(df_d)



    requester_received_pizza  Vader Scores
0                      False        0.6124
1                      False       -0.2960
2                      False        0.6696
3                      False        0.0000
4                      False        0.8455
5                       True        0.0000
6                      False        0.4019
7                      False        0.0000
8                      False       -0.3862
9                       True        0.0000
10                      True        0.4939
11                     False        0.6369
12                     False        0.7430
13                     False       -0.4215
14                     False        0.0000
15                     False        0.0000
16                      True       -0.0772
17                     False       -0.4215
18                      True        0.1280
19                     False       -0.2500
20                     False        0.7959
21                     False       -0.7269
22         

In [16]:
import pandas as pd

def _compound_scores(texts):
    """Return a one-column DataFrame of VADER compound scores, one row per text."""
    return pd.DataFrame([analyser.polarity_scores(text)['compound'] for text in texts])

tTitles = allTData['request_title']
dTitles = allDData['request_title']

# The single feature for this model: the compound sentiment of each title.
titleTSentiment = _compound_scores(tTitles)
titleDSentiment = _compound_scores(dTitles)

C = 100
modelLogit = LogisticRegression(penalty = 'l2', C = C)

trainLabel = allTData['requester_received_pizza']
devLabel = allDData['requester_received_pizza']

modelLogit.fit(titleTSentiment,trainLabel)
# Build the report label from C so it always matches the fitted model
# (the label used to be hard-coded to a different value).
score_rep(devLabel,modelLogit.predict(titleDSentiment),'Logistic Regression, C = {}'.format(C))


Logistic Regression, C = 0.001
---------------------------------------------------------------------------
Accuracy:  0.74
Area under curve of ROC:  0.5
Classification report:

             precision    recall  f1-score   support

      False       0.74      1.00      0.85       740
       True       0.00      0.00      0.00       260

avg / total       0.55      0.74      0.63      1000

---------------------------------------------------------------------------


  'precision', 'predicted', average, warn_for)
