##### Code for preliminary submission 

By:

David, Jayashree, and Nikki

w207, final project

In [10]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import nltk
import json
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit
# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *
import sys
from subprocess import check_output
print(sys.version)

#make the depreciation warnings go away
import warnings
#I'm tired of the warnings on functions the professor asks us to use :) 
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=RuntimeWarning)

3.5.2 |Anaconda 4.1.1 (x86_64)| (default, Jul  2 2016, 17:52:12) 
[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]


In [2]:

### Random acts of Pizza
### from https://www.reddit.com/r/Random_Acts_Of_Pizza/
### Purpose of Random Acts of Pizza:
###
###The Original Random Pizza Delivery Service
### We are open for active Redditors that have not engaged in questionable online 
### behavior (relax, we don't know about all of it, you're probably fine). See The 
### Pizza Library for everything you need to know to get started.
### https://www.reddit.com/r/Random_Acts_Of_Pizza/wiki/index

In [11]:
### get the training data
a = pd.read_json('train.json', orient='columns')


In [12]:
#use the edited text, as it is used in the test data and the regular request_text is not
#add in request_title, as it is present in both
data_set = a[['request_text_edit_aware','request_title']]

data_labels = a['requester_received_pizza']

sss = StratifiedShuffleSplit(test_size=0.25, random_state=1)
for train_index, test_index in sss.split(data_set, data_labels):
    X_train, X_test = data_set.iloc[train_index], data_set.iloc[test_index]
    y_train, y_test = data_labels.iloc[train_index], data_labels.iloc[test_index]

In [13]:
#Method for preliminary test:
#try the Pipeline implementation from sk learn
#from http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion


class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to scikit-learn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{'length': len(text),
                 'num_sentences': text.count('.')}
                for text in posts]


class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
    """Extract the subject & body from a usenet post in a single pass.

    Takes a sequence of strings and produces a dict of sequences.  Keys are
    `subject` and `body`.
    """
    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        features = np.recarray(shape=(len(posts),),
                               dtype=[('subject', object), ('body', object)])
        for i, text in enumerate(posts):
            headers, _, bod = text.partition('\n\n')
            bod = strip_newsgroup_footer(bod)
            bod = strip_newsgroup_quoting(bod)
            features['body'][i] = bod

            prefix = 'Subject:'
            sub = ''
            for line in headers.split('\n'):
                if line.startswith(prefix):
                    sub = line[len(prefix):]
                    break
            features['subject'][i] = sub

        return features


pipeline = Pipeline([
    # Extract the subject & body
    #('subjectbody', SubjectBodyExtractor()),

    # Use FeatureUnion to combine the features from subject and body
    ('union', FeatureUnion(
        transformer_list=[

            # Pipeline for pulling features from the post's subject line
            ('get-title', Pipeline([
                ('selector', ItemSelector(key='request_title')),
                ('cv', CountVectorizer(analyzer='char_wb', 
                                       #vocabulary=vocabulary,
                                       max_df=0.5, ngram_range=(1,3)))
            ])),

            # Pipeline for standard bag-of-words model for body
            ('get-request', Pipeline([
                ('selector', ItemSelector(key='request_text_edit_aware')),
                ('cv', CountVectorizer(analyzer='char_wb', 
                                       #vocabulary=vocabulary,
                                       max_df=0.5, ngram_range=(1,3)))
            ])),
            
           #('get-number', Pipeline([
           #     ('selector', ItemSelector(key='number'))
           # ]))

        ]
    )),

    # Use a SVC classifier on the combined features
    ('nb',LogisticRegression(penalty='l2',C=100)),
])



In [17]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
target_names = ['Got pizza', 'No pizza']
print(classification_report(y_test, y_pred, target_names=target_names))

             precision    recall  f1-score   support

  Got pizza       0.77      0.79      0.78       762
   No pizza       0.31      0.29      0.30       248

avg / total       0.66      0.67      0.66      1010



In [18]:
#run against the test data
test = pd.read_json('test.json')
x_fin_test = test[['request_text_edit_aware', 'request_title']]
pred1 = pipeline.predict(x_fin_test)

In [19]:
#Prep for submission
def inter(x):
    return int(x)
vint = np.vectorize(inter)
pred2 = vint(pred1)
fin_df = pd.concat([test['request_id'],pd.DataFrame(pred2, columns=['requester_received_pizza'])],axis =1)

In [20]:
fin_df.head()

Unnamed: 0,request_id,requester_received_pizza
0,t3_i8iy4,0
1,t3_1mfqi0,0
2,t3_lclka,1
3,t3_1jdgdj,0
4,t3_t2qt4,0


In [None]:
fin_df.to_csv('pizza_submission.csv', sep = ',', header=True, index=False)