### What's in a name?

The purpose of the notebook is to describe my efforts to predict whether or not a subreddit post will receive more than the average number of upvotes.

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler, Normalizer

from feature_extraction import Blob, Words



# Steps 1 - 4
1.  Reads in csv-formatted subreddit data
    1a. You can use the grab_posts function to collect posts inline
2.  Remove outliers by keeping only posts whose upvote z-score is between -2.5 and 2.5
3.  Create the label data (a boolean indicating whether or not a title received more than the average number of upvotes)
4.  Split the data into training and test sets

In [2]:
# Load the pre-processed subreddit posts from disk.
# To collect posts inline instead, use the project's grab_posts helper:
#from grab_posts import grab_posts
#data = grab_posts(sub='todayilearned', start='01/24/2017', end='01/24/2018', number=30000)
posts_raw = pd.read_csv('processed_datascience.csv.bz2')

# Tunable values for this cell, named so a reader can see and change them in one place.
Z_SCORE_CUTOFF = 2.5   # rows whose |z-score| of `ups` exceeds this are treated as outliers
TEST_FRACTION = 0.20   # share of rows held out for the test set
SPLIT_SEED = 25        # fixed seed so the train/test split is reproducible

# Throw out outliers: keep only rows whose absolute z-score of `ups` is <= the cutoff.
# .assign() and .copy() return new, independent frames, so the label column added
# below is not written onto a slice of another frame (avoids SettingWithCopyWarning).
ups_z = np.abs((posts_raw.ups - posts_raw.ups.mean()) / posts_raw.ups.std())
data = posts_raw.assign(z_scores=ups_z)
data = data[data['z_scores'] <= Z_SCORE_CUTOFF].copy()

# Label column: True when a post's upvotes exceed the mean of the outlier-filtered posts.
data['gtavg'] = data.ups > data.ups.mean()

# Features are the raw titles; the target is the boolean label created above.
train_X, test_X, train_y, test_y = train_test_split(
    data.title,
    data.gtavg,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

print('AVERAGE: ', data.ups.mean())

AVERAGE:  7.52031863311


# Step 5-6:  
The workflow pipeline consists of two segments. The first is feature extraction, decomposition, and reduction. The second is training a classifier.  A reminder: all feature information is derived from the title of each post.
5.  Feature Extraction
    
    1.  Words:
            Creates a dataframe containing the number of the following in each title:
                1. Consonants
                2. Vowels
                3. Words
                4. Characters
    2.  Blobs:
            Creates a dataframe containing the following characteristics of each title
                1. Noun Phrases
                2. Subjectivity - Sentiment Score
                3. Polarity - Sentiment Score
    3.  Tf-Idf:
            Vectorizes text.  Tf-idf creates a word vector in which each word is weighted by its occurrence not only in the title it was derived from but also across the entire corpus.  It works like this:
                1.  Use uni-grams to tri-grams, english stop_words, and sublinear term frequency
                2.  TruncatedSVD reduces features through LSA decomposition
                3.  SelectPercentile selects the top 10% of features that improve prediction

6.  Train Classifier:  MultinomialNB
    1.  I chose this classifier for its speed.  I examined other classifiers and tuned hyperparameters, but the results were pretty much the same.
    2.  Fit on training data
    3.  predict test set
    4.  Measure accuracy
    5.  Cross validate Score
    

In [3]:
# Title-text branch: tf-idf over uni- to tri-grams (English stop words, sublinear tf),
# reduced to 120 components with TruncatedSVD, rescaled with MinMaxScaler, then
# trimmed to the top 10% of components ranked by the ANOVA F-score (f_classif).
title_features = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True, stop_words='english')),
    # TruncatedSVD's default solver is randomized; pinning random_state makes the
    # decomposition, and therefore the reported accuracy, repeatable across runs.
    ('svd', TruncatedSVD(n_components=120, random_state=25)),
    # SVD components can be negative; MinMaxScaler maps the training range onto [0, 1],
    # which MultinomialNB needs because it rejects negative feature values when fitting.
    ('normalize', MinMaxScaler(copy=False)),
    ('selector', SelectPercentile(f_classif, percentile=10)),
])

# Blob branch: project-local Blob transformer followed by the same kind of rescaling.
# NOTE(review): Blob and Words come from the local feature_extraction module; their
# exact output columns are not visible here (see the markdown description above).
blob_features = Pipeline([
    ('all', Blob()),
    ('minmax', MinMaxScaler()),
])

# Full workflow: concatenate the three feature branches, then classify.
# Step names are unchanged so parameter keys such as 'union__title__svd__n_components'
# and 'clf__alpha' keep working.
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('words', Words()),
            ('title', title_features),
            ('blob', blob_features),
        ])),
    ('clf', MultinomialNB()),
])

pipeline.fit(train_X, train_y)

# Hold-out accuracy; left as the last expression so the notebook displays it.
y_pred = pipeline.predict(test_X)
accuracy_score(y_pred=y_pred, y_true=test_y)

#cross_val_score(pipeline, train_X, train_y, cv=5)

0.71148297749567224

In [6]:
# Spot-check the fitted pipeline on one hand-written title. predict() is given a
# one-element Series, the same container type used for training, and [0] unwraps
# the single prediction.
sample_title = "According to Glassdoor, Data Scientists are the #1 best job in the US 3 years in a row. Is this true?"
sample_prediction = pipeline.predict(pd.Series([sample_title]))
sample_prediction[0]

False

In [None]:
# Persist the fitted pipeline to disk so it can be reloaded later without retraining.
# This relies on `import joblib` in the imports cell; without that import the cell
# raises NameError on a fresh kernel.
joblib.dump(pipeline, 'datascience.pkl')

In [5]:
# Disabled grid-search experiment. The code is wrapped in a bare triple-quoted string so
# none of it runs; because the string is the cell's only expression, the notebook shows
# its repr as the cell output.
# NOTE(review): the first parameter key is 'clf_n_estimators' (single underscore) while
# the other keys use the 'clf__' prefix, so it looks like a typo.
# NOTE(review): these look like tree-ensemble hyperparameters, whereas the 'clf' step of
# the pipeline defined earlier is MultinomialNB. Confirm and fix both before re-enabling.
'''
from sklearn.model_selection import GridSearchCV
from time import time
from pprint import pprint

parameters = {
    'clf_n_estimators':(120,300,500,800,1200),
    'clf__max_depth':(5,8,15,25,30, None),
    'clf__min_samples_split':(2,5,10,15,100),
    'clf__min_samples_leaf':(1,2,5,10),
    'clf__max_features':('log2','sqrt',None),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_X, train_y)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


y = grid_search.predict(test_X)

accuracy_score(y_pred=y, y_true=test_y)
'''

'\nfrom sklearn.model_selection import GridSearchCV\nfrom time import time\nfrom pprint import pprint\n\nparameters = {\n    \'clf_n_estimators\':(120,300,500,800,1200),\n    \'clf__max_depth\':(5,8,15,25,30, None),\n    \'clf__min_samples_split\':(2,5,10,15,100),\n    \'clf__min_samples_leaf\':(1,2,5,10),\n    \'clf__max_features\':(\'log2\',\'sqrt\',None),\n}\n\ngrid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)\n\nprint("Performing grid search...")\nprint("pipeline:", [name for name, _ in pipeline.steps])\nprint("parameters:")\npprint(parameters)\nt0 = time()\ngrid_search.fit(train_X, train_y)\nprint("done in %0.3fs" % (time() - t0))\nprint()\n\nprint("Best score: %0.3f" % grid_search.best_score_)\nprint("Best parameters set:")\nbest_parameters = grid_search.best_estimator_.get_params()\nfor param_name in sorted(parameters.keys()):\n    print("\t%s: %r" % (param_name, best_parameters[param_name]))\n\n\ny = grid_search.predict(test_X)\n\naccuracy_score(y_pred=y, y