In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from feature_extraction import Blob, Words
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from time import time
from pprint import pprint
from gp_test import main

In [2]:
# Alternative data sources, kept for reference:
#data = pd.read_csv('processed_python.csv.bz2', encoding='ISO-8859-1', compression='bz2')
#data = main(sub='todayilearned', start='01/24/2017', end='01/24/2018', number=30000)

# Load the posts, coerce every title to str, then drop rows whose title
# repeats an earlier one.
data = pd.read_csv('processed_posts.csv.bz2')
data['title'] = data['title'].astype('str')
data = data.drop_duplicates('title')

In [3]:
# data = pd.read_csv('processed_todayilearned.csv.bz2', encoding='ISO-8859-1')

# Keep only posts whose upvote count lies within one standard deviation of
# the mean upvote count.
# NOTE(review): mean/std are computed over every subreddit still present in
# `data`, i.e. before the subreddit filter below -- confirm this order is
# intended.
within_one_std = np.abs(data.ups - data.ups.mean()) <= data.ups.std()
data = data[within_one_std]

# Restrict to r/todayilearned. The explicit .copy() makes `data` an
# independent frame, so adding the target column below does not raise
# pandas' SettingWithCopyWarning.
data = data[data['subreddit'] == 'r/todayilearned'].copy()

# Binary target: True when a post has more upvotes than the mean of the
# remaining posts.
data['gtavg'] = data['ups'] > data.ups.mean()

# 80/20 hold-out split on (title, target) with a fixed seed.
# NOTE(review): the split is not stratified; if the target is imbalanced,
# consider passing stratify=data.gtavg.
train_X, test_X, train_y, test_y = train_test_split(data.title,
                                                    data.gtavg,
                                                    test_size=0.20,
                                                    random_state=42)

In [4]:
# Class balance of the target column: percentage of rows labelled False / True.
a = int((data['gtavg'] == False).sum())
b = int((data['gtavg'] == True).sum())
print('False: ', a / (a + b) * 100)
print('True: ', b / (a + b) * 100)

False:  81.20558041589892
True:  18.794419584101078


In [5]:
# Feature extraction + classification pipeline. Three feature branches are
# computed from the input and concatenated by FeatureUnion before being
# handed to the classifier.
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[

            # Project-local transformer imported from feature_extraction.
            ('words', Words()),

            # TF-IDF over 1- to 3-grams -> 120 SVD components -> rescale ->
            # keep the top 10% of components by ANOVA F-score.
            ('title', Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,3), sublinear_tf=True, stop_words='english')),
                # random_state pins the randomized SVD solver so that refits
                # (and cross-validation folds) are reproducible.
                ('svd', TruncatedSVD(n_components=120, random_state=42)),
                # SVD components can be negative; rescaling keeps the fitted
                # features non-negative, which MultinomialNB requires.
                ('normalize', MinMaxScaler(copy=False)),
                ('selector', SelectPercentile(f_classif, percentile=10))
            ])),

            # Project-local transformer from feature_extraction, rescaled.
            ('blob', Pipeline([
                ('all', Blob()),
                ('minmax', MinMaxScaler()),
            ])),

            ])),
    ('clf', MultinomialNB()),
        ])

In [6]:
# Fit the whole feature-union + classifier pipeline on the training split.
pipeline.fit(X=train_X, y=train_y)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('words', Words()), ('title', Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=T...    transformer_weights=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [7]:
# Predict the held-out titles and report plain accuracy against the true labels.
y = pipeline.predict(test_X)
accuracy_score(test_y, y)

0.81973684210526321

In [8]:
# 5-fold cross-validated score of the pipeline on the training split.
cross_val_score(estimator=pipeline, X=train_X, y=train_y, cv=5)

array([ 0.80952381,  0.80952381,  0.81054366,  0.81054366,  0.81054366])

In [9]:
# Spot-check one held-out example: its title, true label and predicted label,
# one per line.
index = 2
print(test_X.iloc[index], test_y.iloc[index], y[index], sep='\n')
# Ad-hoc prediction for a single hand-written title.
pipeline.predict(pd.Series(['Inspiration']))[0]

 that other than sugar and water, the main ingredient in Mountain Dew is orange juice.
False
False


False

In [10]:

# Disabled grid search, kept inside a string literal so it is never executed.
# Pipeline parameters are addressed as '<step>__<param>' (double underscore);
# the first key previously used a single underscore and would have been
# rejected by GridSearchCV.
# NOTE(review): the grid lists tree-ensemble hyper-parameters (n_estimators,
# max_depth, ...), while the pipeline's 'clf' step is MultinomialNB -- swap
# the classifier before re-enabling this cell.
'''
parameters = {
    'clf__n_estimators':(120,300,500,800,1200),
    'clf__max_depth':(5,8,15,25,30, None),
    'clf__min_samples_split':(2,5,10,15,100),
    'clf__min_samples_leaf':(1,2,5,10),
    'clf__max_features':('log2','sqrt',None),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_X, train_y)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


y = grid_search.predict(test_X)

accuracy_score(y_pred=y, y_true=test_y)
'''

'\nparameters = {\n    \'clf_n_estimators\':(120,300,500,800,1200),\n    \'clf__max_depth\':(5,8,15,25,30, None),\n    \'clf__min_samples_split\':(2,5,10,15,100),\n    \'clf__min_samples_leaf\':(1,2,5,10),\n    \'clf__max_features\':(\'log2\',\'sqrt\',None),\n}\n\ngrid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)\n\nprint("Performing grid search...")\nprint("pipeline:", [name for name, _ in pipeline.steps])\nprint("parameters:")\npprint(parameters)\nt0 = time()\ngrid_search.fit(train_X, train_y)\nprint("done in %0.3fs" % (time() - t0))\nprint()\n\nprint("Best score: %0.3f" % grid_search.best_score_)\nprint("Best parameters set:")\nbest_parameters = grid_search.best_estimator_.get_params()\nfor param_name in sorted(parameters.keys()):\n    print("\t%s: %r" % (param_name, best_parameters[param_name]))\n\n\ny = grid_search.predict(test_X)\n\naccuracy_score(y_pred=y, y_true=test_y)\n'