In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from feature_extraction import Blob, Words
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from time import time
from pprint import pprint

In [2]:
#data = pd.read_csv('processed_posts.csv.bz2')
data = pd.read_csv('processed_datascience.csv.bz2', encoding='ISO-8859-1', compression='bz2')
data.drop_duplicates('title', inplace=True)

data['gt5'] = data['ups'] > 5
data['gt20'] = data['ups'] > 20
data['gt10'] = data['ups'] > 10
data['gt50'] = data['ups'] > 50
data['gt100'] = data['ups'] > 100
# data['wordcount'] = data['title'].str.split().apply(len)
# data['charcount'] = data.title.str.len()
# data['vowels'] = data.title.str.findall(r'[aeiou]').apply(len)
# data['consonants'] = data.title.str.findall(r'[^aeiou]').apply(len)

#data = data[data['subreddit'] == 'r/books']
train_X, test_X, train_y, test_y = train_test_split(data.title, 
                                                    data.gt10, 
                                                    test_size=0.20,
                                                    random_state=42)

In [9]:
print(data[data['gt10']==False].shape)
print(data[data['gt10']==True].shape)

(6205, 11)
(1898, 11)


In [10]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
                      
            ('words', Words()),
                      
            ('title', Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,3), sublinear_tf=True, stop_words='english')),
                ('svd', TruncatedSVD(n_components=120)),
                ('normalize', MinMaxScaler(copy=False)),
                ('selector', SelectPercentile(f_classif, percentile=10))
            ])),
            
            ('blob', Pipeline([
                ('all', Blob()),
                ('minmax', MinMaxScaler()),
            ])),
            
            ])),
    ('clf', ExtraTreesClassifier()),
        ])

In [15]:
parameters = {
    'clf_n_estimators':(120,300,500,800,1200),
    'clf__max_depth':(5,8,15,25,30, None),
    'clf__min_samples_split':(2,5,10,15,100),
    'clf__min_samples_leaf':(1,2,5,10),
    'clf__max_features':('log2','sqrt',None),
}

In [16]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_X, train_y)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['union', 'clf']
parameters:
{'clf__max_depth': (5, 8, 15, 25, 30, None),
 'clf__max_features': ('log2', 'sqrt', None),
 'clf__min_samples_leaf': (1, 2, 5, 10),
 'clf__min_samples_split': (2, 5, 10, 15, 100)}
Fitting 3 folds for each of 360 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 18.9min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 45.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 82.0min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed: 111.9min finished


done in 6728.687s

Best score: 0.763
Best parameters set:
	clf__max_depth: 5
	clf__max_features: None
	clf__min_samples_leaf: 5
	clf__min_samples_split: 10


In [22]:
y = grid_search.predict(test_X)

In [23]:
accuracy_score(y_pred=y, y_true=test_y)

0.78223318938926589

In [10]:
pipeline.fit(train_X, train_y)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('words', Words()), ('title', Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=T...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))])

In [11]:
y = pipeline.predict(test_X)
accuracy_score(y_pred=y, y_true=test_y)

0.76681061073411472

In [None]:
cross_val_score(pipeline, train_X, train_y, cv=5)

In [62]:
index = 2
print (test_X.iloc[index])
print (test_y.iloc[index])
print (y[index])
# pipeline.predict(pd.Series(['My Job Search as a PhD Student']))[0]

b'We shall miss Hans Rosling, but let\xe2\x80\x99s learn from his genius'
True
False


In [None]:
data.to_csv('gt_politics.csv', index=False)

In [None]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
                      
            ('wordcount', WordCount()),
            
            ('charcount', CharCount()),
            
            ('title', TfidfVectorizer(ngram_range=(1,4))),
            
            ('vowel', Vowels()),
            
            ('consonants', Consonants()),
            
            ('polarity', Polarity()),
            
            ('subjectivity', Subjectivity()),
            
            ('noun_phrases', Nouns()),
            
            ])),
    ('clf', ExtraTreesClassifier()),
        ])