In [1]:
from pymongo import MongoClient 
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np

# Open the MongoDB connection used by the data-loading cells.
# MongoClient() connects lazily and does not raise when no server is reachable,
# so a cheap "ping" command is issued to make the success message truthful.
try:
    connection = MongoClient()
    connection.admin.command("ping")
    print("Connected successfully!!!")
except Exception:
    # Report and re-raise: swallowing the error here would only defer the
    # failure to a later, less obvious point (the collection lookups below).
    print("Could not connect to MongoDB")
    raise

# Handles to the database and to the train/test collections.
database = connection.flair_database
coll_train = database.training_data4
coll_test = database.testing_data4

Connected successfully!!!


In [2]:
# Materialise every document of the training collection as one DataFrame row.
training_data = pd.DataFrame([document for document in coll_train.find()])

In [3]:
# Materialise every document of the testing collection as one DataFrame row.
testing_data = pd.DataFrame([document for document in coll_test.find()])

In [4]:
# Column names kept from the train/test frames and passed to the models as the
# full feature set. Order matters: it is the column order of the stacked matrix.
final_features = [
    'author',
    'comments',
    'is_original_content',
    'is_reddit_media_domain',
    'is_video',
    'over_18',
    'permalink',
    'secure_media',
    'selftext',
    'send_replies',
    'title',
    'url',
    'edited',
    'num_comments',
    'num_duplicates',
    'subreddit_subscribers',
    'ups',
    'upvote_ratio',
]

In [5]:
# Target labels: the 'link_flair_text' column of each frame. They are read here,
# while the frames still carry that column (it is not part of final_features).
y_train = training_data['link_flair_text']
y_test = testing_data['link_flair_text']

In [6]:
# Restrict both frames to the feature columns, in the order of final_features.
training_data = training_data.loc[:, final_features]
testing_data = testing_data.loc[:, final_features]

In [10]:

def get_given_features_from_data(feature_list, data):
    """Stack the requested columns of ``data`` side by side into one array.

    Every column is converted with ``np.array``. A column that converts to a
    1-D array (one scalar per row) contributes a single output column; a column
    that converts to a 2-D array (one equal-length sequence per row)
    contributes one output column per sequence element.

    Parameters
    ----------
    feature_list : sequence of column labels
        Columns to extract, in output order. Must not be empty.
    data : object indexable by column label (e.g. a pandas DataFrame or dict)
        ``data[label]`` must be iterable row by row.

    Returns
    -------
    numpy.ndarray
        The per-column arrays concatenated along axis 1.

    Raises
    ------
    ValueError
        If ``feature_list`` is empty.
    """
    if len(feature_list) == 0:
        raise ValueError("feature_list must contain at least one feature")

    column_blocks = []
    for feature in feature_list:
        column = np.array(list(data[feature]))
        # Decide by dimensionality rather than by ``shape[0] == 1``: a single
        # row holding a vector is already (1, d) and must not be transposed.
        if column.ndim == 1:
            column = column.reshape(-1, 1)
        column_blocks.append(column)

    # One concatenation at the end avoids re-copying the growing matrix
    # once per feature.
    return np.concatenate(column_blocks, axis=1)

def get_train_test_for_feature_list(feauture_list,train,test):
    """Build the feature matrices of both splits for one feature subset.

    Applies ``get_given_features_from_data`` with the same feature list to
    ``train`` and then to ``test``.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix)``.
    """
    return (
        get_given_features_from_data(feauture_list, train),
        get_given_features_from_data(feauture_list, test),
    )
    

from sklearn.neural_network import MLPClassifier
def train_and_predict1(x_train,x_test,y_train,y_test):
    """Fit an MLP classifier on the training split and score it on the test split.

    Parameters
    ----------
    x_train, x_test : feature matrices for the two splits.
    y_train, y_test : matching label sequences.

    Returns
    -------
    tuple
        ``(accuracy, fitted_classifier)`` where ``accuracy`` is
        ``accuracy_score(y_test, predictions)``.
    """
    model = MLPClassifier(
        solver='lbfgs',
        alpha=1e-5,
        # One hidden layer with two units, written as a real one-element
        # tuple: ``(2)`` without the trailing comma is just the int 2.
        hidden_layer_sizes=(2,),
        random_state=1,
    )
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    return accuracy_score(y_test, predictions), model

from sklearn.ensemble import RandomForestClassifier
def train_and_predict2(x_train,x_test,y_train,y_test):
    """Fit a random forest on the training split and score it on the test split.

    Returns
    -------
    tuple
        ``(accuracy, fitted_classifier)`` where ``accuracy`` is
        ``accuracy_score(y_test, predictions)``.
    """
    forest = RandomForestClassifier(
        n_estimators=1000,
        max_depth=10,
        random_state=1,
    )
    forest = forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)
    score = accuracy_score(y_test, predictions)
    return score, forest

from sklearn.svm import LinearSVC
def train_and_predict3(x_train,x_test,y_train,y_test):
    """Fit a linear SVC on the training split and score it on the test split.

    Returns
    -------
    tuple
        ``(accuracy, fitted_classifier)`` where ``accuracy`` is
        ``accuracy_score(y_test, predictions)``.
    """
    svm = LinearSVC(random_state=1, tol=1e-5, C=0.1)
    svm = svm.fit(x_train, y_train)
    predictions = svm.predict(x_test)
    score = accuracy_score(y_test, predictions)
    return score, svm

def get_accuracy_for_features(feature_list,train,test,train_labels=None,test_labels=None):
    """Train the three classifiers on one feature subset and report accuracy.

    Prints the feature list, then the test accuracy of the MLP, the random
    forest and the linear SVC, followed by an empty line.

    Parameters
    ----------
    feature_list : sequence of column labels
        Features to build the matrices from.
    train, test : frames holding the feature columns of each split.
    train_labels, test_labels : optional label sequences
        Labels for the two splits. When omitted, the notebook-level
        ``y_train`` / ``y_test`` are used, which keeps existing three-argument
        calls working unchanged.

    Returns
    -------
    tuple
        ``(acc_mlp, acc_rf, acc_svc, mlp, rf, svc)``.
    """
    # Explicit labels make the function usable without relying on globals;
    # the fallback preserves the original behaviour.
    if train_labels is None:
        train_labels = y_train
    if test_labels is None:
        test_labels = y_test

    print(feature_list)
    x_train, x_test = get_train_test_for_feature_list(feature_list, train, test)
    acc1, mlp = train_and_predict1(x_train, x_test, train_labels, test_labels)
    acc2, rf = train_and_predict2(x_train, x_test, train_labels, test_labels)
    acc3, svc = train_and_predict3(x_train, x_test, train_labels, test_labels)
    print("MLP " + str(acc1))
    print("RF " + str(acc2))
    print("SVC " + str(acc3))
    print("")
    return acc1, acc2, acc3, mlp, rf, svc

In [11]:

# Baseline: train and score all three models on the complete feature set.
(acc1, acc2, acc3,
 mlp, rf, svc) = get_accuracy_for_features(final_features, training_data, testing_data)


['author', 'comments', 'is_original_content', 'is_reddit_media_domain', 'is_video', 'over_18', 'permalink', 'secure_media', 'selftext', 'send_replies', 'title', 'url', 'edited', 'num_comments', 'num_duplicates', 'subreddit_subscribers', 'ups', 'upvote_ratio']
MLP 0.6111111111111112
RF 0.5909090909090909
SVC 0.5959595959595959



In [12]:
# Score every feature on its own. Results are collected per feature instead of
# being unpacked into acc1/acc2/acc3/mlp/rf/svc, so the loop neither discards
# all but the last result nor overwrites models fitted on the full feature set.
single_feature_results = {}
for f in final_features:
    single_feature_results[f] = get_accuracy_for_features([f],training_data,testing_data)

['author']
MLP 0.4797979797979798
RF 0.494949494949495
SVC 0.4797979797979798

['comments']
MLP 0.6060606060606061
RF 0.5808080808080808
SVC 0.601010101010101

['is_original_content']
MLP 0.4797979797979798
RF 0.4797979797979798
SVC 0.4797979797979798

['is_reddit_media_domain']
MLP 0.4797979797979798
RF 0.4797979797979798
SVC 0.4797979797979798

['is_video']
MLP 0.4797979797979798
RF 0.4797979797979798
SVC 0.4797979797979798

['over_18']
MLP 0.4797979797979798
RF 0.4797979797979798
SVC 0.4797979797979798

['permalink']
MLP 0.4797979797979798
RF 0.494949494949495
SVC 0.5050505050505051

['secure_media']
MLP 0.4797979797979798
RF 0.4797979797979798
SVC 0.4797979797979798

['selftext']
MLP 0.4696969696969697
RF 0.4797979797979798
SVC 0.4797979797979798

['send_replies']
MLP 0.4797979797979798
RF 0.4797979797979798
SVC 0.4797979797979798

['title']
MLP 0.5151515151515151
RF 0.5303030303030303
SVC 0.5202020202020202

['url']
MLP 0.51010101010101
RF 0.48484848484848486
SVC 0.525252525252525

In [13]:
# Evaluate a seven-column subset of final_features.
get_accuracy_for_features(
    ['author', 'comments', 'permalink', 'secure_media', 'selftext', 'title', 'url'],
    training_data,
    testing_data,
)

['author', 'comments', 'permalink', 'secure_media', 'selftext', 'title', 'url']
MLP 0.601010101010101
RF 0.5909090909090909
SVC 0.5909090909090909



(0.601010101010101,
 0.5909090909090909,
 0.5909090909090909,
 MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=2, learning_rate='constant',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
               validation_fraction=0.1, verbose=False, warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=10, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=1000,
                        n_jobs=None, oob_score=False, random_state=1, 

In [14]:
# Evaluate the three-column subset comments / selftext / title.
get_accuracy_for_features(
    ['comments', 'selftext', 'title'],
    training_data,
    testing_data,
)

['comments', 'selftext', 'title']
MLP 0.5909090909090909
RF 0.5909090909090909
SVC 0.5909090909090909



(0.5909090909090909,
 0.5909090909090909,
 0.5909090909090909,
 MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=2, learning_rate='constant',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
               validation_fraction=0.1, verbose=False, warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=10, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=1000,
                        n_jobs=None, oob_score=False, random_state=1,

In [15]:
# Evaluate the two-column subset comments / title.
get_accuracy_for_features(
    ['comments', 'title'],
    training_data,
    testing_data,
)

['comments', 'title']
MLP 0.601010101010101
RF 0.5757575757575758
SVC 0.5858585858585859



(0.601010101010101,
 0.5757575757575758,
 0.5858585858585859,
 MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=2, learning_rate='constant',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
               validation_fraction=0.1, verbose=False, warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=10, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=1000,
                        n_jobs=None, oob_score=False, random_state=1, 

In [16]:
# Evaluate only the columns that pandas reports with a numeric dtype.
# The column Index is passed through as-is (not converted to a list).
get_accuracy_for_features(
    training_data.select_dtypes(include=[np.number]).columns,
    training_data,
    testing_data,
)

Index(['is_original_content', 'is_reddit_media_domain', 'is_video', 'over_18',
       'send_replies', 'edited', 'num_comments', 'num_duplicates',
       'subreddit_subscribers', 'ups', 'upvote_ratio'],
      dtype='object')
MLP 0.4797979797979798
RF 0.5050505050505051
SVC 0.494949494949495



(0.4797979797979798,
 0.5050505050505051,
 0.494949494949495,
 MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=2, learning_rate='constant',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
               validation_fraction=0.1, verbose=False, warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=10, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=1000,
                        n_jobs=None, oob_score=False, random_state=1, 

In [17]:
# Evaluate a five-column subset of final_features.
get_accuracy_for_features(
    [
        'is_original_content',
        'is_reddit_media_domain',
        'is_video',
        'over_18',
        'send_replies',
    ],
    training_data,
    testing_data,
)

['is_original_content', 'is_reddit_media_domain', 'is_video', 'over_18', 'send_replies']
MLP 0.4797979797979798
RF 0.4797979797979798
SVC 0.4797979797979798



(0.4797979797979798,
 0.4797979797979798,
 0.4797979797979798,
 MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=2, learning_rate='constant',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
               validation_fraction=0.1, verbose=False, warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=10, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=1000,
                        n_jobs=None, oob_score=False, random_state=1,

In [18]:
# Evaluate final_features without is_original_content, is_reddit_media_domain,
# is_video, over_18 and send_replies.
get_accuracy_for_features(
    [
        'author', 'comments', 'permalink',
        'secure_media', 'selftext', 'title', 'url',
        'edited', 'num_comments', 'num_duplicates',
        'subreddit_subscribers', 'ups', 'upvote_ratio',
    ],
    training_data,
    testing_data,
)

['author', 'comments', 'permalink', 'secure_media', 'selftext', 'title', 'url', 'edited', 'num_comments', 'num_duplicates', 'subreddit_subscribers', 'ups', 'upvote_ratio']
MLP 0.6212121212121212
RF 0.5858585858585859
SVC 0.601010101010101



(0.6212121212121212,
 0.5858585858585859,
 0.601010101010101,
 MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=2, learning_rate='constant',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
               validation_fraction=0.1, verbose=False, warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=10, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=1000,
                        n_jobs=None, oob_score=False, random_state=1, 

In [19]:
# Evaluate final_features without edited, num_comments, num_duplicates,
# subreddit_subscribers, ups and upvote_ratio.
get_accuracy_for_features(
    [
        'author', 'comments', 'is_original_content',
        'is_reddit_media_domain', 'is_video', 'over_18', 'permalink',
        'secure_media', 'selftext', 'send_replies', 'title', 'url',
    ],
    training_data,
    testing_data,
)

['author', 'comments', 'is_original_content', 'is_reddit_media_domain', 'is_video', 'over_18', 'permalink', 'secure_media', 'selftext', 'send_replies', 'title', 'url']
MLP 0.4797979797979798
RF 0.5808080808080808
SVC 0.5909090909090909



(0.4797979797979798,
 0.5808080808080808,
 0.5909090909090909,
 MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=2, learning_rate='constant',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
               validation_fraction=0.1, verbose=False, warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=10, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=1000,
                        n_jobs=None, oob_score=False, random_state=1,

In [20]:

# Fit on the complete feature set; the `mlp` bound here is the model that the
# following cell writes to disk.
(acc1, acc2, acc3,
 mlp, rf, svc) = get_accuracy_for_features(final_features, training_data, testing_data)


['author', 'comments', 'is_original_content', 'is_reddit_media_domain', 'is_video', 'over_18', 'permalink', 'secure_media', 'selftext', 'send_replies', 'title', 'url', 'edited', 'num_comments', 'num_duplicates', 'subreddit_subscribers', 'ups', 'upvote_ratio']
MLP 0.6111111111111112
RF 0.5909090909090909
SVC 0.5959595959595959



In [21]:
import joblib

# Serialise the fitted MLP to 'classifier.pkl' in the working directory.
# Kept as the cell's last expression so the list of written files is displayed.
joblib.dump(
    mlp,
    'classifier.pkl',
)

['classifier.pkl']