In [1]:
import json
import pickle
from time import time

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# Classifiers obtained from http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

# Original data

In [2]:
# Load the raw dataset; the path is relative to the kernel's working directory.
qas_df = pd.read_csv('data/qas.csv')

In [3]:
# Drop the listed columns from the raw frame. The `accepted` column is kept:
# it is counted in the next cells and later split off as the label.
columns = ['Unnamed: 0', 'text', 'id']
qa_df = qas_df.drop(columns, axis=1)

In [4]:
# Number of rows whose `accepted` flag is True (count the mask instead of
# materialising the filtered frame).
int((qa_df['accepted'] == True).sum())

27233

In [5]:
# Number of rows whose `accepted` flag is False (count the mask instead of
# materialising the filtered frame).
int((qa_df['accepted'] == False).sum())

45614

In [6]:
qa_df.head()

Unnamed: 0,score_q,view_count,answer_count,comment_count_q,code_line_count_q,reputation_q,comment_count_a,code_line_count_a,score_a,accepted,reputation_a,percent_answered_questions_q,percent_accepted_answers_a
0,3.0,177.0,1.0,3.0,0.0,16.0,4.0,3.0,0.0,False,947.0,0.0,15.0
1,0.0,44.0,1.0,0.0,7.0,42.0,0.0,3.0,0.0,False,11986.0,0.0,50.0
2,1.0,127.0,1.0,0.0,0.0,28.0,0.0,0.0,0.0,False,6920.0,0.0,33.0
3,2.0,68.0,2.0,0.0,2.0,28.0,0.0,0.0,1.0,False,1055.0,0.0,40.0
4,2.0,68.0,2.0,0.0,2.0,28.0,1.0,0.0,0.0,False,21855.0,0.0,33.0


In [7]:
# Shuffle the rows with a fixed seed. The train/test split further down is
# seeded as well, but it still depends on the row order produced here, so an
# unseeded permutation made every kernel restart yield different splits and
# different results.
qa_df = qa_df.reindex(np.random.RandomState(0).permutation(qa_df.index))

In [8]:
qa_df.head()

Unnamed: 0,score_q,view_count,answer_count,comment_count_q,code_line_count_q,reputation_q,comment_count_a,code_line_count_a,score_a,accepted,reputation_a,percent_answered_questions_q,percent_accepted_answers_a
1985,274.0,27223.0,13.0,10.0,21.0,1431.0,0.0,0.0,29.0,False,24157.0,83.0,0.0
12547,0.0,1308.0,3.0,2.0,5.0,6.0,0.0,37.0,0.0,False,18.0,0.0,50.0
44993,1.0,330.0,2.0,9.0,123.0,94.0,0.0,10.0,0.0,False,5808.0,40.0,43.0
51432,0.0,532.0,2.0,4.0,15.0,360.0,16.0,4.0,0.0,False,4795.0,67.0,29.0
51739,0.0,164.0,1.0,0.0,9.0,32.0,0.0,0.0,0.0,False,1837.0,0.0,33.0


In [9]:
qa_df.shape

(72847, 13)

# Classification

In [10]:
# Number of folds passed to StratifiedKFold when the cross-validation splitter is built.
K = 10

In [11]:
def update(d1, d2):
    """Return a new dict holding the entries of ``d1`` overlaid with those of ``d2``.

    Neither argument is modified. When a key appears in both, the value from
    ``d2`` wins.
    """
    merged = {}
    for source in (d1, d2):
        merged.update(source)
    return merged

In [12]:
# Hyper-parameter grid shared by every classifier below. Keys use the
# `<step>__<param>` form; the step names ('tfidf', 'clf') are the ones used by
# the pipeline in the (currently commented-out) grid-search cell.
parameters = {
#     'vect__max_df': (0.5, 0.8, 1.0),
#     'vect__min_df': (0.0, 0.2, 1),
    'tfidf__norm': (None, 'l1', 'l2'),
    'tfidf__use_idf': (True, False),
}

# One entry per classifier to benchmark: (display name, estimator class, grid).
# Each grid is the shared `parameters` dict extended with classifier-specific
# options via update(); an entry may also carry a list of grids.
#
# NOTE(review): the explicit class_weight {False: 0.63, True: 0.37} roughly
# matches the class frequencies reported by y.value_counts() (45614 False /
# 27233 True), which gives the majority class the larger weight — confirm this
# is intended rather than the inverse weighting.
classifiers = [
    ('BernoulliNB', BernoulliNB, update(parameters, {
        'clf__alpha': (1.0, 1e-1, 1e-2, 1e-3),
        'clf__fit_prior': (False, True),
    })),
#     ('KNeighborsClassifier', KNeighborsClassifier, update(parameters, {
#         'clf__weights': ('uniform', 'distance'),
#         'clf__metric': ('euclidean', 'manhattan', 'minkowski')
#     })),
    # Two separate grids for LinearSVC: the first varies the loss, the second
    # varies the penalty with dual=False. Presumably kept apart because not
    # every loss/penalty/dual combination is valid — TODO confirm.
    ('LinearSVC', LinearSVC, [
        update(parameters, {
#         'vect__min_df': (0.0, 1),
        'clf__loss': ('squared_hinge', 'hinge',),
        'clf__tol': (1e-4, 1e-5, 1e-6),
        'clf__class_weight': (None, 'balanced', {False: 0.63, True: 0.37}),
        }),
        update(parameters, {
#         'vect__min_df': (0.0, 1),
        'clf__penalty': ('l1', 'l2'),
        'clf__dual': (False,),
        'clf__tol': (1e-4, 1e-5, 1e-6),
        'clf__class_weight': (None, 'balanced', {False: 0.63, True: 0.37}),
        }),
    ]),
#     ('MultinomialNB', MultinomialNB, update(parameters, {
#         'clf__alpha': (1.0, 1e-1, 1e-2, 1e-3),
#         'clf__fit_prior': (False, True),
#     })),
#     ('NearestCentroid', NearestCentroid, update(parameters, {
#     })),
    # NOTE(review): `n_iter` is the parameter name of the scikit-learn release
    # this notebook was written against; newer releases may expect `max_iter`
    # instead — verify before re-running on an upgraded environment.
    ('PassiveAggressiveClassifier', PassiveAggressiveClassifier, update(parameters, {
#         'vect__min_df': (0.0, 1),
        'clf__loss': ('squared_hinge', 'hinge'),
        'clf__n_iter': (5, 10, 20),
        'clf__class_weight': (None, 'balanced', {False: 0.63, True: 0.37}),
        'clf__warm_start': (False, True),
    })),
#     ('Perceptron', Perceptron, update(parameters, {
#         'vect__min_df': (0.0, 1),
#         'tfidf__norm': (None, 'l2'),
#         'clf__penalty': (None, 'l1', 'l2', 'elasticnet'),
#         'clf__alpha': (1e-4, 1e-5, 1e-6),
#         'clf__class_weight': (None, 'balanced', {0: 0.25, 10: 0.28, 20: 0.23, 30: 0.24}),
#         'clf__warm_start': (False, True),
#     })),
#     ('RandomForestClassifier', RandomForestClassifier, update(parameters, {
#         'clf__n_estimators': (10, 20, 30),
#         'clf__criterion': ('gini', 'entropy'),
#         'clf__max_depth': (None, 5, 10),
#         'clf__class_weight': (None, {0: 0.25, 10: 0.28, 20: 0.23, 30: 0.24}),
#         'clf__warm_start': (False, True),
#     })),
#     ('RidgeClassifier', RidgeClassifier, update(parameters, {
#         'clf__alpha': (1.0, 1e-1, 1e-2, 1e-3),
#         'clf__class_weight': (None, 'balanced', {False: 0.63, True: 0.37}),
#         'clf__normalize': (True, False),
#         'clf__tol': (1e-1, 1e-2, 1e-3),
#     })),
#     ('SGDClassifier', SGDClassifier, [
#         update(parameters, {
#         'vect__min_df': (0.0, 1),
#         'clf__loss': ('hinge', 'log', 'modified_huber', 'perceptron',),
#         'clf__penalty': ('none', 'l1', 'l2', 'elasticnet'),
#         'clf__class_weight': (None, 'balanced'),
#         'clf__warm_start': (False, True),
#         }),
#         update(parameters, {
#         'vect__min_df': (0.0, 1),
#         'clf__loss': ('hinge', 'log', 'modified_huber', 'perceptron',),
#         'clf__penalty': ('none', 'l1', 'l2', 'elasticnet'),
#         'clf__alpha': (1e-5, 1e-6),
#         'clf__class_weight': ({0: 0.25, 10: 0.28, 20: 0.23, 30: 0.24},),
#         'clf__warm_start': (False, True),
#         }),
#     ]),
]

In [13]:
# parameters = {
# }

# classifiers = [
#     ('BernoulliNB', BernoulliNB, {}),
# #     ('KNeighborsClassifier', KNeighborsClassifier, {}),
#     ('LinearSVC', LinearSVC, {}),
# #     ('MultinomialNB', MultinomialNB, {}),
# #     ('NearestCentroid', NearestCentroid, {}),
#     ('PassiveAggressiveClassifier', PassiveAggressiveClassifier, {}),
# #     ('Perceptron', Perceptron, {}),
# #     ('RandomForestClassifier', RandomForestClassifier, {}),
#     ('RidgeClassifier', RidgeClassifier, {}),
# #     ('SGDClassifier', SGDClassifier, {}),
# ]

In [14]:
def benchmark(grid_search_cv, X_train, X_test, y_train, y_test, name):
    """Fit a grid search on the training split and report it on the test split.

    Prints the fit and prediction wall-clock times, the best cross-validation
    score and parameters, accuracy / precision / recall / F1 (with ``True`` as
    the positive label), a classification report and a confusion matrix.

    Parameters
    ----------
    grid_search_cv : object exposing ``fit``, ``predict``, ``best_score_``,
        ``best_params_`` and ``best_estimator_``.
    X_train, y_train : data the search is fitted on.
    X_test, y_test : data the refitted best estimator is evaluated on.
    name : label used in the progress messages.

    Returns
    -------
    dict with keys ``best_estimator``, ``best_score``, ``best_parameters``,
    ``accuracy``, ``precision``, ``recall`` and ``f1_score``; every numeric
    value is rounded to two decimals.
    """
    def two_decimals(value):
        # Every reported number goes through the same '%0.2f' round-trip.
        return float('%0.2f' % value)

    # --- training -----------------------------------------------------------
    print('Training %s...' % name)
    fit_started = time()
    grid_search_cv.fit(X_train, y_train)
    fit_seconds = time() - fit_started
    print('Training time: %0.3fs' % fit_seconds)
    print()

    best_params = grid_search_cv.best_params_
    score = two_decimals(grid_search_cv.best_score_)
    print('Best score:', score)
    print('Best parameters:', best_params)
    print()

    # --- prediction ---------------------------------------------------------
    print('Testing %s...' % name)
    predict_started = time()
    pred = grid_search_cv.predict(X_test)
    predict_seconds = time() - predict_started
    print('Testing time:  %0.3fs' % predict_seconds)
    print()

    # --- metrics ------------------------------------------------------------
    print('Metrics:')
    # Precision, recall and F1 are all computed for the positive class True.
    positive_class = {'pos_label': True, 'average': 'binary'}
    accuracy = two_decimals(metrics.accuracy_score(y_test, pred))
    print('accuracy  =', accuracy)
    precision = two_decimals(metrics.precision_score(y_test, pred, **positive_class))
    print('precision =', precision)
    recall = two_decimals(metrics.recall_score(y_test, pred, **positive_class))
    print('recall    =', recall)
    f1 = two_decimals(metrics.f1_score(y_test, pred, **positive_class))
    print('f1_score  =', f1)
    print()

    print('Classification report:')
    print(metrics.classification_report(y_test, pred, target_names=['No' ,'Yes']))
    print()
    print('Confusion matrix:')
    print(metrics.confusion_matrix(y_test, pred))
    print()

    summary = {
        'best_estimator': grid_search_cv.best_estimator_,
        'best_score': score,
        'best_parameters': best_params,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
    }
    return summary

In [34]:
# Features are every column except `accepted`; `accepted` itself is the label.
X = qa_df.drop('accepted', axis=1)
y = qa_df['accepted']

In [16]:
y.value_counts()

False    45614
True     27233
Name: accepted, dtype: int64

In [17]:
# Hold out 10% of the rows as the test set, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)

In [18]:
# Stratified K-fold splitter over the training labels. This is the
# sklearn.cross_validation signature: labels first, then the number of folds.
skf = StratifiedKFold(y_train, K)

In [19]:
# print('Performing grid search with cross-validation...')
# print('=' * 80)
# print()
# best_estimators = []
# for name, clf, parameters in classifiers:
#     pipeline = Pipeline([
# #         ('vect', CountVectorizer()),
#         ('tfidf', TfidfTransformer()),
#         ('clf', clf()),
#     ])
#     grid_search_cv = GridSearchCV(pipeline, parameters, cv=skf, scoring='f1_weighted', error_score=0, n_jobs=4)
#     best_estimators.append(benchmark(grid_search_cv, X_train, X_test, y_train, y_test, name))
#     print('-' * 80)

Performing grid search with cross-validation...

Training BernoulliNB...
Training time: 27.730s

Best score: 0.78
Best parameters: {'tfidf__norm': None, 'tfidf__use_idf': True, 'clf__alpha': 1.0, 'clf__fit_prior': True}

Testing BernoulliNB...
Testing time:  0.007s

Metrics:
accuracy  = 0.78
precision = 0.67
recall    = 0.81
f1_score  = 0.73

Classification report:
             precision    recall  f1-score   support

         No       0.87      0.77      0.81      4562
        Yes       0.67      0.81      0.73      2723

avg / total       0.80      0.78      0.78      7285


Confusion matrix:
[[3490 1072]
 [ 521 2202]]

--------------------------------------------------------------------------------
Training LinearSVC...
Training time: 2734.398s

Best score: 0.88
Best parameters: {'tfidf__norm': None, 'clf__dual': False, 'clf__penalty': 'l2', 'clf__tol': 0.0001, 'clf__class_weight': None, 'tfidf__use_idf': True}

Testing LinearSVC...
Testing time:  0.006s

Metrics:
accuracy  = 0.87
p

In [None]:
# Performing grid search with cross-validation...
# ================================================================================

# Training BernoulliNB...
# Training time: 27.730s

# Best score: 0.78
# Best parameters: {'tfidf__norm': None, 'tfidf__use_idf': True, 'clf__alpha': 1.0, 'clf__fit_prior': True}

# Testing BernoulliNB...
# Testing time:  0.007s

# Metrics:
# accuracy  = 0.78
# precision = 0.67
# recall    = 0.81
# f1_score  = 0.73

# Classification report:
#              precision    recall  f1-score   support

#          No       0.87      0.77      0.81      4562
#         Yes       0.67      0.81      0.73      2723

# avg / total       0.80      0.78      0.78      7285


# Confusion matrix:
# [[3490 1072]
#  [ 521 2202]]

# --------------------------------------------------------------------------------
# Training LinearSVC...
# Training time: 2734.398s

# Best score: 0.88
# Best parameters: {'tfidf__norm': None, 'clf__dual': False, 'clf__penalty': 'l2', 'clf__tol': 0.0001, 'clf__class_weight': None, 'tfidf__use_idf': True}

# Testing LinearSVC...
# Testing time:  0.006s

# Metrics:
# accuracy  = 0.87
# precision = 0.84
# recall    = 0.82
# f1_score  = 0.83

# Classification report:
#              precision    recall  f1-score   support

#          No       0.89      0.90      0.90      4562
#         Yes       0.84      0.82      0.83      2723

# avg / total       0.87      0.87      0.87      7285


# Confusion matrix:
# [[4121  441]
#  [ 485 2238]]

# --------------------------------------------------------------------------------
# Training PassiveAggressiveClassifier...
# Training time: 205.425s

# Best score: 0.73
# Best parameters: {'tfidf__norm': None, 'clf__warm_start': False, 'clf__loss': 'hinge', 'clf__n_iter': 20, 'tfidf__use_idf': False, 'clf__class_weight': None}

# Testing PassiveAggressiveClassifier...
# Testing time:  0.004s

# Metrics:
# accuracy  = 0.8
# precision = 0.87
# recall    = 0.56
# f1_score  = 0.68

# Classification report:
#              precision    recall  f1-score   support

#          No       0.78      0.95      0.86      4562
#         Yes       0.87      0.56      0.68      2723

# avg / total       0.81      0.80      0.79      7285


# Confusion matrix:
# [[4325  237]
#  [1195 1528]]

# --------------------------------------------------------------------------------

In [20]:
# with open('data/so_best_estimators.pickle', 'wb') as f:
#     pickle.dump(best_estimators, f)

# Feature selection

In [185]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold

In [186]:
X = qa_df.drop('accepted', axis=1)
y = qa_df['accepted']

In [187]:
X.head()

Unnamed: 0,score_q,view_count,answer_count,comment_count_q,code_line_count_q,reputation_q,comment_count_a,code_line_count_a,score_a,reputation_a,percent_answered_questions_q,percent_accepted_answers_a
1985,274.0,27223.0,13.0,10.0,21.0,1431.0,0.0,0.0,29.0,24157.0,83.0,0.0
12547,0.0,1308.0,3.0,2.0,5.0,6.0,0.0,37.0,0.0,18.0,0.0,50.0
44993,1.0,330.0,2.0,9.0,123.0,94.0,0.0,10.0,0.0,5808.0,40.0,43.0
51432,0.0,532.0,2.0,4.0,15.0,360.0,16.0,4.0,0.0,4795.0,67.0,29.0
51739,0.0,164.0,1.0,0.0,9.0,32.0,0.0,0.0,0.0,1837.0,0.0,33.0


In [188]:
X.shape

(72847, 12)

In [189]:
y.head()

1985     False
12547    False
44993    False
51432    False
51739    False
Name: accepted, dtype: bool

In [190]:
y.shape

(72847,)

In [191]:
# Remove features whose variance is lower than 0.8 * (1 - 0.8) = 0.16.
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_inv = sel.fit_transform(X)
# fit_transform returns a bare array holding only the surviving columns, so
# label it with exactly those columns. Passing all of X.columns raises a shape
# mismatch as soon as the selector drops a feature.
X_inv = pd.DataFrame(X_inv, index=X.index, columns=X.columns[sel.get_support()])

In [192]:
X_inv.head()

Unnamed: 0,score_q,view_count,answer_count,comment_count_q,code_line_count_q,reputation_q,comment_count_a,code_line_count_a,score_a,reputation_a,percent_answered_questions_q,percent_accepted_answers_a
1985,274.0,27223.0,13.0,10.0,21.0,1431.0,0.0,0.0,29.0,24157.0,83.0,0.0
12547,0.0,1308.0,3.0,2.0,5.0,6.0,0.0,37.0,0.0,18.0,0.0,50.0
44993,1.0,330.0,2.0,9.0,123.0,94.0,0.0,10.0,0.0,5808.0,40.0,43.0
51432,0.0,532.0,2.0,4.0,15.0,360.0,16.0,4.0,0.0,4795.0,67.0,29.0
51739,0.0,164.0,1.0,0.0,9.0,32.0,0.0,0.0,0.0,1837.0,0.0,33.0


In [193]:
X_inv.shape

(72847, 12)

In [194]:
# Labels aligned (by index label) with the rows of X_inv.
y_inv = y.loc[X_inv.index]

In [195]:
y_inv.head()

1985     False
12547    False
44993    False
51432    False
51739    False
Name: accepted, dtype: bool

In [196]:
y_inv.shape

(72847,)

In [197]:
# Keep only the rows in which every feature is >= 0, using one row mask
# instead of filtering column by column.
non_negative_rows = (X_inv[X.columns] >= 0).all(axis=1)
X_pos = X_inv[non_negative_rows]

In [198]:
X_pos.shape

(65517, 12)

In [199]:
X_pos.head()

Unnamed: 0,score_q,view_count,answer_count,comment_count_q,code_line_count_q,reputation_q,comment_count_a,code_line_count_a,score_a,reputation_a,percent_answered_questions_q,percent_accepted_answers_a
1985,274.0,27223.0,13.0,10.0,21.0,1431.0,0.0,0.0,29.0,24157.0,83.0,0.0
12547,0.0,1308.0,3.0,2.0,5.0,6.0,0.0,37.0,0.0,18.0,0.0,50.0
44993,1.0,330.0,2.0,9.0,123.0,94.0,0.0,10.0,0.0,5808.0,40.0,43.0
51432,0.0,532.0,2.0,4.0,15.0,360.0,16.0,4.0,0.0,4795.0,67.0,29.0
51739,0.0,164.0,1.0,0.0,9.0,32.0,0.0,0.0,0.0,1837.0,0.0,33.0


In [200]:
# Labels aligned (by index label) with the rows that survived the non-negativity filter.
y_pos = y_inv.loc[X_pos.index]

In [201]:
y_pos.shape

(65517,)

In [202]:
# Plain Python list of the feature column names.
feature_names = X.columns.tolist()

In [203]:
feature_names

['score_q',
 'view_count',
 'answer_count',
 'comment_count_q',
 'code_line_count_q',
 'reputation_q',
 'comment_count_a',
 'code_line_count_a',
 'score_a',
 'reputation_a',
 'percent_answered_questions_q',
 'percent_accepted_answers_a']

In [204]:
# Keep the two features with the highest chi-squared score against the label.
ch2 = SelectKBest(chi2, k=2)
# NOTE(review): X_pos is built from every row of X, so it still contains the
# rows that train_test_split held out as X_test; the selector is therefore
# fitted on test rows as well — confirm this is acceptable.
X_train_best = ch2.fit_transform(X_pos, y_pos)
X_test_best = ch2.transform(X_test)
# Read the selected names from the columns of the frame the selector was
# fitted on rather than filtering `feature_names` in place. The previous form
# turned `feature_names` from a list into an ndarray, so running the cell a
# second time raised on `if feature_names:` and would have indexed with stale
# positions.
feature_names = np.asarray([X_pos.columns[i] for i in ch2.get_support(indices=True)])

In [205]:
feature_names

array(['reputation_q', 'reputation_a'], 
      dtype='<U12')

In [26]:
# NOTE(review): this cell reads train_labels, test_labels, data_train,
# data_test, data_train_size_mb, data_test_size_mb and TfidfVectorizer, none of
# which are defined or imported in the visible part of this notebook; the
# recorded run of this cell ended in a NameError. It also overwrites y_train,
# y_test, X_train, X_test, ch2 and feature_names from the cells above. It looks
# adapted from the scikit-learn 20 newsgroups example linked in the import
# cell — confirm whether it is still needed.

# take the label vectors for the training and the test set
y_train = train_labels
y_test = test_labels

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(data_train.sentence.tolist())
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
# transform only: the vocabulary fitted on the training sentences is reused
X_test = vectorizer.transform(data_test.sentence.tolist())
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# mapping from integer feature name to original token string
feature_names = vectorizer.get_feature_names()
# number of features kept by the chi-squared selection below
opts_select_chi2 = 50

print("Extracting %d best features by a chi-squared test" %
      opts_select_chi2)
ch2 = SelectKBest(chi2, k=opts_select_chi2)
X_train2 = ch2.fit_transform(X_train, y_train)
X_test2 = ch2.transform(X_test)
if feature_names:
    # keep selected feature names
    feature_names = [feature_names[i] for i
                     in ch2.get_support(indices=True)]
print()

feature_names = np.asarray(feature_names)

NameError: name 'vectorizer' is not defined

In [23]:
# feature_names