#StackOverflow answer classifier

In [2]:
import re
import os
from html.parser import HTMLParser
from time import time
from xml.etree import ElementTree as etree
from xml.etree.ElementTree import Element

import numpy as np
import pandas as pd

##Data

In [3]:
qas_df = pd.read_csv('data/qas.csv')

###b) Non textual features

In [4]:
columns = ['Unnamed: 0', 'text', 'id']
qa_df = qas_df.drop(columns, axis=1)

In [5]:
len(qa_df[qa_df.accepted==True])

27233

In [6]:
len(qa_df[qa_df.accepted==False])

45614

In [7]:
qa_df.head()

Unnamed: 0,score_q,view_count,answer_count,comment_count_q,code_line_count_q,reputation_q,comment_count_a,code_line_count_a,score_a,accepted,reputation_a,percent_answered_questions_q,percent_accepted_answers_a
0,3,177,1,3,0,16,4,3,0,False,947,0,15
1,0,44,1,0,7,42,0,3,0,False,11986,0,50
2,1,127,1,0,0,28,0,0,0,False,6920,0,33
3,2,68,2,0,2,28,0,0,1,False,1055,0,40
4,2,68,2,0,2,28,1,0,0,False,21855,0,33


In [8]:
qa_df = qa_df.reindex(np.random.permutation(qa_df.index))

In [9]:
qa_df.head()

Unnamed: 0,score_q,view_count,answer_count,comment_count_q,code_line_count_q,reputation_q,comment_count_a,code_line_count_a,score_a,accepted,reputation_a,percent_answered_questions_q,percent_accepted_answers_a
3896,0,349,5,2,9,302,0,5,0,False,8174,67,0
66384,0,84,1,1,5,0,0,7,1,False,651,0,0
51233,0,56,1,0,17,10850,2,15,2,True,269859,100,46
31946,2,512,1,2,35,33,0,8,2,True,33,100,100
62908,3,114,3,0,0,12,5,0,3,False,138,0,0


##scikit-learn

In [10]:
import logging

from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.cross_validation import cross_val_score, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import density

In [11]:
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

In [12]:
from sklearn.feature_extraction import DictVectorizer

v = DictVectorizer(sparse=False)
X = v.fit_transform(qa_df.ix[0:10].T.to_dict().values())
X

array([[  0.00000000e+00,   1.00000000e+00,   3.00000000e+00, ...,
          0.00000000e+00,   3.00000000e+00,   1.77000000e+02],
       [  0.00000000e+00,   2.00000000e+00,   5.00000000e+00, ...,
          2.00000000e+00,   1.00000000e+00,   1.46200000e+03],
       [  1.00000000e+00,   2.00000000e+00,   5.00000000e+00, ...,
          5.00000000e+00,   2.00000000e+00,   8.03000000e+02],
       ..., 
       [  1.00000000e+00,   1.00000000e+00,   1.00000000e+00, ...,
          2.00000000e+00,  -2.00000000e+00,   2.38100000e+03],
       [  0.00000000e+00,   2.00000000e+00,   4.00000000e+00, ...,
          0.00000000e+00,   1.00000000e+00,   1.64000000e+02],
       [  0.00000000e+00,   3.00000000e+00,   5.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   1.00000000e+02]])

In [13]:
def size_mb(docs):
    return sum(len(s) for s in docs) / 1e6

# data_train_size_mb = size_mb(data_train['sentence'])
# data_test_size_mb = size_mb(data_test['sentence'])

# print("%d documents - %0.3fMB (training set)" % (
#     len(data_train), data_train_size_mb))
# print("%d documents - %0.3fMB (test set)" % (
#     len(data_test), data_test_size_mb))
# print("%d categories" % len(categories))
# print()

In [14]:
>>> from sklearn.metrics import confusion_matrix
>>> y_true = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1]
>>> y_pred = [1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1]
>>> print(confusion_matrix(y_true, y_pred))
#     | Pred
#     |-----
# True| 0 1
# ----------
#  0  | 4 1
#  1  | 2 5


[[4 1]
 [2 5]]


In [27]:
def benchmark(clf, X_train, X_test, y_train, y_test, name):
    print("_" * 80)
    print("Training: %s" % name)
    print(clf)
    
#     X_train = X_train.T.to_dict().values()
#     X_test = X_test.T.to_dict().values()
#     y_train = y_train.T.to_dict().values()
#     y_test = y_test.T.to_dict().values()
    
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    precision = metrics.precision_score(y_test, pred)
    print("precision:   %0.3f" % score)

    recall = metrics.recall_score(y_test, pred)
    print("recall:   %0.3f" % score)

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=['No', 'Yes']))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    return score, precision, recall

In [16]:
# Params
K = 10
parameters = {
#     'vect__max_df': (0.5, 0.75, 1.0),
#     'vect__max_features': (None, 1000, 5000, 10000, 50000),
#     'vect__ngram_range': ((1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)),  # unigrams or bigrams or trigrams
    'vect__stop_words': (None, stopwords.words('english')),
    'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1', 'l2'),
#     'clf__alpha': (1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
#     'clf__penalty': ('l1', 'l2', 'elasticnet'),
#     'clf__n_iter': (10, 50, 80),
#     'clf__loss': ('log', 'modified_huber'),
}

In [28]:
# Classifiers
clf_list = [
    (RidgeClassifier(alpha=.00001, tol=1e-2, solver="lsqr"), "Ridge classifier"),
    (Perceptron(alpha=.00001, n_iter=50), "Perceptron"),
    (PassiveAggressiveClassifier(n_iter=50), "Passive-aggressive"),
#     NO USAR (KNeighborsClassifier(n_neighbors=10), "kNN"),
#     MUY LENTO (RandomForestClassifier(n_estimators=100), "Random forest"),
    (LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3), 'Linear SVC'),
    (SGDClassifier(alpha=.000001, n_iter=50, penalty='l1'), 'SGDClassifier'),
    (NearestCentroid(), 'Nearest Centroid'),
#     (MultinomialNB(alpha=.00001), 'Multinomial NB'),
    (BernoulliNB(alpha=.00001), 'Bernoulli NB'),
    (LinearSVC(penalty="l1", dual=False, tol=1e-3), 'Linear SVC'),
#     (GaussianNB(), 'Gaussian NB')
]

In [29]:
from collections import defaultdict
# results = defaultdict(lambda: defaultdict(list))
results = defaultdict(list)

X = qa_df.drop('accepted', axis=1)
y = qa_df['accepted']

X = np.array(list(X.T.to_dict().values()))
y = np.array(list(y.T.to_dict().values()))

X_data, X_val, y_data, y_val = train_test_split(X, y, train_size=0.9, test_size=0.1, random_state=100)

# X_data = X_data.reset_index(drop=True)
# X_val = X_val.reset_index(drop=True)
# y_data = y_data.reset_index(drop=True)
# y_val = y_val.reset_index(drop=True)

for train_index, test_index in StratifiedKFold(y_data, 2):#K):
    X_train, X_test = X_data[train_index], X_data[test_index]
    y_train, y_test = y_data[train_index], y_data[test_index]
    
    for clf, name in clf_list:
        pipeline = Pipeline([
            ('vect', DictVectorizer()),
            ('clf', clf),
        ])
#         grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
#         benchmark_results = benchmark(grid_search, X_train, X_test, y_train, y_test)
        benchmark_results = benchmark(pipeline, X_train, X_test, y_train, y_test, name)
        results[name].append(benchmark_results)

________________________________________________________________________________
Training: Ridge classifier
Pipeline(steps=[('vect', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)), ('clf', RidgeClassifier(alpha=1e-05, class_weight=None, copy_X=True,
        fit_intercept=True, max_iter=None, normalize=False, solver='lsqr',
        tol=0.01))])
train time: 0.574s
test time:  0.500s
accuracy:   0.628
precision:   0.628
recall:   0.628
classification report:
             precision    recall  f1-score   support

         No       0.63      0.98      0.77     20520
        Yes       0.54      0.04      0.07     12261

avg / total       0.60      0.63      0.51     32781

confusion matrix:
[[20149   371]
 [11820   441]]

________________________________________________________________________________
Training: Perceptron
Pipeline(steps=[('vect', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)), ('clf'

In [71]:
len(X_test)

32781

In [74]:
y_test.value_counts()

False    20520
True     12261
dtype: int64

In [52]:
X_train

0        Ajax image upload from html file to codeignite...
1        How to emulate CPUs in Java? - So I have been ...
2        how can i fetch the next auto increment value ...
3        Update all column at once mysql - I have follo...
4        ArrayList Cannot find variable isJunior - I'm ...
5        how to place (push) input values dynamically i...
6        Is Javascript constructor function equivalent/...
7        Facebook oauth2 login returns (400) bad reques...
8        C: Is it legal to subscript an array of incomp...
9        Jquery - Convert a link to button in dialog - ...
10       C++ copy constructor behaviour - There is a pa...
11       How to check which objects collide with b2Cont...
12       "Insert Into" clause in sybase 15.5 - I am ins...
13       Differentiate retina and non-retina display in...
14       Where should .sh/bash scripts be placed when u...
15       Sail.js requires server restart after running ...
16       Setting server timezone to add/subtract time f.

In [54]:
X_test

32589    Unable to access Bower in Laravel app director...
32590    How to add winapi to Lua for Windows - I've in...
32592    Click on link in iframe then perform jQuery ac...
32593    what happened when function return - I know wh...
32594    ".errorClass" of jQuery validate works but the...
32599    How to get the full path of a file properly? -...
32601    How to print a sublist that contains all of th...
32603    Simplest way to store List of Objects in Windo...
32605    Use sed/awk to delete a line if the following ...
32607    My codes are true , but an error message stop ...
32610    FragmentTabHost and custom tabs - I really nee...
32612    connection of c# application to server using i...
32615    android - Countdown then an action - I want to...
32621    AJAX - Sending knockout observables as JSON ob...
32623    Missing file from MSDN example for GDI printin...
32626    OSX Install Python Distribute - I am trying to...
32628    MySQL not recognising datetime index in WHERE .

In [73]:
X_data.ix[597]

KeyError: 597

In [60]:
X_train.ix[597]

nan

In [30]:
# K = 10
# parameters = {
#     'vect__max_df': (0.5, 0.75, 1.0),
#     'vect__max_features': (None, 1000, 5000, 10000, 50000),
#     'vect__ngram_range': ((1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)),  # unigrams or bigrams or trigrams
#     'vect__stop_words': (None, stopwords.words('english')),
#     'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1', 'l2'),
# #     'clf__alpha': (1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
# #     'clf__penalty': ('l1', 'l2', 'elasticnet'),
# #     'clf__n_iter': (10, 50, 80),
# #     'clf__loss': ('log', 'modified_huber'),
# }
# csf_list = [
#     (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
#     (Perceptron(n_iter=50), "Perceptron"),
#     (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
#     (KNeighborsClassifier(n_neighbors=10), "kNN"),
#     (RandomForestClassifier(n_estimators=100), "Random forest")
# ]
# results = {}
# for clf, name in csf_list:
#     pipeline = Pipeline([
#         ('vect', CountVectorizer()),
#         ('tfidf', TfidfTransformer()),
#         ('clf', clf),
#     ])
#     grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
#     skf = StratifiedKFold(y, K)
#     results[name] = cross_val_score(grid_search, X, y, cv=skf, n_jobs=-1, verbose=1)

In [36]:
# split a training set and a test set
y_train = train_labels
y_test = test_labels

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(data_train.sentence.tolist())
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.sentence.tolist())
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

Extracting features from the training data using a sparse vectorizer
done in 0.398901s at 2.688MB/s
n_samples: 5922, n_features: 18088

Extracting features from the test data using the same vectorizer
done in 0.027828s at 4.383MB/s
n_samples: 659, n_features: 18088



In [37]:
# mapping from integer feature name to original token string
feature_names = vectorizer.get_feature_names()
opts_select_chi2 = 100

print("Extracting %d best features by a chi-squared test" %
      opts_select_chi2)
t0 = time()
ch2 = SelectKBest(chi2, k=opts_select_chi2)
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)
if feature_names:
    # keep selected feature names
    feature_names = [feature_names[i] for i
                     in ch2.get_support(indices=True)]
print("done in %fs" % (time() - t0))
print()

feature_names = np.asarray(feature_names)

Extracting 1000 best features by a chi-squared test
done in 0.022103s



In [38]:
# feature_names

In [39]:
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) < 80 else s[:75] + "..."