In [1]:
#
# Imports

import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from sklearn.feature_extraction.text import *
import common
from importlib import reload
%matplotlib inline


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
#
# Load the dataset
#
reload(common)

(train_sentences, train_categories, test_sentences, test_categories) = common.load_dataset('../dataset/redis_dataset.csv', split=0.6)
print(train_sentences.shape)

#
# Vectorizers
#
cv = CountVectorizer()
train_cv = cv.fit_transform(train_sentences)
test_cv = cv.transform(test_sentences)

tf = TfidfVectorizer()
train_tf = tf.fit_transform(train_sentences)
test_tf = tf.transform(test_sentences)


9849
(5909,)


In [3]:
#
# NaiveBayes
#

#
# Bernoulli NB Model
#
alphas = {'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}
grid = GridSearchCV(BernoulliNB(),alphas)
grid.fit(train_cv, train_categories)
print('Best BernoulliNB score with CV:', grid.best_score_, 'with alpha= ', grid.best_params_)

grid.fit(train_tf, train_categories)
print('Best BernoulliNB score with TF:', grid.best_score_, 'with alpha= ', grid.best_params_)

b_clf = BernoulliNB(alpha=grid.best_params_['alpha'])
b_clf.fit(train_tf, train_categories)
pred = b_clf.predict(test_tf)
print('\nAccuracy:', np.mean(pred == test_categories))
report = metrics.classification_report(pred, test_categories)
print(report)

Best BernoulliNB score with CV: 0.7266881028938906 with alpha=  {'alpha': 2.0}
Best BernoulliNB score with TF: 0.7266881028938906 with alpha=  {'alpha': 2.0}

Accuracy: 0.716497461928934
             precision    recall  f1-score   support

          0       0.86      0.74      0.80      2950
          1       0.46      0.66      0.54       990

avg / total       0.76      0.72      0.73      3940



In [4]:
#
# Logistic Regression 
#
Cs = {'C': [.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 1]}
lg_clf = LogisticRegression(penalty='l2')
grid_search = GridSearchCV(lg_clf, Cs)
grid_search.fit(train_tf, train_categories)
print('Best Logistic regression score:', grid_search.best_score_, 'with C= ', grid_search.best_params_)

lg_clf = LogisticRegression(penalty='l2', C=1)
lg_clf.fit(train_tf, train_categories)
pred = lg_clf.predict(test_tf)
report = metrics.classification_report(pred, test_categories)
print(report)


Best Logistic regression score: 0.7786427483499746 with C=  {'C': 1}
             precision    recall  f1-score   support

          0       0.90      0.80      0.85      2836
          1       0.60      0.78      0.68      1104

avg / total       0.82      0.79      0.80      3940



In [5]:
#
# Load a new source file
#
# reload(common)
test_file='/tmp/fe-connect.c'
comment_sents = []

#
# Get comment sentences
#
comment_sents += common.get_comment_sents(test_file)
# print(len(comment_sents))
# for sent in comment_sents:
#     print(sent)


In [23]:
new_test_tf = tf.transform(comment_sents)
print(len(comment_sents))

786


In [22]:
pred = lg_clf.predict(new_test_tf)
print(pred.shape)
print(np.count_nonzero(pred))

(786,)
128


In [15]:
labels = ['Needs Fix', 'Good comment']

for i in range(len(comment_sents)):
    print(comment_sents[i], "\nReadability score: ", labels[int(pred[i])])


 mstcpip.h is missing on mingw 
Readability score:  Needs Fix
 OpenLDAP deprecates RFC 1823, but we want standard conformance 
Readability score:  Needs Fix
  Pre-9.0 servers will return this SQLSTATE if asked to set  application_name in a startup packet. 
Readability score:  Needs Fix
We hard-wire the value rather  than looking into errcodes.h since it reflects historical behavior  rather than that of the current code. 
Readability score:  Good comment
 This is part of the protocol so just define it 
Readability score:  Needs Fix
 This too 
Readability score:  Needs Fix
  Cope with the various platform-specific ways to spell TCP keepalive socket  options. 
Readability score:  Needs Fix
This doesn't cover Windows, which as usual does its own thing. 
Readability score:  Needs Fix
 TCP_KEEPIDLE is the name of this option on Linux and BSD 
Readability score:  Needs Fix
 TCP_KEEPALIVE_THRESHOLD is the name of this option on Solaris >= 11 
Readability score:  Needs Fix
 TCP_KEEPALIVE is the

Readability score:  Needs Fix
  Try to get the environment variable fallback 
Readability score:  Needs Fix
  Interpret the deprecated PGREQUIRESSL environment variable. 
Readability score:  Needs Fix
Per  tradition, translate values starting with "1" to sslmode=require,  and ignore other values. 
Readability score:  Needs Fix
Given both PGREQUIRESSL=1 and PGSSLMODE,  PGSSLMODE takes precedence; the opposite was true before v9.3. 
Readability score:  Needs Fix
  No environment variable specified or the variable isn't set - try  compiled-in default 
Readability score:  Needs Fix
  Special handling for "user" option. 
Readability score:  Needs Fix
Note that if pg_fe_getauthname  fails, we just leave the value as NULL; there's no need for this to  be an error condition if the caller provides a user name. 
Readability score:  Good comment
The only  reason we do this now at all is so that callers of PQconndefaults  will see a correct default (barring error, of course). 
Readability score:  