# Comparison of different Classification Algorithms



In [11]:
import sklearn
import numpy as np
import pandas as pd

from main import aggregate_data, normalize_data
from metrics.githubMetrics import GithubMetrics, metricCollection
from importer.testDataImporter import TestDataImporter

In [12]:
# Load the data set; the importer exposes a train split and a test split.
importer = TestDataImporter('data/testset.csv')

# Target labels of both splits as NumPy arrays (taken from `.classification`).
y_train, y_test = (
    np.array(split.classification)
    for split in (importer.trainset, importer.testset)
)

In [35]:
# Names of every metric registered in `metricCollection`, as a NumPy string
# array; it is used further down to select the feature columns.
metrics = np.array([name for name in metricCollection.keys()])
metrics

array(['html_count', 'up_to_dateness', 'watcher_count', 'avg_entropy',
       'file_count', 'is_io_page', 'open_issue_count', 'file_folder_ratio',
       'forks_count', 'repo_size', 'avg_folder_depth'], 
      dtype='<U17')

In [45]:
# Aggregate the training repositories and keep only the metric columns.
data_train = aggregate_data(importer.trainset.repos)[metrics]
# Normalization is currently switched off; to enable it, run:
#   data_train = normalize_data(data_train)
data_train[:5]

Unnamed: 0,html_count,up_to_dateness,watcher_count,avg_entropy,file_count,is_io_page,open_issue_count,file_folder_ratio,forks_count,repo_size,avg_folder_depth
0,0,5324.161,4640,0.607567,628,0,352,9.661538,635,77453,2.25
1,0,9972311.0,3840,0.6164,52,0,3,2.08,440,252,2.625
2,0,2915525.0,0,0.691902,1307,0,0,7.064865,0,59299,2.173913
3,0,978964.7,0,0.598768,359,0,0,3.739583,0,266,5.316667
4,0,62169280.0,0,0.611392,376,0,0,4.820513,0,2012,3.784314


In [44]:
# Aggregate the test repositories and keep only the metric columns.
data_test = aggregate_data(importer.testset.repos)[metrics]
# Normalization is currently switched off; to enable it, run:
#   data_test = normalize_data(data_test)
data_test[:5]

Unnamed: 0,html_count,up_to_dateness,watcher_count,avg_entropy,file_count,is_io_page,open_issue_count,file_folder_ratio,forks_count,repo_size,avg_folder_depth
0,1,3154210.0,52,0.631231,111,0,1,4.269231,126,15860,2.75
1,0,69677070.0,0,0.743107,71,0,0,1.290909,0,11632,2.913043
2,0,166022200.0,1,0.761874,56,0,0,1.217391,0,196,3.026316
3,0,25361980.0,1,0.608805,39,0,46,1.625,24,19584,2.6
4,5,143020.4,63,0.602415,1084,0,12,3.589404,16,11798,6.693333


## Helpers

In [46]:
def analyze_results(model):
    """Print the score of a fitted model on the train and the test split.

    The value printed is whatever ``model.score`` returns; for scikit-learn
    classifiers that is the mean accuracy. The function reads the notebook
    globals ``data_train``/``y_train`` and ``data_test``/``y_test``.

    Parameters
    ----------
    model : object
        A fitted estimator exposing ``score(X, y)``.
    """
    train_score = model.score(data_train, y_train)
    print('acc train:', train_score)
    test_score = model.score(data_test, y_test)
    print('acc test:', test_score)

In [47]:
def null_acc(y):
    """Return the null accuracy of a label vector.

    The null accuracy is the share of the most frequent label, i.e. the
    accuracy obtained by always predicting the majority class.

    Parameters
    ----------
    y : array-like
        Class labels. Plain sequences are accepted as well as NumPy arrays.

    Returns
    -------
    float
        Count of the most frequent label divided by ``len(y)``.

    Raises
    ------
    ValueError
        If ``y`` contains no labels.
    """
    labels = np.asarray(y)
    if len(labels) == 0:
        raise ValueError('null_acc() requires at least one label')
    # One pass over the data instead of one boolean mask per distinct label.
    _, counts = np.unique(labels, return_counts=True)
    return int(counts.max()) / len(labels)

In [48]:
# Majority-class baseline (null accuracy) for both splits. Note that the
# printed labels read 'acc train'/'acc test' although these values are the
# baseline, not the score of a trained model.
print('acc train:', null_acc(y_train))
print('acc test:', null_acc(y_test))

acc train: 0.3951890034364261
acc test: 0.33544303797468356


## Logistic Regression

In [49]:
# Import the estimator explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.linear_model` submodule is loaded on a fresh kernel.
from sklearn.linear_model import LogisticRegression

# Logistic regression with the default regularization strength (C=1.0),
# fitted on the metric columns of the training split.
logreg = LogisticRegression(C=1.0, max_iter=100, n_jobs=2)
logreg.fit(data_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=2,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
# Display the fitted coefficient matrix. In the recorded output each row has
# 11 values, matching the 11 metric columns, and all magnitudes are tiny.
logreg.coef_

array([[ -1.79304392e-06,  -4.79723578e-08,  -4.32581402e-05,
         -2.57231277e-08,   3.20915853e-05,  -9.10377087e-09,
         -3.97488752e-06,   4.90377813e-07,  -2.74421523e-05,
         -9.15601952e-06,  -1.43132132e-07],
       [  2.09674818e-08,  -6.56767245e-09,   2.61498365e-08,
         -7.98480688e-12,   3.14262296e-08,   2.88495134e-12,
         -1.72586843e-09,  -1.26839775e-10,  -3.55235553e-08,
          6.87223964e-08,  -1.27832900e-11],
       [ -4.43927924e-06,  -3.29084078e-08,  -2.94125582e-04,
         -2.90236535e-07,  -1.32987539e-04,  -6.33896115e-08,
         -1.78587235e-05,  -2.20038150e-06,  -1.34059387e-04,
          1.17703706e-06,  -1.53871139e-06],
       [ -3.59299628e-06,  -5.37802070e-08,  -1.10505017e-04,
         -7.80656511e-08,  -1.36489189e-04,  -2.25017927e-08,
         -1.79397870e-05,  -8.34410445e-07,  -8.97550091e-05,
          4.21033462e-06,  -3.55135238e-07],
       [  3.39997378e-06,  -2.05834312e-08,  -1.47138437e-04,
         -1.22

In [51]:
# Train/test score of the logistic regression. In the recorded output the
# train score (0.395) equals the majority-class baseline.
# NOTE(review): possibly caused by the unscaled features -- confirm.
analyze_results(logreg)

acc train: 0.395189003436
acc test: 0.322784810127


## SVM

In [52]:
# Import the estimator explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.svm` submodule is loaded on a fresh kernel.
from sklearn.svm import SVC

# Support vector classifier with C=20.0 and a fixed seed, fitted on the
# metric columns of the training split.
svm = SVC(C=20.0, random_state=1337)
svm.fit(data_train, y_train)

SVC(C=20.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=1337, shrinking=True,
  tol=0.001, verbose=False)

In [53]:
# Train/test score of the SVM. In the recorded output the train score is 0.993
# while the test score is 0.367, i.e. barely above the test baseline.
# NOTE(review): the gap looks like overfitting -- confirm.
analyze_results(svm)

acc train: 0.993127147766
acc test: 0.367088607595


## Decision Tree

In [54]:
from sklearn.tree import DecisionTreeClassifier

In [55]:
# Decision tree with default settings; the fixed seed keeps the fit
# reproducible. `fit` returns the estimator itself, so it can be chained.
dtc = DecisionTreeClassifier(random_state=1337).fit(data_train, y_train)
dtc

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1337, splitter='best')

In [56]:
# Train/test score of the decision tree. In the recorded output the train
# score is 0.993 and the test score is 0.443.
# NOTE(review): the gap looks like overfitting -- confirm.
analyze_results(dtc)

acc train: 0.993127147766
acc test: 0.443037974684


## Random Forest

In [57]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed. The imported class name is
# used directly, the same way the decision-tree and MLP cells do it.
rf = RandomForestClassifier(n_estimators=100, random_state=1337)
rf.fit(data_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=1337,
            verbose=0, warm_start=False)

In [58]:
# Train/test score of the random forest. In the recorded output the test score
# (0.481) is the highest of all models in this notebook; the train score is
# 0.993.
analyze_results(rf)

acc train: 0.993127147766
acc test: 0.481012658228


## Neural Network (MLP)

In [59]:
from sklearn.neural_network import MLPClassifier

In [60]:
# Multi-layer perceptron with a single hidden layer of 100 units and a fixed
# seed. `fit` returns the estimator itself, so it can be chained.
# NOTE(review): the inputs are not normalized here; MLPs are usually sensitive
# to feature scale -- confirm whether `normalize_data` should be applied.
mlp = MLPClassifier(
    max_iter=200,
    hidden_layer_sizes=(100,),
    random_state=1337,
).fit(data_train, y_train)
mlp

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1337,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [61]:
# Train/test score of the MLP. In the recorded output both scores (0.168 and
# 0.171) are below the majority-class baseline (0.395 and 0.335).
# NOTE(review): possibly caused by the unscaled features -- confirm.
analyze_results(mlp)

acc train: 0.168384879725
acc test: 0.170886075949
