# Comparison of different Classification Algorithms



In [24]:
import sklearn
import numpy as np
import pandas as pd

from main import aggregate_data, normalize_data
from metrics.githubMetrics import GithubMetrics, metricCollection
from importer.testDataImporter import TestDataImporter

In [25]:
# Load the labelled data set; the importer provides a train and a test split.
importer = TestDataImporter('data/testset.csv')

# Label vectors for both splits, extracted in (train, test) order.
y_train, y_test = (
    np.array(split.classification)
    for split in (importer.trainset, importer.testset)
)

In [26]:
# Feature names are the keys of metricCollection; the array is used below to
# select the matching columns from the aggregated data.
metrics = np.array([name for name in metricCollection.keys()])
metrics

array(['avg_folder_depth', 'forks_count', 'avg_entropy',
       'file_folder_ratio', 'up_to_dateness', 'repo_size', 'watcher_count',
       'file_count', 'open_issue_count', 'html_count', 'is_io_page'], 
      dtype='<U17')

In [28]:
# Training features in a single expression: aggregate the repositories,
# keep only the metric columns, then normalize the result.
data_train = normalize_data(aggregate_data(importer.trainset.repos)[metrics])
data_train[:5]

Unnamed: 0,avg_entropy,forks_count,file_folder_ratio,up_to_dateness,repo_size,watcher_count,file_count,open_issue_count,html_count,is_io_page
0,0.25159,0.566949,0.330773,0.026821,0.752917,0.770764,0.405086,0.641663,0.0,0.0
1,0.270911,0.53479,0.084179,0.708447,0.370083,0.753491,0.136859,0.15163,0.0,0.0
2,0.43188,0.0,0.275341,0.597203,0.735055,0.0,0.484467,0.0,0.0,0.0
3,0.232237,0.0,0.169776,0.498483,0.373685,0.0,0.344582,0.0,0.0,0.0
4,0.259969,0.0,0.210574,0.873993,0.508795,0.0,0.349585,0.0,0.0,0.0


In [29]:
# Test features, built with the same aggregate -> select -> normalize steps
# as the training features.
# NOTE(review): normalize_data is called on the test split independently of the
# training split. If it derives its scaling from the data it is given, train and
# test features end up on different scales -- confirm against main.normalize_data.
data_test = normalize_data(aggregate_data(importer.testset.repos)[metrics])
data_test[:5]

Unnamed: 0,avg_entropy,forks_count,file_folder_ratio,up_to_dateness,repo_size,watcher_count,file_count,open_issue_count,html_count,is_io_page
0,0.564294,0.730426,0.191077,0.539362,0.74726,0.421838,0.252792,0.15019,0.080901,0.0
1,0.741162,0.0,0.020757,0.877016,0.723308,0.0,0.197237,0.0,0.0,0.0
2,0.769716,0.0,0.014088,0.971735,0.408197,0.073646,0.167862,0.0,0.0,0.0
3,0.527382,0.485355,0.048594,0.766765,0.763555,0.073646,0.123329,0.834246,0.0,0.0
4,0.516772,0.427203,0.162831,0.201886,0.724402,0.441875,0.538326,0.555771,0.209125,0.0


## Helpers

In [30]:
def analyze_results(model, train=None, test=None):
    """Print the accuracy of a fitted classifier on the train and test split.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``score(X, y)``.
    train, test : tuple of (X, y), optional
        Features and labels to score against. When omitted, the notebook-level
        ``data_train``/``y_train`` and ``data_test``/``y_test`` are used, which
        is the original behaviour.

    Returns
    -------
    None
        The scores are printed, not returned, so calling this as the last
        expression of a cell produces no extra output.
    """
    # Defaults are resolved at call time so re-running the data cells is picked up.
    X_tr, y_tr = (data_train, y_train) if train is None else train
    X_te, y_te = (data_test, y_test) if test is None else test
    print('acc train:', model.score(X_tr, y_tr))
    print('acc test:', model.score(X_te, y_te))

In [31]:
def null_acc(y):
    """Return the null accuracy: the share of the most frequent class in ``y``.

    This is the accuracy of a classifier that always predicts the majority
    class, i.e. the baseline a trained model has to beat.

    Parameters
    ----------
    y : array-like
        Class labels. Lists and tuples are accepted as well as NumPy arrays.

    Returns
    -------
    float
        Count of the most common label divided by ``len(y)``.

    Raises
    ------
    ValueError
        If ``y`` contains no labels.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        raise ValueError('null_acc() requires at least one label')
    # A single counting pass instead of one boolean mask per class.
    _, counts = np.unique(labels, return_counts=True)
    # int() keeps the result a plain Python float, as before.
    return int(counts.max()) / len(labels)

In [32]:
# Baseline for the models below: accuracy obtained by always predicting the
# most frequent class of each split.
print('acc train:', null_acc(y_train))
print('acc test:', null_acc(y_test))

acc train: 0.3951890034364261
acc test: 0.33544303797468356


## Logistic Regression

In [33]:
# Import the estimator explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.linear_model` submodule is loaded, so the attribute lookup
# only worked when another module had already imported it.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1.0, max_iter=100, n_jobs=2)
logreg.fit(data_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=2,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
# Fitted coefficients of the logistic regression (one row per class).
logreg.coef_

array([[  1.70606275e-01,   4.45015175e-01,   5.98238348e-01,
         -8.76836766e-01,  -1.16230373e-01,   1.33194204e+00,
         -9.05880893e-01,  -9.23629107e-01,  -1.55145216e+00,
         -7.53316242e-01],
       [ -1.02756417e+00,  -1.35992389e+00,   9.03277913e-01,
         -1.15737382e+00,   9.91238018e-01,  -7.63730013e-01,
          1.35787156e+00,   1.05872152e+00,   9.19771266e-01,
          6.82371173e-01],
       [  1.24339211e+00,  -6.06605433e-01,  -4.83697332e-01,
         -5.11125622e-01,   4.22377725e-01,   5.61526715e-04,
         -4.10482091e-01,   9.09000748e-02,  -1.01073927e+00,
         -2.90975156e-01],
       [ -2.33028034e-01,   1.71219585e+00,  -5.28517383e-01,
         -2.29226706e-01,  -2.55733988e-01,   1.07720446e+00,
         -7.21228068e-01,  -7.33370416e-01,   2.91701324e-01,
         -7.24981836e-01],
       [ -6.19429832e-01,   1.87279392e-01,  -2.58387262e-01,
          1.03109223e-01,  -3.92861673e-01,  -1.04351095e+00,
          4.94866315e-01

In [35]:
# Train/test accuracy of the logistic regression.
analyze_results(logreg)

acc train: 0.494845360825
acc test: 0.449367088608


## SVM

In [36]:
# Import the estimator explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.svm` submodule is loaded, so the attribute lookup only
# worked when another module had already imported it.
from sklearn.svm import SVC

svm = SVC(C=20.0, random_state=1337)
svm.fit(data_train, y_train)

SVC(C=20.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=1337, shrinking=True,
  tol=0.001, verbose=False)

In [37]:
# Train/test accuracy of the SVM.
analyze_results(svm)

acc train: 0.567010309278
acc test: 0.487341772152


## Decision Tree

In [38]:
from sklearn.tree import DecisionTreeClassifier

In [39]:
# Decision tree with default settings and a fixed seed; fit() returns the
# estimator itself, so construction and training are chained.
dtc = DecisionTreeClassifier(random_state=1337).fit(data_train, y_train)
dtc

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1337, splitter='best')

In [40]:
# Train/test accuracy of the decision tree. In the recorded run the gap
# (about 0.99 train vs 0.35 test) shows the tree overfits the training split.
analyze_results(dtc)

acc train: 0.993127147766
acc test: 0.354430379747


## Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier

# Use the name imported on the line above rather than the fully qualified
# sklearn.ensemble path, so the cell depends only on its own import.
rf = RandomForestClassifier(n_estimators=100, random_state=1337)
rf.fit(data_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=1337,
            verbose=0, warm_start=False)

In [42]:
# Train/test accuracy of the random forest.
analyze_results(rf)

acc train: 0.993127147766
acc test: 0.474683544304


## Neural Network (MLP)

In [43]:
from sklearn.neural_network import MLPClassifier

In [44]:
# Multi-layer perceptron with one hidden layer of 100 units and a fixed seed;
# fit() returns the estimator, so construction and training are chained.
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=200,
    random_state=1337,
).fit(data_train, y_train)
mlp



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1337,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [45]:
# Train/test accuracy of the MLP.
analyze_results(mlp)

acc train: 0.54295532646
acc test: 0.487341772152


## Two-Step Classifier

In [50]:
from main import TwoStepClassifier

In [51]:
# Wrap a logistic regression and a random forest in the project's
# TwoStepClassifier and fit the combination on the training split.
# NOTE(review): how the two estimators are combined is defined in
# main.TwoStepClassifier and is not visible here. Only the forest is seeded.
tsc = TwoStepClassifier(
    sklearn.linear_model.LogisticRegression(C=1.0, max_iter=100, n_jobs=2),
    RandomForestClassifier(n_estimators=100, random_state=1337)
)
tsc.fit(data_train, y_train)

In [49]:
# Test accuracy of the two-step classifier.
# NOTE(review): this cell's execution count (49) is lower than that of the cells
# that import and fit `tsc` (50, 51), so the recorded score may predate the
# current definition. It is also below the null accuracy of the test split.
# Re-run after a kernel restart before drawing conclusions.
tsc.score(data_test, y_test)

0.19620253164556961