# Comparison of different Classification Algorithms



In [1]:
import sklearn
import numpy as np
import pandas as pd

from main import aggregate_data, normalize_data
from metrics.githubMetrics import GithubMetrics, metricCollection
from importer.testDataImporter import TestDataImporter

In [2]:
# Load the labelled data set and pull the class labels of both splits into
# NumPy arrays.
importer = TestDataImporter('data/testset.csv')
y_train, y_test = (
    np.array(importer.trainset.classification),
    np.array(importer.testset.classification),
)

In [3]:
# Aggregate the training repositories into a feature table, pass it through
# normalize_data, and display the first five rows.
data_train = normalize_data(aggregate_data(importer.trainset.repos))
data_train[:5]

Unnamed: 0,avg_entropy,forks_count,repo_size,open_issue_count,avg_folder_depth,watcher_count,file_folder_ratio,up_to_dateness,file_count
0,0.25159,0.566949,0.752917,0.641663,0.253733,0.770764,0.330773,0.026821,0.405086
1,0.270911,0.53479,0.370083,0.15163,0.333086,0.753491,0.084179,0.708447,0.136859
2,0.43188,0.0,0.735055,0.0,0.236519,0.0,0.275341,0.597203,0.484467
3,0.232237,0.0,0.373685,0.0,0.736634,0.0,0.169776,0.498483,0.344582
4,0.259969,0.0,0.508795,0.0,0.534729,0.0,0.210574,0.873993,0.349585


In [4]:
# Same pipeline as for the training split, applied to the test repositories.
# NOTE(review): the test split goes through its own normalize_data call; if
# that function derives its scaling from the data it is given, train and test
# features end up on different scales — confirm against normalize_data.
data_test = normalize_data(aggregate_data(importer.testset.repos))
data_test[:5]

Unnamed: 0,avg_entropy,forks_count,repo_size,open_issue_count,avg_folder_depth,watcher_count,file_folder_ratio,up_to_dateness,file_count
0,0.564294,0.730426,0.74726,0.15019,0.310194,0.421838,0.191077,0.539362,0.252792
1,0.741162,0.0,0.723308,0.0,0.339471,0.0,0.020757,0.877016,0.197237
2,0.769716,0.0,0.408197,0.0,0.359101,0.073646,0.014088,0.971735,0.167862
3,0.527382,0.485355,0.763555,0.834246,0.282112,0.073646,0.048594,0.766765,0.123329
4,0.516772,0.427203,0.724402,0.555771,0.804519,0.441875,0.162831,0.201886,0.538326


## Helper

In [5]:
def analyze_results(model):
    """Print the accuracy of a fitted model on the train and test splits.

    ``model`` must provide ``score(X, y)``. It is scored against the
    notebook-level ``data_train``/``y_train`` and ``data_test``/``y_test``.
    """
    train_accuracy = model.score(data_train, y_train)
    print('acc train:', train_accuracy)
    test_accuracy = model.score(data_test, y_test)
    print('acc test:', test_accuracy)

In [6]:
def null_acc(y):
    """Return the null accuracy of the labels ``y``.

    The null accuracy is the share of samples that belong to the most
    frequent class, i.e. the accuracy of a model that always predicts that
    class.

    Args:
        y: One-dimensional array-like of class labels (NumPy array, list,
            tuple, ...).

    Returns:
        The fraction of entries of ``y`` equal to the most common label, as
        a Python float.

    Raises:
        ValueError: If ``y`` is empty.
    """
    # asarray lets plain sequences through; the element-wise comparison used
    # previously only worked on NumPy-style arrays.
    labels = np.asarray(y)
    if labels.size == 0:
        raise ValueError('null_acc() requires at least one label')
    # Count every class in one call instead of rescanning y once per class.
    _, counts = np.unique(labels, return_counts=True)
    return int(counts.max()) / len(labels)

In [7]:
# Baseline to beat: accuracy of always predicting the most frequent class.
for split_name, split_labels in (('train', y_train), ('test', y_test)):
    print('acc ' + split_name + ':', null_acc(split_labels))

acc train: 0.3951890034364261
acc test: 0.33544303797468356


## Logistic Regression

In [8]:
# Import the estimator explicitly: a bare `import sklearn` is not guaranteed
# to load the `sklearn.linear_model` submodule, so attribute access on the
# package only works if some other module happened to import it first.
from sklearn.linear_model import LogisticRegression

# NOTE(review): the recorded repr reports solver='liblinear'; scikit-learn
# documents n_jobs as having no effect with that solver — confirm whether
# n_jobs=2 is still wanted.
logreg = LogisticRegression(C=1.0, max_iter=100, n_jobs=2)
logreg.fit(data_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=2,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
# Display the fitted coefficient matrix (7 rows x 9 columns in the recorded
# output).
logreg.coef_

array([[ 0.22457466,  0.43948792, -0.26185993, -0.82029758, -1.19206779,
         1.40746356,  0.5656693 , -0.64466193, -0.90217353],
       [-1.02194912, -1.34278266,  1.21066623,  0.96252042,  0.37297587,
        -0.88031193,  0.89368225, -1.3657322 ,  1.39492158],
       [ 1.30662512, -0.58647011,  0.37402214,  0.15644162, -1.55369588,
         0.04955791, -0.52736235, -0.27016482, -0.33191972],
       [-0.23836844,  1.73642814, -0.24225152, -0.70562363, -0.24501599,
         1.08265998, -0.52485589, -0.19036835, -0.61391072],
       [-0.65939469,  0.2266435 , -0.36361769, -0.39197227, -0.32206543,
        -1.04133017, -0.24770463,  0.18770769,  0.67372841],
       [-0.44397304, -0.75385378, -3.15828118,  0.00546681,  0.55295501,
        -0.73306548, -1.31927554,  0.53443706, -1.95426283],
       [ 0.21905238, -0.70549327,  0.0931602 , -0.48216327, -0.84946616,
        -0.99551816, -0.19199932, -1.59805084, -0.32892676]])

In [10]:
# Print train and test accuracy of the fitted logistic regression model.
analyze_results(logreg)

acc train: 0.477663230241
acc test: 0.462025316456


## SVM

In [11]:
# Import the estimator explicitly: a bare `import sklearn` is not guaranteed
# to load the `sklearn.svm` submodule, so attribute access on the package
# only works if some other module happened to import it first.
from sklearn.svm import SVC

svm = SVC(C=20.0)
svm.fit(data_train, y_train)

SVC(C=20.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
# Print train and test accuracy of the fitted SVM.
analyze_results(svm)

acc train: 0.563573883162
acc test: 0.512658227848


## Decision Tree

In [14]:
from sklearn.tree import DecisionTreeClassifier

In [17]:
# Fit a decision tree with the estimator's default settings.
# NOTE(review): no random_state is passed (the recorded repr shows
# random_state=None), so repeated runs may not reproduce the same tree. The
# random forest cell pins random_state=1337 — confirm whether this cell
# should do the same.
dtc = DecisionTreeClassifier()
dtc.fit(data_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [18]:
# Print train and test accuracy of the fitted decision tree. In the recorded
# run it scores 0.993 on train but 0.323 on test, which is below the 0.335
# null accuracy of the test split.
analyze_results(dtc)

acc train: 0.993127147766
acc test: 0.322784810127


## Random Forest

In [71]:
from sklearn.ensemble import RandomForestClassifier

# Use the class imported above rather than reaching through the `sklearn`
# package. random_state is pinned so the forest is reproducible between runs.
rf = RandomForestClassifier(n_estimators=100, random_state=1337)
rf.fit(data_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=1337,
            verbose=0, warm_start=False)

In [72]:
# Print train and test accuracy of the fitted random forest.
analyze_results(rf)

acc train: 0.993127147766
acc test: 0.46835443038


## Neural Network (MLP)

In [19]:
from sklearn.neural_network import MLPClassifier

In [63]:
# Fit a multi-layer perceptron with a single hidden layer of 100 units and at
# most 200 training iterations.
# NOTE(review): no random_state is passed (the recorded repr shows
# random_state=None), so the accuracies printed afterwards may vary between
# runs — confirm whether a fixed seed is wanted here.
mlp = MLPClassifier(max_iter=200, hidden_layer_sizes=(100,))
mlp.fit(data_train, y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [64]:
# Print train and test accuracy of the fitted MLP.
analyze_results(mlp)

acc train: 0.525773195876
acc test: 0.487341772152
