# Imports

In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Loading CSV

In [62]:
# Load the results table and strip whitespace from the column names so the
# columns can be selected by their plain names below.
dataset = pd.read_csv('Results/Intersect_card_dataset.csv').rename(columns=str.strip)
dataset.head()

Unnamed: 0,range,n1,average1,median1,std1,n2,average2,median2,std2,skewed_1,skewed_2,non_skewed,vector
0,65534.0,138082,19304.207898,9964.5,21656.224203,9764,19255.718968,9477.5,21718.702149,28717,43852,33742,5799
1,65534.0,138082,19304.207898,9964.5,21656.224203,3172,19030.790038,9721.5,21432.550738,16931,18436,19497,23673
2,65534.0,138082,19304.207898,9964.5,21656.224203,1211,19819.10322,11760.0,21898.387371,14170,9127,44423,3539
3,65534.0,138082,19304.207898,9964.5,21656.224203,3376,19271.502073,10493.5,21614.362552,17498,19000,20052,4120
4,65534.0,138082,19304.207898,9964.5,21656.224203,3421,19514.070447,10119.0,21945.366501,17564,19549,20429,4132


# Building Dataset

In [63]:
# Columns of `dataset` that are fed to the classifier as inputs.
feature_cols = [
    'range',
    'n1',
    'n2',
]

In [64]:
# Feature matrix as a plain NumPy array (rows follow `dataset`, columns follow `feature_cols`).
X = dataset[feature_cols].values
X.shape

(39601, 3)

In [65]:
# Candidate algorithms; each one has a same-named column in `dataset` that the
# cells below treat as its timing.
algos = ['skewed_1', 'skewed_2', 'non_skewed']

# One Series per algorithm, in the same order as `algos`.
ys = [dataset[name] for name in algos]

# Show the shape explicitly instead of relying on a leaked loop variable.
ys[-1].shape

(39601,)

In [88]:
def getClassificationY():
    """Label every row with the algorithm that has the smallest timing.

    Reads the module-level ``ys`` (one sequence of timings per algorithm) and
    ``algos`` (the matching algorithm names, in the same order).

    Returns:
        cy: ``np.ndarray`` of algorithm names, one per row. On a tie the
            algorithm listed first in ``algos`` wins.
        cyTimes: list holding the corresponding smallest timing per row.
    """
    # Work on plain arrays so rows are addressed by position. `y[i]` on a
    # pandas Series is a label lookup and raises KeyError as soon as the index
    # is not exactly 0..n-1 (e.g. after filtering rows).
    columns = [np.asarray(y) for y in ys]

    cy = []
    cyTimes = []
    for rowTimes in zip(*columns):
        minVal = float("inf")
        minName = ""
        for name, value in zip(algos, rowTimes):
            # Strict comparison keeps the first algorithm on ties.
            if minVal > value:
                minVal = value
                minName = name
        cy.append(minName)
        cyTimes.append(minVal)
    return np.array(cy), cyTimes

In [90]:
# `cy` and `cyTimes` are read by getBenchmarkSet() and clfSplit(); they must be
# defined here for the notebook to run top to bottom on a fresh kernel.
cy, cyTimes = getClassificationY()
cy.shape

(39601,)

In [67]:
def getBenchmarkSet():
    """Assemble one combined table from the module-level ``X``, ``ys``, ``cyTimes`` and ``cy``.

    Each output row is: the feature values from ``X``, then one timing per
    entry of ``ys``, then the best timing (``cyTimes``) and finally the name of
    the best algorithm (``cy``). The rows are handed to ``np.array`` as-is, so
    mixing numbers with names yields a string-typed array.
    """
    rows = []
    for idx, features in enumerate(X.tolist()):
        row = list(features)
        row.extend(series[idx] for series in ys)
        row += [cyTimes[idx], cy[idx]]
        rows.append(row)
    return np.array(rows)

In [68]:
# Column layout: the feature_cols values, one timing per entry of `algos`,
# the best timing, and the name of the best algorithm.
benchmarkSet = getBenchmarkSet()
benchmarkSet.shape

(39601, 8)

In [69]:
def clfSplit():
    """Split the module-level features ``X`` and labels ``cy`` into train and test parts.

    Holds out 20% of the rows with a fixed seed of 0 and returns whatever
    ``train_test_split`` returns for two input arrays
    (``X_train, X_test, y_train, y_test``).
    """
    split = train_test_split(X, cy, random_state=0, test_size=0.2)
    return split

# Calculating value of the project

In [70]:
def percentChange(new, old):
    """Return how much larger ``old`` is than ``new``, as a fraction of ``new``.

    Computes ``old / new - 1``. Despite the name, the result is a fraction
    (0.61 means 61%), not a value multiplied by 100. ``old`` is converted with
    ``float`` first; a ``new`` of zero raises ``ZeroDivisionError``.
    """
    ratio = float(old) / new
    return ratio - 1

In [71]:
def getAlgoTime(algoName, row):
    """Return the timing stored in ``row`` for the algorithm ``algoName``.

    ``row`` is expected to follow the layout built by ``getBenchmarkSet``: the
    feature values first, then one timing per entry of the module-level
    ``algos`` in the same order. The value is converted with ``int``.

    Raises:
        ValueError: if ``algoName`` is not one of ``algos``. Previously this
            case printed a message and returned ``None``, which only failed
            later as an opaque ``TypeError`` when the caller summed the result.
    """
    if algoName not in algos:
        raise ValueError(
            "unknown algorithm %r; expected one of %r" % (str(algoName), algos)
        )
    # Timings start right after the feature columns.
    return int(row[len(X[0]) + algos.index(algoName)])

In [72]:
def getOldSchoolAlgo(n1, n2, threshold=64):
    """Pick an algorithm with a fixed size-ratio rule (the non-ML baseline).

    Args:
        n1, n2: the two ``n1`` / ``n2`` feature values; anything ``float``
            accepts, including numeric strings.
            NOTE(review): presumably the sizes of the two inputs - confirm.
        threshold: ratio beyond which one side counts as skewed. Defaults to
            64, the value that used to be hard-coded.

    Returns:
        ``'skewed_1'`` when ``n2`` is more than ``threshold`` times ``n1``,
        ``'skewed_2'`` when ``n1`` is more than ``threshold`` times ``n2``,
        otherwise ``'non_skewed'``.
    """
    n1 = float(n1)
    n2 = float(n2)
    if n1 * threshold < n2:
        return 'skewed_1'
    if n2 * threshold < n1:
        return 'skewed_2'
    return 'non_skewed'

In [85]:
def calculateValueOfML(test_size=0.3, random_state=0):
    """Compare ML-driven algorithm selection with the fixed-threshold rule.

    Splits ``benchmarkSet`` into train and test rows, fits a decision tree on
    the feature columns to predict the best algorithm, and then sums over the
    test rows the timing of (a) the algorithm the tree predicted, (b) the
    algorithm chosen by ``getOldSchoolAlgo`` and (c) the best algorithm.
    The summary is printed; nothing is returned.

    The printed "precision" is the fraction of test rows whose prediction
    equals the stored label. The "faster by" figures are ``percentChange``
    results, i.e. fractions rather than percentages.

    Args:
        test_size: share of rows held out for evaluation (default 0.3).
        random_state: seed used for both the split and the tree (default 0).
    """
    nFeatures = len(X[0])
    # Column positions do not change per row, so look them up once.
    n1Col = feature_cols.index('n1')
    n2Col = feature_cols.index('n2')

    # Every row already carries features, timings and label, so one split of
    # the table is enough.
    trainRows, testRows = train_test_split(
        benchmarkSet, test_size=test_size, random_state=random_state
    )

    # benchmarkSet is string-typed (getBenchmarkSet mixes numbers with names);
    # convert the features explicitly instead of relying on implicit coercion.
    clf = DecisionTreeClassifier(max_depth=None, random_state=random_state)
    clf.fit(trainRows[:, :nFeatures].astype(float), trainRows[:, -1])

    mlChosenAlgos = clf.predict(testRows[:, :nFeatures].astype(float))

    GoodPredictionsCount = 0
    mlTotTime = 0
    osTotTime = 0
    perfectTotTime = 0
    for row, algo in zip(testRows, mlChosenAlgos):
        if algo == row[-1]:
            GoodPredictionsCount += 1
        mlTotTime += getAlgoTime(algo, row)
        osAlgo = getOldSchoolAlgo(row[n1Col], row[n2Col])
        osTotTime += getAlgoTime(osAlgo, row)
        # Second-to-last column holds the best timing for the row.
        perfectTotTime += int(row[-2])

    print("Average classification precision: %.2f" % (float(GoodPredictionsCount)/len(mlChosenAlgos)))
    print("Total ml time: %d" % mlTotTime)
    print("Total old school time: %d" % osTotTime)
    print("Machine learning is faster by : %.2f" % percentChange(mlTotTime, osTotTime))
    print("With perfect prediction : %.2f" % percentChange(perfectTotTime, osTotTime))

In [86]:
# Train on 70% of the benchmark rows and report totals on the held-out 30%.
calculateValueOfML()

Average classification precision: 0.84
Total ml time: 281515200
Total old school time: 453854075
Machine learning is faster by : 0.61
With perfect prediction : 0.64


# Serialize model

In [None]:
# import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23; use the standalone joblib package

# clf = DecisionTreeClassifier(max_depth=None, random_state=0)
# clf.fit(X, cy)
# joblib.dump(clf, '.joblib')