# Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Loading CSV

In [3]:
# Load the benchmark results and strip stray whitespace from the CSV header
# names in the same expression, so `dataset` is never mutated after creation.
dataset = (
    pd.read_csv('../results/Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz/census-income/Intersect_dataset.csv')
    .rename(columns=str.strip)
)
dataset.head()

Unnamed: 0,range,n1,average1,std1,n2,average2,std2,skewed_1,skewed_2,non_skewed
0,65535,169096,19337.2,21706.3,169096,19337.2,21706.3,4211,1970,155304
1,65535,169096,19337.2,21706.3,46,20595.6,23630.1,40268,274,7254
2,65535,169096,19337.2,21706.3,17648,19492.5,21753.7,2630,555,46111
3,65535,169096,19337.2,21706.3,240934,19336.5,21696.3,4364,2876,543650
4,65535,169096,19337.2,21706.3,3018,19249.9,21904.1,2188,568,15549


# Building Dataset

In [4]:
# Columns used as model inputs. Their order fixes the column positions in X,
# which later code relies on (e.g. feature_cols.index('n1') to locate n1 in a row).
feature_cols = ['range', 'n1', 'n2']

In [5]:
# Feature matrix as a plain NumPy array, one column per entry of feature_cols.
X = dataset[feature_cols].values
X.shape

(40000, 3)

In [6]:
# Candidate algorithms; each name is also the dataset column that holds the
# measured time of that algorithm for every benchmark row.
algos = ['skewed_1', 'skewed_2', 'non_skewed']

# One Series per algorithm, in the same order as `algos`. Built with a
# comprehension so no loop temporaries leak into the notebook namespace.
ys = [dataset.loc[:, algo_name] for algo_name in algos]
np.array(ys).shape

(3, 40000)

In [7]:
def getClassificationY():
    """Label every benchmark row with its fastest algorithm.

    Reads the module-level `ys` (one sequence of times per algorithm) and
    `algos` (the matching algorithm names, same order).

    Returns:
        cy: NumPy array of algorithm names, one per row, naming the algorithm
            with the smallest time. Ties go to the algorithm listed first in
            `algos`.
        cyTimes: list with the smallest time of each row.

    The lookup is positional (the Series are stacked into a 2-D array), so it
    does not depend on the DataFrame index being the default 0..n-1 range.
    NOTE(review): rows containing NaN times are not expected here; with NaN
    present, argmin/min select the NaN entry rather than skipping it.
    """
    # Shape (number of algorithms, number of rows).
    times = np.asarray(ys)
    # argmin returns the first minimum, matching a strict "<" scan over algos.
    fastest = times.argmin(axis=0)
    cy = np.asarray(algos)[fastest]
    cyTimes = times.min(axis=0).tolist()
    return cy, cyTimes

In [10]:
# cy: name of the fastest algorithm for each row; cyTimes: that algorithm's time.
cy, cyTimes = getClassificationY()
cy.shape

(40000,)

In [120]:
def getBenchmarkSet():
    """Build one combined table with features, timings and the best choice.

    Each output row is laid out as:
        [features from X..., one time per entry of ys..., best time, best label]

    Reads the module-level X, ys, cyTimes and cy. Because the last column is a
    string label, NumPy stores the whole array with a string dtype, so numeric
    cells must be converted back (int/float) by consumers.
    """
    rows = []
    for position, features in enumerate(X.tolist()):
        row = list(features)
        row.extend(times[position] for times in ys)
        row.append(cyTimes[position])
        row.append(cy[position])
        rows.append(row)
    return np.array(rows)

In [121]:
# Combined feature/timing/label table consumed by calculateValueOfML.
# NOTE(review): the recorded shape for this cell reports 39601 rows while the
# recorded X.shape reports 40000 rows, and the execution counts are not
# sequential -- the saved outputs appear to come from different kernel states.
# Restart the kernel and run all cells before trusting the numbers.
benchmarkSet = getBenchmarkSet()
benchmarkSet.shape

(39601, 8)

In [122]:
def clfSplit():
    """Split the module-level X / cy into train and test parts.

    Returns whatever train_test_split yields for two arrays
    (X_train, X_test, y_train, y_test), holding out 20% of the rows with a
    fixed seed so the split is repeatable.
    """
    split_options = {"test_size": 0.2, "random_state": 0}
    return train_test_split(X, cy, **split_options)

# Calculating value of the project

In [123]:
def percentChange(new, old):
    """Return how much larger `old` is than `new`, as a ratio minus one.

    The result is a plain fraction (1.0 means `old` is twice `new`), not a
    value scaled to percent. `old` is passed through float() first, so numeric
    strings are accepted for it. Raises ZeroDivisionError when `new` is 0.
    """
    ratio = float(old) / new
    return ratio - 1

In [124]:
def getAlgoTime(algoName, row):
    """Return the time recorded in `row` for the algorithm `algoName`.

    `row` is indexed as: the feature columns first (as many as X has), then
    one time per algorithm in `algos` order. The selected cell is converted
    with int().

    If `algoName` is not in `algos`, a message is printed and None is
    returned implicitly.
    """
    if algoName in algos:
        column = len(X[0]) + algos.index(algoName)
        return int(row[column])
    print('oups none of the available algos')

In [125]:
def getOldSchoolAlgo(n1, n2, threshold=64):
    """Pick an algorithm with the fixed size-ratio heuristic.

    Args:
        n1: size of the first input (number or numeric string).
        n2: size of the second input (number or numeric string).
        threshold: how many times larger one side must be than the other
            before a skewed variant is chosen. Defaults to 64, the value
            previously hard-coded here.

    Returns:
        'skewed_1' when n2 is more than `threshold` times n1,
        'skewed_2' when n1 is more than `threshold` times n2,
        'non_skewed' otherwise (including the exact-threshold boundary).
    """
    # Rows of the benchmark table hold numbers as strings, hence float().
    n1 = float(n1)
    n2 = float(n2)
    if n1 * threshold < n2:
        return 'skewed_1'
    if n2 * threshold < n1:
        return 'skewed_2'
    return 'non_skewed'

In [126]:
def calculateValueOfML():
    """Compare a decision tree's algorithm picks with the threshold heuristic.

    Splits the module-level benchmarkSet 70/30 with a fixed seed, trains a
    DecisionTreeClassifier on the feature columns against the best-algorithm
    label (last column), then walks the held-out rows and sums:
      * the time of the algorithm the tree predicted,
      * the time of the algorithm getOldSchoolAlgo picks from n1/n2,
      * the best achievable time (second-to-last column).

    Prints the share of exact label matches, both totals, and the
    percentChange of the heuristic total relative to the tree total and to
    the best achievable total. Returns nothing.
    """
    # Every row already carries features, timings and label, so the table is
    # split once and both model inputs and targets are sliced from it.
    trainRows, testRows = train_test_split(benchmarkSet, test_size=0.3, random_state=0)
    featureCount = len(X[0])

    clf = DecisionTreeClassifier(max_depth=None, random_state=0)
    clf.fit(trainRows[:, :featureCount], trainRows[:, -1])

    mlChosenAlgos = clf.predict(testRows[:, :featureCount])

    # Column positions of the two input sizes inside a row.
    n1Column = feature_cols.index('n1')
    n2Column = feature_cols.index('n2')

    matches = 0
    mlTotTime = 0
    osTotTime = 0
    perfectTotTime = 0
    for row, predicted in zip(testRows, mlChosenAlgos):
        if predicted == row[-1]:
            matches += 1
        mlTotTime += getAlgoTime(predicted, row)
        heuristicChoice = getOldSchoolAlgo(row[n1Column], row[n2Column])
        osTotTime += getAlgoTime(heuristicChoice, row)
        perfectTotTime += int(row[-2])

    print("Average classification precision: %.2f" % (float(matches) / len(mlChosenAlgos)))
    print("Total ml time: %d" % mlTotTime)
    print("Total old school time: %d" % osTotTime)
    print("Machine learning is faster by : %.2f" % percentChange(mlTotTime, osTotTime))
    print("With perfect prediction : %.2f" % percentChange(perfectTotTime, osTotTime))

In [127]:
# Train on 70% of the benchmark rows and print how the tree's picks compare
# with the fixed-threshold heuristic on the remaining 30%.
# NOTE(review): the first line of the recorded output (a raw row array) has no
# matching print in the current function -- it looks like leftover output from
# an earlier debugging version; re-run to refresh.
calculateValueOfML()

['65535.0' '240934.0' '9764.0' '3096' '1253' '41042' '1253' 'skewed_2']
Average classification precision: 0.87
Total ml time: 10222464
Total old school time: 438668374
Machine learning is faster by : 41.91
With perfect prediction : 44.55


# Serialize model

In [None]:
# Disabled cell: fits the tree on the full dataset (X, cy) and writes it to disk.
# NOTE(review): `sklearn.externals.joblib` was removed from scikit-learn (0.23);
# when re-enabling, import the standalone package instead (`import joblib`) and
# move that import to the imports cell at the top.
# from sklearn.externals import joblib

# clf = DecisionTreeClassifier(max_depth=None, random_state=0)
# clf.fit(X, cy)
# NOTE(review): '.joblib' has no base name, so this would create a hidden
# dot-file in the working directory; choose an explicit model file name.
# joblib.dump(clf, '.joblib')