In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import EnsembleClassificationInit as Ensemble #reuse trained
import numpy as np

In [2]:
# Dataset registry.
# Each entry maps a short dataset name to a three-item list:
#   [0] URL of the CSV file,
#   [1] names of the feature columns to use,
#   [2] name of the target (label) column.
data = {
    'car': [
        'https://raw.githubusercontent.com/ostapkharysh/PublicDatasets/master/Classification/car.csv',
        ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'],
        'class',
    ],
    'obesity': [
        'https://raw.githubusercontent.com/ostapkharysh/PublicDatasets/master/Classification/ObesityDataSet_raw_and_data_sinthetic.csv',
        [
            'Gender', 'Age', 'Height', 'Weight',
            'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP',
            'CAEC', 'SMOKE', 'CH2O', 'SCC',
            'FAF', 'TUE', 'CALC', 'MTRANS',
        ],
        'NObeyesdad',
    ],
    'chess': [
        'https://raw.githubusercontent.com/ostapkharysh/PublicDatasets/master/Classification/kr-vs-kp.csv',
        # NOTE(review): '13' and '15' are not in this feature list --
        # presumably intentional; confirm against the CSV header.
        [
            '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
            '14', '16', '17', '18', '19', '20', '21', '22', '23', '24',
            '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35',
        ],
        'win',
    ],
    'tic-tac-toe': [
        'https://raw.githubusercontent.com/ostapkharysh/PublicDatasets/master/Classification/tic-tac-toe.csv',
        ['tla', 'tms', 'trs', 'mls', 'mms', 'mrs', 'bls', 'bms', 'brs'],
        'class',
    ],
}

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

#from sklearn import metrics

def class_train(x_train, y_train):
    """
    Fit a fixed set of classifiers on the same training data.

    Parameters
    ----------
    x_train : feature matrix passed unchanged to every estimator's ``fit``.
    y_train : target labels passed unchanged to every estimator's ``fit``.

    Returns
    -------
    dict
        Fitted estimators keyed by short name, in this order:
        'mlr', 'svm', 'sgd', 'rfc', 'multNB', 'bernNB', 'knn', 'ada'.
        The 'knn' entry is the fitted ``GridSearchCV`` wrapper, not a bare
        ``KNeighborsClassifier``.
    """
    # Search space for k-NN: leaf sizes 10..49, 1..19 neighbours, and the
    # Minkowski power p in {1, 2}; evaluated with 10-fold cross-validation.
    knn_search_space = dict(
        leaf_size=list(range(10, 50)),
        n_neighbors=list(range(1, 20)),
        p=[1, 2],
    )

    # Order matters: estimators are fitted (and inserted into the result)
    # in exactly this sequence.
    candidates = [
        ('mlr', LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000000)),
        ('svm', svm.SVC(probability=True)),
        # NOTE(review): newer scikit-learn releases spell this loss
        # 'log_loss' -- confirm against the installed version.
        ('sgd', SGDClassifier(loss='log')),
        ('rfc', RandomForestClassifier()),
        ('multNB', MultinomialNB()),  # multinomial Naive Bayes
        ('bernNB', BernoulliNB()),  # Bernoulli Naive Bayes
        ('knn', GridSearchCV(KNeighborsClassifier(), knn_search_space, cv=10)),
        ('ada', AdaBoostClassifier()),
    ]

    estimators = {}
    for name, model in candidates:
        model.fit(x_train, y_train)
        estimators[name] = model

    return estimators

def class_scoring(estim, y, yhat):
    """
    Score each prediction as one minus the probability given to the true class.

    Parameters
    ----------
    estim : object exposing ``classes_`` (with a ``tolist()`` method); the
        position of a label in that list selects the probability column.
    y : iterable of true labels, one per row of ``yhat``.
    yhat : row-indexable collection of per-class probabilities, with
        columns in the same order as ``estim.classes_``.

    Returns
    -------
    list
        ``1 - yhat[i][column_of(y[i])]`` for every element of ``y``.

    Raises
    ------
    ValueError
        If a label in ``y`` is not present in ``estim.classes_``.
    """
    column_of = estim.classes_.tolist().index
    # map() is lazy, so each label is resolved to its column right before
    # the matching probability row is read.
    return [
        1 - yhat[row][column]
        for row, column in enumerate(map(column_of, y))
    ]

def class_prediction(df, df_competition, line, X, Y):
    """
    Train every classifier on ``df`` and report its accuracy on ``df_competition``.

    Parameters
    ----------
    df : DataFrame used for training; indexed with ``X`` and ``Y``.
    df_competition : held-out DataFrame, indexed with the same ``X`` and ``Y``.
    line : unused; kept only so existing callers keep working.
    X : column label(s) selecting the feature columns.
    Y : column label selecting the target column.

    Returns
    -------
    tuple
        ``(accuracy, estimators)`` where ``accuracy`` maps each estimator
        name returned by ``class_train`` to its accuracy on
        ``df_competition``, and ``estimators`` is the dict of fitted models.
    """
    ######### TRAIN THE MODEL ###########
    x_train = df[X]
    y_train = df[Y]
    estimators = class_train(x_train.values, y_train.values)

    ######### TEST THE MODEL ###########
    # Computed once: these do not change between estimators.
    x_competition = df_competition[X].values
    y_competition = df_competition[Y].tolist()

    ######### SCORE THE MODEL ###########
    # Keys come from class_train itself, so the two cannot drift apart.
    accuracy = {}
    for est, model in estimators.items():
        y_est = model.predict(x_competition)
        # accuracy_score expects (y_true, y_pred).
        accuracy[est] = accuracy_score(y_competition, y_est.tolist())

    return accuracy, estimators

In [4]:
# Per-dataset experiment: load, one-hot encode, split 70/30, score the
# individual estimators, then score the ensemble with and without priority
# for each beta value.
results = {}

for idx, el in enumerate(data):
    print("Dataset: " + el)

    # NOTE(review): nothing in this cell ever appends to this list --
    # confirm whether a later cell is expected to fill it.
    results[el] = list()

    url, feature_columns, target = data[el]

    df = pd.read_csv(url, encoding="ISO-8859-1")
    df = df.dropna()

    # One-hot encode the features; X is captured before the target column
    # is attached, so it lists feature columns only.
    new_df = pd.get_dummies(df[feature_columns])
    X = new_df.columns
    new_df[target] = df[target]

    # NOTE(review): no random_state is passed, so every run draws a
    # different split and the printed scores are not reproducible.
    ds, competition = train_test_split(new_df, test_size=0.3)
    ds = ds.reset_index(drop=True)
    competition = competition.reset_index(drop=True)

    # Estimators to compare
    m_accuracy, estimators = class_prediction(ds, competition, idx, X=X, Y=target)
    print("Estimators Finished")
    print(m_accuracy)

    # Same comparison for each beta: assembly without priority first,
    # then with priority.
    for beta in (3, 5):
        ensemble = Ensemble.analyze_assembly(learning_ds=ds, competition_ds=competition, X=X, Y=target,
                                             priority=False, beta=beta)
        ensemble_priority = Ensemble.analyze_assembly(learning_ds=ds, competition_ds=competition, X=X, Y=target,
                                                      priority=True, beta=beta)
        print('Beta = ' + str(beta))
        print("Ensemble without priority: " + str(ensemble) + " Ensemble with priority: " + str(ensemble_priority))

Dataset: car
Estimators Finished
{'mlr': 0.9171483622350675, 'svm': 0.9730250481695568, 'sgd': 0.905587668593449, 'rfc': 0.9672447013487476, 'multNB': 0.8554913294797688, 'bernNB': 0.8631984585741811, 'knn': 0.9460500963391136, 'ada': 0.7957610789980732}
Beta 3
[403, 806, 1209]
Beta 3
[403, 806, 1209]
Beta = 3
Ensemble without priority: 0.7610789980732178 Ensemble with priority: 0.7976878612716763
Beta 5
[241, 482, 723, 964, 1205]




Beta 5
[241, 482, 723, 964, 1205]




Beta = 5
Ensemble without priority: 0.8554913294797688 Ensemble with priority: 0.9633911368015414
Dataset: obesity
Estimators Finished
{'mlr': 0.831230283911672, 'svm': 0.5347003154574133, 'sgd': 0.556782334384858, 'rfc': 0.9589905362776026, 'multNB': 0.61198738170347, 'bernNB': 0.556782334384858, 'knn': 0.9100946372239748, 'ada': 0.31703470031545744}
Beta 3
[492, 984, 1476]


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


Beta 3
[492, 984, 1476]
Beta = 3
Ensemble without priority: 0.44637223974763407 Ensemble with priority: 0.9022082018927445
Beta 5
[295, 590, 885, 1180, 1475]
Beta 5
[295, 590, 885, 1180, 1475]


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


Beta = 5
Ensemble without priority: 0.7981072555205048 Ensemble with priority: 0.9069400630914827
Dataset: chess
Estimators Finished
{'mlr': 0.959332638164755, 'svm': 0.9468196037539103, 'sgd': 0.9562043795620438, 'rfc': 0.9708029197080292, 'multNB': 0.8498435870698644, 'bernNB': 0.8508863399374348, 'knn': 0.9249217935349322, 'ada': 0.9572471324296142}
Beta 3
[745, 1490, 2235]
Beta 3
[745, 1490, 2235]
Beta = 3
Ensemble without priority: 0.8717413972888426 Ensemble with priority: 0.9655891553701773
Beta 5
[447, 894, 1341, 1788, 2235]
Beta 5
[447, 894, 1341, 1788, 2235]
Beta = 5
Ensemble without priority: 0.9124087591240876 Ensemble with priority: 0.9676746611053181
Dataset: tic-tac-toe
Estimators Finished
{'mlr': 0.9826388888888888, 'svm': 0.9756944444444444, 'sgd': 0.9791666666666666, 'rfc': 0.9722222222222222, 'multNB': 0.6770833333333334, 'bernNB': 0.6666666666666666, 'knn': 0.96875, 'ada': 0.8576388888888888}
Beta 3
[223, 446, 669]
Beta 3
[223, 446, 669]
Beta = 3
Ensemble without pr