## Adaboost

In [105]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [106]:
df = data = pd.read_csv("data_hw.csv")
data.drop("Unnamed: 0", axis=1, inplace=True)
data.shape

(12563, 8)

### Preparing data

In [107]:
data['area_type'] = data['area_type'].replace(['B'],1)
data['area_type'] = data['area_type'].replace(['P'],-1)
data.head()

Unnamed: 0,area_type,availability,bedrooms,total_sqft,bath,balcony,ranked,price in rupees
0,1,1,3.0,1655.0,3.0,1.0,134,10800000.0
1,1,1,2.0,1102.0,2.0,1.0,134,4800000.0
2,1,0,2.0,1112.0,2.0,1.0,242,8800000.0
3,1,1,3.0,1450.0,3.0,3.0,335,5100000.0
4,1,1,2.0,1010.0,2.0,1.0,261,4100000.0


In [108]:
x_train = data.iloc[0:8040,1:].to_numpy()
y_train = data.iloc[0:8040, :1].to_numpy()
y_train = y_train.flatten()

x_val = data.iloc[8041:10050, 1:].to_numpy()
y_val = data.iloc[8041:10050, :1].to_numpy()
y_val = y_val.flatten()

x_test = data.iloc[10051:12563, 1:].to_numpy()
y_test = data.iloc[10051:12563, :1].to_numpy()
y_test = y_test.flatten()

x_test.shape

(2512, 7)

### Adaboost algorithm 

In [109]:

class DecisionStump:
    def __init__(self,polarity,feature,threshold):
        self.polarity = polarity if polarity else None
        self.feature = feature if feature else None
        self.threshold = threshold if threshold else None
        self.amount_of_say = None

    def predict(self, X):
        n_samples = X.shape[0]
        X_column = X[:, self.feature]
        predictions = np.ones(n_samples)
        if self.polarity == 1:
            predictions[X_column < self.threshold] = -1
        else:
            predictions[X_column > self.threshold] = -1

        return predictions





In [126]:
class Adaboost:
    
    def __init__(self, n_estimator=10):
        self.n_estimator = n_estimator
        
        
    def get_amount_of_say(self,total_error):
        small_error_term = 0.00000000000000001
        aos = np.log((1.0 - (total_error + small_error_term) ) / (total_error + small_error_term ))
        return 0.5 * aos
    
    
    def update_weight(self,w,say,y,stump_pred):
        new_weight = np.exp(-say*y*stump_pred)
        new_weight/=new_weight.sum()
        return new_weight
    
    
    def best_split_feature(self,X,Y,w):
        min_error = float("inf")
        stump_best = {}
        for ft in range(X.shape[1]):
            x = X[:, ft]
            thresholds = np.unique(x)  
            for threshold in thresholds:
                p = 1
                predictions = np.ones(X.shape[0])
                predictions[x < threshold] = -1

                error = sum(w[Y != predictions])
                if error > 0.5:
                    error = 1 - error
                    p = -1
                if error < min_error:
                    stump_best['polarity'] = p
                    stump_best['feature'] = ft
                    stump_best['threshold'] = threshold
                    min_error = error
        return stump_best, min_error
    
        
    def fit(self, X, y):
        init_weight = np.full(X.shape[0], (1 / X.shape[0]))
        w = init_weight
        stumps = []
        for i in range(self.n_estimator):
            best_stump,min_error = self.best_split_feature(X,y,w)
            stump = DecisionStump(best_stump['polarity'],best_stump['feature'],best_stump['threshold'])
            stump.amount_of_say = self.get_amount_of_say(min_error)
            stump_pred = stump.predict(X)

            w = self.update_weight(w, stump.amount_of_say, y, stump_pred)
            stumps.append(stump)
        return stumps

    
    def predict(self, x, stumps):
        y_pred = []
        for s in stumps:
            pred = s.amount_of_say*s.predict(x)
            y_pred.append(pred)
            
        y_pred = np.sum(y_pred, axis=0)
        y_pred = np.sign(y_pred)

        return y_pred



 ## Training the model - before optimization

In [127]:
%%time
clf = Adaboost(n_estimator=10)
stumps = clf.fit(x_train, y_train)


Wall time: 19.1 s


 ## Testing the model - before optimization


In [128]:
y_pred = clf.predict(x_test,stumps)

acc = accuracy_score(y_test, y_pred)
print("Accuracy - our model:", acc)


Accuracy - our model: 0.8722133757961783


## Training sklearn model - before optimization

In [124]:
%%time
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=10, random_state=0)
clf.fit(x_train, y_train)

Wall time: 65.8 ms


AdaBoostClassifier(n_estimators=10, random_state=0)

 ## Testing sklearn model - before optimization


In [125]:
y_sk = clf.predict(x_test)
acc = accuracy_score(y_test, y_sk)
print("Accuracy - sklearn:", acc)

Accuracy - sklearn: 0.8817675159235668


## Validation and hypertuning based on our model

In [134]:
%%time
grid_search = {"n_estimator": [], "accuracy": []}
for n in range(10, 100,10):
    clf_v = Adaboost(n_estimator=n)
    s = clf_v.fit(x_train, y_train)
    y_val_p = clf_v.predict(x_val,s)
    acc_v =accuracy_score(y_val, y_val_p)
    
    grid_search["n_estimator"].append(n)
    grid_search["accuracy"].append(acc_v)

    print('Rounds ',n)

Rounds  10
Rounds  20
Rounds  30
Rounds  40
Rounds  50
Rounds  60
Rounds  70
Rounds  80
Rounds  90
Wall time: 14min 52s


In [139]:
grid_search = pd.DataFrame(grid_search)
grid_search.sort_values("accuracy",ascending=False).head()

Unnamed: 0,n_estimator,accuracy
0,10,0.876058
1,20,0.876058
2,30,0.876058
3,40,0.876058
4,50,0.876058
5,60,0.876058
6,70,0.876058
7,80,0.876058
8,90,0.876058


In [136]:
i = grid_search.sort_values("accuracy",ascending=False).iloc[0]
best_n_estimator = int(i['n_estimator'])
best_n_estimator

10

## Validation and hypertuning based on sklearn model

In [142]:
%%time
grid_search_sk = {"n_estimator": [], "accuracy": []}
for n in range(10, 100,10):
    clf_v_sk = AdaBoostClassifier(n_estimators=n, random_state=0)
    clf_v_sk.fit(x_train, y_train)
    y_val_p_sk = clf_v_sk.predict(x_val)
    acc_v_sk =accuracy_score(y_val, y_val_p_sk)
    
    grid_search_sk["n_estimator"].append(n)
    grid_search_sk["accuracy"].append(acc_v_sk)

    print('Rounds ',n)

Rounds  10
Rounds  20
Rounds  30
Rounds  40
Rounds  50
Rounds  60
Rounds  70
Rounds  80
Rounds  90
Wall time: 2.37 s


In [143]:
grid_search_sk = pd.DataFrame(grid_search)
grid_search_sk.sort_values("accuracy",ascending=False).head()

Unnamed: 0,n_estimator,accuracy
0,10,0.876058
1,20,0.876058
2,30,0.876058
3,40,0.876058
4,50,0.876058


#### Got same results from both models (trying higher numbers may increase accuracy, but our model takes relevantly a long time to run)

## Testing with optimized hyper parameters 

### Testing based on our model

In [144]:
%%time
clf_opt = Adaboost(n_estimator=best_n_estimator)
stumps = clf.fit(x_train, y_train)
y_opt_pred = clf.predict(x_test,stumps)
acc_opt = accuracy_score(y_test, y_opt_pred)
print("Accuracy after tuning - our model:", acc)

Accuracy after tuning - our model: 0.8722133757961783
Wall time: 18.5 s


### Testing based on sklearn model

In [146]:
%%time

clf_opt_sk = AdaBoostClassifier(n_estimators=best_n_estimator, random_state=0)
clf_opt_sk.fit(x_train, y_train)
y_pred_sk = clf_opt_sk.predict(x_test)
acc_opt_sk = accuracy_score(y_test, y_pred_sk)
print("Accuracy - sklearn:", acc_opt_sk)

Accuracy - sklearn: 0.8817675159235668
Wall time: 58.8 ms
