<a href="https://colab.research.google.com/github/mukul-mschauhan/classification/blob/main/Gradient_Boosting_Naive_Bayes_%26_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Create the Dataset
df = pd.DataFrame({"Exp":[2,3,5, 6],
                   "Edu":["BE", "Ms", "Ms", "Phd"],
                  "Salary":[50000, 70000, 80000, 100000]})
df

Unnamed: 0,Exp,Edu,Salary
0,2,BE,50000
1,3,Ms,70000
2,5,Ms,80000
3,6,Phd,100000


### Find the Base Learner and Calculate the Residual1

In [3]:
df['xbar_salary'] = df["Salary"].mean()
df["Residual1"] = df["Salary"] - df["xbar_salary"]
df

Unnamed: 0,Exp,Edu,Salary,xbar_salary,Residual1
0,2,BE,50000,75000.0,-25000.0
1,3,Ms,70000,75000.0,-5000.0
2,5,Ms,80000,75000.0,5000.0
3,6,Phd,100000,75000.0,25000.0


### Lets build our First Decision Tree and keep the Max Depth as 3

In [5]:
# Prepare the Data for Model
X = df.drop(["Salary",
             "xbar_salary", "Residual1"], axis=1)

dummyx = pd.get_dummies(X, drop_first = True)
y = df.Residual1

In [9]:
from sklearn.tree import DecisionTreeRegressor
dtree = DecisionTreeRegressor(max_depth = 3)

pred = dtree.fit(dummyx, y).predict(dummyx)

lrate = 0.1
df["Residual2"] = lrate * pred

### 2nd Decision Tree

In [10]:
pred = dtree.fit(dummyx, df["Residual2"]).predict(dummyx)
lrate = 0.1
df["Residual3"] = lrate * pred

In [14]:
# Final Pred = base_learner + R2+R3
df["Final_Pred"] = df.xbar_salary+df["Residual2"]+df["Residual3"]
df

Unnamed: 0,Exp,Edu,Salary,xbar_salary,Residual1,Residual2,Residual3,Final_Pred
0,2,BE,50000,75000.0,-25000.0,-2500.0,-250.0,72250.0
1,3,Ms,70000,75000.0,-5000.0,-500.0,-50.0,74450.0
2,5,Ms,80000,75000.0,5000.0,500.0,50.0,75550.0
3,6,Phd,100000,75000.0,25000.0,2500.0,250.0,77750.0


### Gradient Boosting Regressor

In [16]:
from sklearn.ensemble import GradientBoostingRegressor
gbm = GradientBoostingRegressor(n_estimators = 2)

pred_salary = gbm.fit(dummyx, df["Salary"]).predict(dummyx)
df["GBM_Output"] = pred_salary
df

Unnamed: 0,Exp,Edu,Salary,xbar_salary,Residual1,Residual2,Residual3,Final_Pred,GBM_Output
0,2,BE,50000,75000.0,-25000.0,-2500.0,-250.0,72250.0,70250.0
1,3,Ms,70000,75000.0,-5000.0,-500.0,-50.0,74450.0,74050.0
2,5,Ms,80000,75000.0,5000.0,500.0,50.0,75550.0,75950.0
3,6,Phd,100000,75000.0,25000.0,2500.0,250.0,77750.0,79750.0


In [22]:
# Default Setting of GBM
gbm = GradientBoostingRegressor(n_estimators =16)

pred_salary = gbm.fit(dummyx, df["Salary"]).predict(dummyx)
df["GBM_Default"] = pred_salary
df

Unnamed: 0,Exp,Edu,Salary,xbar_salary,Residual1,Residual2,Residual3,Final_Pred,GBM_Output,GBM_Default
0,2,BE,50000,75000.0,-25000.0,-2500.0,-250.0,72250.0,70250.0,54632.550472
1,3,Ms,70000,75000.0,-5000.0,-500.0,-50.0,74450.0,74050.0,70926.510094
2,5,Ms,80000,75000.0,5000.0,500.0,50.0,75550.0,75950.0,79073.489906
3,6,Phd,100000,75000.0,25000.0,2500.0,250.0,77750.0,79750.0,95367.449528


In [8]:
import seaborn as sns
titanic = sns.load_dataset("titanic")
# Predictor and Target
X = titanic.pclass
y = titanic.survived

# Split the Data in train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
                                                    random_state = 0)

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train.values.reshape(-1,1), y_train)
y_pred = knn.predict(X_test.values.reshape(-1,1))
accuracy_score(y_test, y_pred)

0.39664804469273746

In [11]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train.values.reshape(-1,1), y_train)
y_pred = dtree.predict(X_test.values.reshape(-1,1))
accuracy_score(y_test, y_pred)

0.7150837988826816

In [12]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train.values.reshape(-1,1), y_train)
y_pred = nb.predict(X_test.values.reshape(-1,1))
accuracy_score(y_test, y_pred)

0.7150837988826816

## Finding the Optima K Value using Grid Search CV

In [21]:
from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier()
params = {'n_neighbors':range(1, 50)}
grid = GridSearchCV(knn, param_grid=params, cv = 5,
                              scoring = "accuracy")
grid.fit(X.values.reshape(-1,1), y)

# Finding the kvalue
print(grid.best_params_)

{'n_neighbors': 20}


In [20]:
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train.values.reshape(-1,1), y_train)
y_pred = knn.predict(X_test.values.reshape(-1,1))
accuracy_score(y_test, y_pred)

0.6145251396648045