In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset = pd.read_csv('Datasets/Phishing_dataset.csv')

In [3]:
dataset = dataset.drop('index',axis=1)

In [4]:
X = dataset.drop('Result',axis=1)
y = dataset['Result']

In [5]:
from sklearn.model_selection import train_test_split 

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [7]:
from sklearn.ensemble import GradientBoostingClassifier

In [8]:
model = GradientBoostingClassifier()

In [9]:
model.fit(X_train,y_train)

GradientBoostingClassifier()

In [11]:
prediction = model.predict(X_test)

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [13]:
print(confusion_matrix(y_test,prediction))
print('\n')
print(classification_report(y_test,prediction))
print('\n')
print(accuracy_score(y_test,prediction))

[[1392  106]
 [  74 1745]]


              precision    recall  f1-score   support

          -1       0.95      0.93      0.94      1498
           1       0.94      0.96      0.95      1819

    accuracy                           0.95      3317
   macro avg       0.95      0.94      0.95      3317
weighted avg       0.95      0.95      0.95      3317



0.9457340970756708


In [14]:
#With default para gives the accuarcy upto 95%

In [15]:
# try GridSearch

In [16]:
from sklearn.model_selection import GridSearchCV

In [26]:
param_grid = {'learning_rate': [0.01,0.02,0.03,],'subsample':[1.0,0.9,0.5,0.2],
             'n_estimators':[100,300,500,700], 'max_depth':[4,6,8]}

In [27]:
grid = GridSearchCV(GradientBoostingClassifier(),param_grid,n_jobs=-1)

In [28]:
grid.fit(X_train,y_train)

GridSearchCV(estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.02, 0.03],
                         'max_depth': [4, 6, 8],
                         'n_estimators': [100, 300, 500, 700],
                         'subsample': [1.0, 0.9, 0.5, 0.2]})

In [29]:
grid.best_params_

{'learning_rate': 0.03, 'max_depth': 6, 'n_estimators': 500, 'subsample': 1.0}

In [30]:
grid.best_estimator_

GradientBoostingClassifier(learning_rate=0.03, max_depth=6, n_estimators=500)

In [31]:
grid.best_score_

0.9702761366920054

In [32]:
gridprediction = grid.predict(X_test)

In [34]:
print(classification_report(y_test,gridprediction))

              precision    recall  f1-score   support

          -1       0.98      0.95      0.97      1498
           1       0.96      0.98      0.97      1819

    accuracy                           0.97      3317
   macro avg       0.97      0.97      0.97      3317
weighted avg       0.97      0.97      0.97      3317



In [35]:
print(accuracy_score(y_test,gridprediction))

0.968947844437745


In [36]:
import joblib

In [37]:
joblib.dump(grid,'ML_Pickel/GradientBooster.pkl')

['ML_Pickel/GradientBooster.pkl']

In [71]:
# extra test

In [38]:
param_test = {'min_samples_split':range(1000,2100,200), 'min_samples_leaf':range(30,71,10)}

In [39]:
gsearch = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=60,max_depth=9,max_features='sqrt', subsample=0.8, random_state=10),
                       param_grid = param_test, scoring='roc_auc',n_jobs=4,iid=False, cv=2)

In [40]:
gsearch.fit(X_train,y_train)



GridSearchCV(cv=2,
             estimator=GradientBoostingClassifier(max_depth=9,
                                                  max_features='sqrt',
                                                  n_estimators=60,
                                                  random_state=10,
                                                  subsample=0.8),
             iid=False, n_jobs=4,
             param_grid={'min_samples_leaf': range(30, 71, 10),
                         'min_samples_split': range(1000, 2100, 200)},
             scoring='roc_auc')

In [41]:
gsearch.best_params_

{'min_samples_leaf': 50, 'min_samples_split': 1000}

In [49]:
gsearch.best_estimator_

GradientBoostingClassifier(max_depth=9, max_features='sqrt',
                           min_samples_leaf=50, min_samples_split=1000,
                           n_estimators=60, random_state=10, subsample=0.8)

In [66]:
newmodel = GradientBoostingClassifier(learning_rate=0.005, n_estimators=1500,
                                      max_depth=9, min_samples_split=1200, min_samples_leaf=60, 
                                      subsample=0.85, random_state=10, max_features=7,warm_start=True)

In [67]:
newmodel.fit(X_train,y_train)

GradientBoostingClassifier(learning_rate=0.005, max_depth=9, max_features=7,
                           min_samples_leaf=60, min_samples_split=1200,
                           n_estimators=1500, random_state=10, subsample=0.85,
                           warm_start=True)

In [68]:
newmodel_pred = newmodel.predict(X_test)

In [69]:
print(classification_report(y_test,newmodel_pred))

              precision    recall  f1-score   support

          -1       0.95      0.93      0.94      1498
           1       0.94      0.96      0.95      1819

    accuracy                           0.94      3317
   macro avg       0.95      0.94      0.94      3317
weighted avg       0.94      0.94      0.94      3317



In [70]:
print(accuracy_score(y_test,newmodel_pred))

0.9445281881217968
