In [2]:
#import required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
# sklearn.metrics has no `f2_score`; the F2 score is fbeta_score(y_true, y_pred, beta=2).
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,fbeta_score,roc_auc_score
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Read the raw driving records into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer="rash_driving_data")

In [None]:
#Basic EDA (exploratory data analysis)
# Jupyter renders only the LAST bare expression of a cell, so every intermediate
# result is passed to display() explicitly (display is provided by the notebook kernel).

# Label column, spelled exactly as in the train/test split further down.
# Set this to the real column name of your dataset (e.g. "rash_driving").
target_col = "outcome"

display(df.head())        # first five rows
display(df.tail())        # last five rows
df.info()                 # dataset information (info() prints by itself)
display(df.shape)         # (rows, columns) -- shape is an attribute, not a method
display(df.describe())    # summary statistics
display(df.isna().sum())  # missing values per column

# Missing value treatment: replace nulls in every numeric feature with that
# column's mean. The label column is left untouched so no labels are invented.
numeric_cols = df.select_dtypes(include="number").columns
feature_cols = numeric_cols.drop(target_col, errors="ignore")
df[feature_cols] = df[feature_cols].fillna(df[feature_cols].mean())

# Correlation of every numeric column with the target variable
display(df[numeric_cols].corr()[target_col])

# Pairwise relationships between variables, coloured by the target
sns.pairplot(df, hue=target_col)

# Checking for outliers: one box per numeric feature, drawn on its own figure
# (without an explicit Axes the boxes would land on the last pairplot panel).
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df[feature_cols], ax=ax)
ax.set_title("Feature distributions (outlier check)")

# Outlier limits using mean +/- 3 standard deviations. The limits are only
# reported here; the data itself is not modified.
feature_mean = df[feature_cols].mean()
feature_std = df[feature_cols].std()
outlier_limits = pd.DataFrame({
    "lower_limit": feature_mean - 3 * feature_std,
    "upper_limit": feature_mean + 3 * feature_std,
})
display(outlier_limits)

# Separate the label from the predictors, then hold out 30% of the rows for testing
y = df["outcome"]                   # output variable (the original notes say the real column is "rash_driving")
X = df.drop(columns=["outcome"])    # every remaining column is an independent variable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Standardise the features with StandardScaler: the scaler is fitted on the
# training rows only and then applied to both the training and the test rows.
scale = StandardScaler()
scale.fit(X_train)
X_train_scaled = scale.transform(X_train)
X_test_scaled = scale.transform(X_test)

#fit the baseline model
# Train on the standardised features prepared above so this baseline goes
# through the same preprocessing as the tuned model that is persisted later.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

#Predict on test set
y_pred = rf.predict(X_test_scaled)
# ROC-AUC measures how well the scores rank the samples, so it needs the
# predicted probability of the positive class (second column), not hard labels.
y_score = rf.predict_proba(X_test_scaled)[:, 1]

#Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Calculate and print metrics (precision/recall/F1 use the default binary averaging)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)
print("AUC-ROC score:", roc_auc)

In [None]:
#Applying different algorithms
# Stochastic estimators are seeded so the comparison is reproducible between runs.
log = LogisticRegression()
dtc = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
adb = AdaBoostClassifier(random_state=42)
grad = GradientBoostingClassifier(random_state=42)
svc = SVC()
knn = KNeighborsClassifier()
nbc = GaussianNB()

# Hard-voting ensemble over all base models; it is fitted inside the loop below
# together with the individual models, so no separate fit call is needed here.
voting = VotingClassifier(
    estimators=[
        ("Logistic_Regression", log),
        ("Decision_Tree_Classifier", dtc),
        ("Random_Forest_Classifier", rf),
        ("AdaBoost_Classifier", adb),
        ("Gragient_Boosting_Classifier", grad),
        ("SVC", svc),
        ("KNeighborsClassifier", knn),
        ("GaussianNB", nbc),
    ],
    voting="hard",
)

#fit with different models
# The standardised features are used throughout: logistic regression, SVC and
# KNN are sensitive to feature scale, and the grid search below also trains on
# the scaled matrices, so the numbers stay comparable.
for clf in (log, dtc, rf, adb, grad, svc, knn, nbc, voting):
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    model_name = clf.__class__.__name__
    print(model_name, "train accuracy:", clf.score(X_train_scaled, y_train))
    print(model_name, "test accuracy:", accuracy_score(y_test, y_pred))

    print("===========================")

In [None]:
#Hyperparameter Tuning
# Runs an exhaustive 10-fold grid search for every candidate classifier on the
# standardised training data and reports train/test accuracy and best parameters.

from sklearn.model_selection import GridSearchCV

# Stochastic estimators are seeded so repeated searches pick the same winner.
lr_clf = LogisticRegression()
df_clf = DecisionTreeClassifier(random_state=42)   # decision tree (not a DataFrame, despite the name)
rf_clf = RandomForestClassifier(random_state=42)
adboost_clf = AdaBoostClassifier(random_state=42)
grad_clf = GradientBoostingClassifier(random_state=42)
svc_clf = SVC()
xgb_clf = XGBClassifier(random_state=42)
knn_clf = KNeighborsClassifier()

clf_list = [lr_clf, df_clf, rf_clf, adboost_clf, grad_clf, svc_clf, xgb_clf, knn_clf]

grid_params_lr = [{'penalty': ['l1', 'l2'], 'solver': ['saga']}]

# 'auto' was removed from scikit-learn; for classifiers it was an alias of 'sqrt'.
grid_params_df = [{
    'criterion': ["gini", "entropy"],
    'splitter': ['best', 'random'],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 3, 4],
    'max_features': ["sqrt", "log2"],
}]

grid_params_rf = [{
    'n_estimators': [4, 6, 9],
    'max_features': ['log2', 'sqrt'],
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 3, 5, 10],
}]

grid_params_adboost = [{'n_estimators': [10, 50, 250, 1000], 'learning_rate': [0.01, 0.1]}]

# 'deviance' was renamed to 'log_loss' and 'mse' to 'squared_error' in
# scikit-learn; the old spellings are no longer accepted. Learning rates above 1
# overshoot at every boosting step, so the grid stays within (0, 1].
grid_params_grad = [{
    'loss': ['log_loss', 'exponential'],
    'learning_rate': [0.01, 0.1, 1.0],
    'criterion': ['friedman_mse', 'squared_error'],
}]

# `degree` only affects the polynomial kernel, so it is searched for 'poly'
# alone instead of refitting identical linear/rbf models once per degree.
grid_params_svc = [
    {'kernel': ['linear', 'rbf']},
    {'kernel': ['poly'], 'degree': [3, 4, 5]},
]

# `nthread` only sets the number of CPU threads and does not change the fitted
# model, so it is not part of the search space.
grid_params_xgb = [{'booster': ['gbtree', 'gblinear']}]

grid_params_knn = [{
    'n_neighbors': [5, 7, 9, 11],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [30, 50, 100],
}]

# Must stay in the same order as clf_list: the two lists are paired with zip().
clf_params = [
    grid_params_lr,
    grid_params_df,
    grid_params_rf,
    grid_params_adboost,
    grid_params_grad,
    grid_params_svc,
    grid_params_xgb,
    grid_params_knn,
]

for clf, clf_param in zip(clf_list, clf_params):
    print(f"The Classifier is {clf} and its hyper params are {clf_param}")

    grid_clf = GridSearchCV(estimator=clf, param_grid=clf_param, scoring="accuracy", cv=10)
    grid_clf.fit(X_train_scaled, y_train)
    # score() evaluates the best estimator refitted on the full training set
    print(f"The Train accuracy for the {clf} is {grid_clf.score(X_train_scaled,y_train)}")
    print(f"The Test accuracy for the {clf} is {grid_clf.score(X_test_scaled,y_test)}")
    print(f"The best param for {clf} is {grid_clf.best_params_}")
    print("====================\n")

In [None]:
# Build the final model from the best hyperparameters found above.
# Here we assume the random forest scored the best accuracy; replace the values
# below with the `best_params_` actually reported by the grid search.
# random_state is fixed so the model that gets persisted is reproducible.
rf_clf = RandomForestClassifier(
    criterion='entropy',
    max_depth=3,
    max_features='log2',
    n_estimators=4,
    random_state=42,
)
rf_clf.fit(X_train_scaled, y_train)

In [None]:
#Persist the trained artefacts with joblib
# Both objects are needed at inference time: new rows must be transformed with
# the fitted scaler before they are passed to the model.
import joblib
joblib.dump(rf_clf, 'filepath.pkl')          # fitted model
# The scaler previously went to 'filepath.pk1' (digit one, a typo for ".pkl").
# It now gets its own .pkl name so that it cannot overwrite the model file;
# update any loader that still reads the old name.
joblib.dump(scale, 'filepath_scaler.pkl')    # fitted StandardScaler (the transformer, not the scaled data)
