## <u>*Regression Modelling for Severity as a Target Feature*</u>

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
matplotlib.rcParams["figure.figsize"]=(20,10)
import seaborn as sns;sns.set()
import xgboost as xgb
from xgboost import XGBRFRegressor
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK,Trials, space_eval
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error,r2_score,mean_absolute_error,root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV,cross_val_score,KFold
from sklearn.preprocessing import LabelEncoder,StandardScaler
from tqdm import tqdm

In [None]:
# Load the policy table, drop the three columns this notebook does not use,
# and treat the two age-band columns as categorical (object) values.
df = (
    pd.read_csv(r"C:/Users/Omar/Desktop/Omar_Files/Python_Analysis/Auto_Insurance/Data_Sets/data_car.csv")
    .drop(columns=["X_OBSTAT_", "clm", "numclaims"])
    .astype({"agecat": "object", "veh_age": "object"})
)
df.head()

---

In [None]:
# Order rows by the four categorical keys and renumber them 0..n-1,
# discarding the previous index instead of keeping it as a column.
sort_keys = ['veh_age', 'agecat', 'area', 'gender']
df = df.sort_values(by=sort_keys).reset_index(drop=True)
df.head()

*Data preprocessing*

In [None]:
# Replace every object-typed column with integer codes.
# The same encoder instance is refit for each column, so after the loop it
# only holds the classes of the last column it processed.
categories_encoder = LabelEncoder()
object_columns = df.select_dtypes(include="object").columns
for col in object_columns:
    df[col] = categories_encoder.fit_transform(df[col])

df.head()

In [None]:
# Keep only the rows whose claim cost is strictly positive.
has_positive_claim = df["claimcst0"] > 0
df = df[has_positive_claim]
df.head()

*Define x and y*

In [None]:
# Shuffle every row once, then separate the features from the target.
# The shuffle is seeded so the positional splits taken from this frame (and
# every metric computed on them) are the same on each Restart & Run All.
df_shaffled=df.sample(frac=1, random_state=42)
x=df_shaffled.drop(columns=["claimcst0"])   # all remaining columns are features
y=df_shaffled["claimcst0"]                  # regression target: claim cost

In [None]:
# Positional 70% / 15% / 15% split of the shuffled rows into
# train, validation and test partitions.
n_rows = len(df_shaffled)
train_split = round(.7 * n_rows)
valid_split = round(train_split + .15 * n_rows)

x_train, y_train = x.iloc[:train_split], y.iloc[:train_split]
x_valid, y_valid = x.iloc[train_split:valid_split], y.iloc[train_split:valid_split]
x_test, y_test = x.iloc[valid_split:], y.iloc[valid_split:]

In [None]:
# Row counts of the train / validation / test partitions.
tuple(len(part) for part in (x_train, x_valid, x_test))

In [None]:
def evaluation_preds(y_test,predicted):
    """Score predictions against the true targets.

    Args:
        y_test: true target values.
        predicted: predicted values, aligned with ``y_test``.

    Returns:
        dict mapping "MSE", "RMSE", "MAE" and "r2" to the metric value
        formatted as a string with two decimals.
    """
    scorers = (
        ("MSE", mean_squared_error),
        ("RMSE", root_mean_squared_error),
        ("MAE", mean_absolute_error),
        ("r2", r2_score),
    )
    return {label: f"{scorer(y_test, predicted):.2f}" for label, scorer in scorers}

In [None]:
# Baseline: a random forest with default settings, scored on the test partition.
clf = RandomForestRegressor().fit(x_train, y_train)
y_preds = clf.predict(x_test)
evaluation_preds(y_test, y_preds)

In [None]:
# Show the baseline metrics as a one-column table named "base".
pd.Series(evaluation_preds(y_test, y_preds), name="base").to_frame()

*Splitting data into training and testing subsets*

In [None]:
# Re-split the data 80/20 into train and test; this replaces the partitions
# created by the manual positional split earlier in the notebook.
# random_state pins the split so the hold-out rows (and therefore every
# reported test metric) are identical on each run.
x_train, x_test, y_train, y_test = train_test_split(x,y ,test_size= 0.20, random_state=42)

print("Train data shape of X = % s and Y = % s : "%(
	x_train.shape, y_train.shape))

print("Test data shape of X = % s and Y = % s : "%(
	x_test.shape, y_test.shape))

*Data Scaling*

In [None]:
# Learn the scaling statistics from the training features only, then apply
# the same transformation to both the training and the test features.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
def cross_val_evaluation_preds(model,x,y):
    """Cross-validate ``model`` on ``(x, y)`` and return three mean scores.

    Each score comes from its own ``cross_val_score`` run using that
    function's default ``cv`` setting. The scorers used are scikit-learn's
    "higher is better" variants, so both error metrics are negative numbers.

    Args:
        model: unfitted estimator passed to ``cross_val_score``.
        x: feature matrix.
        y: target values.

    Returns:
        dict with keys
            "NRMSE": mean negative root-mean-squared error,
            "NRASE": mean negative mean-absolute error,
            "r2":    mean R^2 score.
    """
    scoring_by_key = {
        # Previously "neg_mean_squared_error", which put a squared-unit value
        # under a key that announces a *root* mean squared error.
        "NRMSE": "neg_root_mean_squared_error",
        "NRASE": "neg_mean_absolute_error",
        "r2": "r2",
    }
    return {
        key: cross_val_score(model, x, y, scoring=scoring).mean()
        for key, scoring in scoring_by_key.items()
    }

In [None]:
def evaluation_preds(model,y_test,predicted):
    """Score predictions against the true targets and return raw numbers.

    NOTE: this redefines the two-argument ``evaluation_preds`` declared
    earlier in the notebook; cells that call it with two arguments only work
    if they run before this cell.

    Args:
        model: accepted for call-site symmetry but not used in the body.
        y_test: true target values.
        predicted: predicted values, aligned with ``y_test``.

    Returns:
        dict mapping "MSE", "RMSE", "MAE" and "r2" to the unformatted
        metric values.
    """
    scorers = (
        ("MSE", mean_squared_error),
        ("RMSE", root_mean_squared_error),
        ("MAE", mean_absolute_error),
        ("r2", r2_score),
    )
    return {label: scorer(y_test, predicted) for label, scorer in scorers}

---

## *Hyperparameter tuning*

* *Defining the models and their hyperparameter search spaces*

In [None]:
# One default-configured instance of each candidate regressor,
# keyed by its class name.
models = {
    estimator_cls.__name__: estimator_cls()
    for estimator_cls in (
        RandomForestRegressor,
        DecisionTreeRegressor,
        XGBRFRegressor,
        GradientBoostingRegressor,
        AdaBoostRegressor,
    )
}

# Hyperopt search space for each candidate model, keyed by class name.
#
# min_samples_split / min_samples_leaf are sampled as floats, which
# scikit-learn interprets as a FRACTION of the training samples. They were
# previously drawn from 0.8-1.0; a leaf fraction of 0.8 or more means both
# children of a split would each need at least 80% of the rows, which is
# impossible, so every tree degenerated to a single-node constant predictor.
# The ranges below keep the fractional form but allow real splits.
search_spaces ={
        "XGBRFRegressor":{
            "learning_rate": hp.uniform("learning_rate",0.01,1.0),    # 0.3 is the default
            "max_depth": hp.choice("max_depth", [None,2, 4, 5, 6,7,8]),
            "subsample": hp.uniform("subsample",0.5,1.0),
            "n_estimators": hp.choice("n_estimators", [100, 200, 300, 400,500,600]),
            'colsample_bytree': hp.uniform("colsample_bytree",0.5,1.0),
            'colsample_bynode': hp.uniform("colsample_bynode",0.5,1.0),
            "reg_lambda": hp.quniform("reg_lambda",0,2,1),           #L2 regularization term on weights. Increasing this value will make model more conservative
            "reg_alpha": hp.quniform("reg_alpha",0,2,1),
            "num_parallel_tree": hp.choice("num_parallel_tree", [100,110])
            },

        "GradientBoostingRegressor":{
            #"loss": hp.choice("loss",['squared_error', 'huber', 'absolute_error', 'quantile']),
            "n_estimators": hp.choice("n_estimators", [100, 200, 300, 400,500,600]),
            "max_depth": hp.choice("max_depth", [2, 4, 5, 6,7,8]),
            "max_features": hp.choice("max_features",["sqrt", "log2"]),
            "learning_rate": hp.uniform("learning_rate",0.01,1.0),
            "subsample": hp.uniform("subsample",0.8,1.0),
            "min_samples_split": hp.uniform("min_samples_split",0.01,0.5),
            "min_samples_leaf": hp.uniform("min_samples_leaf",0.01,0.25),
            #"criterion": hp.choice("criterion",["squared_error", "friedman_mse"])
            },
        "RandomForestRegressor":{
            "n_estimators": hp.choice("n_estimators", [100, 200, 300, 400,500,600]),
            "min_samples_split": hp.uniform("min_samples_split",0.01,0.5),
            "min_samples_leaf": hp.uniform("min_samples_leaf",0.01,0.25),
            #"bootstrap": hp.choice("bootstrap",[True, False]),
            #"max_features": hp.choice("max_features",["sqrt", "log2"]),
            "max_depth": hp.choice("max_depth", [2, 4, 5, 6,7,8]),
            #"criterion": hp.choice("criterion",["poisson", "squared_error", "friedman_mse","absolute_error"])
        },

        "AdaBoostRegressor":{
            "n_estimators": hp.choice("n_estimators", [100, 200, 300, 400,500,600]),
            "learning_rate": hp.uniform("learning_rate",0.01,1.0),
            #"loss": hp.choice("loss",["linear","square","exponential"])
        },
        "DecisionTreeRegressor": {
                    "criterion": hp.choice("criterion",["poisson", "squared_error", "friedman_mse","absolute_error"]),
                    'splitter': hp.choice("splitter",["best", "random"]),
                    "max_depth": hp.choice("max_depth", [2, 4, 5, 6,7,8]),
                    "min_samples_split": hp.uniform("min_samples_split",0.01,0.5),
                    "min_samples_leaf": hp.uniform("min_samples_leaf",0.01,0.25),
                    "max_features": hp.choice("max_features",["sqrt", "log2"])
        }
}

In [None]:
# Fit a default random forest on the training split and display the fitted estimator.
model = RandomForestRegressor().fit(x_train, y_train)
model

In [None]:
# Print cross-validated baseline scores for each default-configured ensemble.
np.random.seed(42)
baseline_estimators = (
    ("RF", RandomForestRegressor()),
    ("XGBRF", XGBRFRegressor()),
    ("ADABOOST", AdaBoostRegressor()),
    ("GRADBOOST", GradientBoostingRegressor()),
)
for label, estimator in baseline_estimators:
    print(f"{label}_cross_val_evaluation_preds :\n" , cross_val_evaluation_preds(estimator,x,y))
    print("===========")

In [None]:
# Collect the cross-validated baseline scores into one table:
# one column per model, one row per metric.
basemodels = pd.DataFrame({
    f"{label}_cross_val_evaluation_preds": cross_val_evaluation_preds(estimator_cls(), x, y)
    for label, estimator_cls in (
        ("RF", RandomForestRegressor),
        ("XGBRF", XGBRFRegressor),
        ("ADABOOST", AdaBoostRegressor),
        ("GRADBOOST", GradientBoostingRegressor),
    )
})

In [None]:
# Display the table of cross-validated baseline scores (one column per model).
basemodels

* *AdaBoostRegressor hyperparameter*

In [None]:
space=search_spaces["AdaBoostRegressor"]
def hyperparameter_tuning(space):
    """Hyperopt objective for AdaBoostRegressor.

    Builds the model from one sampled parameter set and returns the mean
    cross-validated MAE as the loss to minimise. Only the training split is
    used, so the hold-out test rows do not influence the chosen parameters.
    """
    clf = AdaBoostRegressor(**space)
    neg_mae = cross_val_score(clf, x_train,y_train,scoring="neg_mean_absolute_error").mean()
    return {"loss": -neg_mae, "status": STATUS_OK}

trials = Trials()
# fmin reports hp.choice parameters as the INDEX of the chosen option
# (e.g. n_estimators=3), not the option itself; space_eval maps the result
# back onto the search space so the dict holds real values (e.g. 400).
best_AdaBoostRegressor = space_eval(space, fmin(
    fn=hyperparameter_tuning,
    space = space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials
))

* *GradientBoostingRegressor hyperparameter*

In [None]:
space=search_spaces["GradientBoostingRegressor"]
def hyperparameter_tuning(space):
    """Hyperopt objective for GradientBoostingRegressor.

    Builds the model from one sampled parameter set and returns the mean
    cross-validated MAE on the training split as the loss to minimise.
    """
    clf = GradientBoostingRegressor(**space)
    neg_mae = cross_val_score(clf, x_train,y_train,scoring="neg_mean_absolute_error").mean()
    return {"loss": -neg_mae, "status": STATUS_OK}

trials = Trials()
# fmin reports hp.choice parameters as the INDEX of the chosen option
# (e.g. max_depth=0), not the option itself; space_eval maps the result
# back onto the search space so the dict holds real values.
best_GradientBoostingRegressor = space_eval(space, fmin(
    fn=hyperparameter_tuning,
    space = space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials
))

* *RandomForestRegressor hyperparameter*

In [None]:
space=search_spaces["RandomForestRegressor"]
def hyperparameter_tuning(space):
    """Hyperopt objective for RandomForestRegressor.

    Builds the model from one sampled parameter set and returns the mean
    cross-validated MAE as the loss to minimise. Only the training split is
    used, so the hold-out test rows do not influence the chosen parameters.
    """
    clf = RandomForestRegressor(**space)
    neg_mae = cross_val_score(clf, x_train,y_train,scoring="neg_mean_absolute_error").mean()
    return {"loss": -neg_mae, "status": STATUS_OK}

trials = Trials()
# fmin reports hp.choice parameters as the INDEX of the chosen option
# (e.g. n_estimators=0), not the option itself; space_eval maps the result
# back onto the search space so the dict holds real values.
best_RandomForestRegressor = space_eval(space, fmin(
    fn=hyperparameter_tuning,
    space = space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials
))

* *XGBRFRegressor hyperparameter*

In [None]:
space=search_spaces["XGBRFRegressor"]
def hyperparameter_tuning(space):
    """Hyperopt objective for XGBRFRegressor.

    Builds the model from one sampled parameter set and returns the mean
    cross-validated MAE as the loss to minimise. Only the training split is
    used, so the hold-out test rows do not influence the chosen parameters.
    """
    clf = XGBRFRegressor(**space)
    neg_mae = cross_val_score(clf, x_train,y_train,scoring="neg_mean_absolute_error").mean()
    return {"loss": -neg_mae, "status": STATUS_OK}

trials = Trials()
# fmin reports hp.choice parameters as the INDEX of the chosen option
# (e.g. n_estimators=2), not the option itself; space_eval maps the result
# back onto the search space so the dict holds real values.
best_XGBRFRegressor = space_eval(space, fmin(
    fn=hyperparameter_tuning,
    space = space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials
))

In [None]:
import pickle

# Print each model's tuned parameter dict and pickle it to its own file.
# The `with` block closes the file on exit, so no explicit close is needed.
_BEST_PARAMS_DIR = 'C:/Users/Omar/Desktop/Omar_Files/Python_Analysis/Auto_Insurance/outcomes/bestParams/'
for _model_name, _best_params in (
    ("AdaBoostRegressor", best_AdaBoostRegressor),
    ("GradientBoostingRegressor", best_GradientBoostingRegressor),
    ("RandomForestRegressor", best_RandomForestRegressor),
    ("XGBRFRegressor", best_XGBRFRegressor),
):
    print(f"===========best_parametrs: {_model_name}===========")
    print(_best_params)
    with open(f'{_BEST_PARAMS_DIR}gender_best_{_model_name}.pkl', 'wb') as f:
        pickle.dump(_best_params, f)

In [None]:
# NOTE: the triple-quoted string below is disabled code, kept as a reminder of
# how to reload the pickled parameter dicts; as written the cell only
# evaluates (and displays) the string. pickle.load executes code embedded in
# the file, so only reload files produced by this notebook.
'''import pickle
pickle.load(open("C:/Users/Omar/Desktop/Omar_Files/Python_Analysis/Auto_Insurance/outcomes/bestParams/gender_best_AdaBoostRegressor.pkl","rb"))
pickle.load(open("C:/Users/Omar/Desktop/Omar_Files/Python_Analysis/Auto_Insurance/outcomes/bestParams/gender_best_GradientBoostingRegressor.pkl","rb"))
pickle.load(open("C:/Users/Omar/Desktop/Omar_Files/Python_Analysis/Auto_Insurance/outcomes/bestParams/gender_best_RandomForestRegressor.pkl","rb"))
pickle.load(open("C:/Users/Omar/Desktop/Omar_Files/Python_Analysis/Auto_Insurance/outcomes/bestParams/gender_best_XGBRFRegressor.pkl","rb"))
'''

In [None]:
def _fit_and_report(estimator_cls, params, closing_rule="==========="):
    """Fit one tuned model and print its cross-validated and hold-out metrics.

    Prints a banner with the class name, builds ``estimator_cls(**params)``,
    fits it on the training split, prints the cross-validation scores on
    (x, y) and the hold-out metrics on the test split, then ``closing_rule``.

    Returns:
        (fitted estimator, test-split predictions, hold-out metrics dict)
    """
    print(f"==========={estimator_cls.__name__}===========")
    estimator = estimator_cls(**params)
    estimator.fit(x_train,y_train)
    test_predictions = estimator.predict(x_test)
    print("cross_val_evaluation_preds :\n" , cross_val_evaluation_preds(estimator,x,y))
    print("===========")
    holdout_metrics = evaluation_preds(estimator,y_test,test_predictions)
    print("evaluation_preds :\n" , holdout_metrics)
    print(closing_rule)
    return estimator, test_predictions, holdout_metrics


model, predicted, ADAboost_evaluation_preds = _fit_and_report(
    AdaBoostRegressor, best_AdaBoostRegressor)

model, predicted, GBOOST_evaluation_preds = _fit_and_report(
    GradientBoostingRegressor, best_GradientBoostingRegressor)

model, predicted, XGBRF_evaluation_preds = _fit_and_report(
    XGBRFRegressor, best_XGBRFRegressor)

model, predicted, RF_evaluation_preds = _fit_and_report(
    RandomForestRegressor, best_RandomForestRegressor,
    closing_rule="=====================================================================")


In [None]:
# Refit the random forest with its tuned parameters and predict the test split.
model = RandomForestRegressor(**best_RandomForestRegressor).fit(x_train, y_train)
predicted = model.predict(x_test)

In [None]:
# Hold-out (test split) metrics of the four tuned models, one column per model.
# The values come from evaluation_preds, not from cross-validation, so the
# column labels say "evaluation_preds" rather than "cross_val_evaluation_preds".
hyperoptmodels=pd.DataFrame({
    "RF_evaluation_preds":RF_evaluation_preds,
    "XGBRF_evaluation_preds":XGBRF_evaluation_preds,
    "ADABOOST_evaluation_preds" :ADAboost_evaluation_preds,
    "GRADBOOST_evaluation_preds":GBOOST_evaluation_preds
})

In [None]:
# Display the metrics table of the tuned models (one column per model).
hyperoptmodels

In [None]:
# Horizontal bar chart of every metric, grouped by metric with one bar per model.
hyperoptmodels.plot.barh()

In [None]:
import pickle

# Persist the tuned XGBRFRegressor parameters under the "claims" file name.
_xgbrf_params_path = "C:/Users/Omar/Desktop/Omar_Files/Python_Analysis/Auto_Insurance/outcomes/bestParams/claims_best_XGBRFRegressor.pkl"
with open(_xgbrf_params_path, "wb") as f:
    pickle.dump(best_XGBRFRegressor, f)

### *Final Result*

In [None]:
# Final model: XGBRFRegressor with its tuned parameters, fitted on the
# training split and used to predict the test split.
model = XGBRFRegressor(**best_XGBRFRegressor).fit(x_train, y_train)
predicted = model.predict(x_test)

In [None]:
# Attach the feature column names to the model, then pair each name with the
# model's importance score in a two-column table.
model.feature_names = x.columns
names_frame = pd.DataFrame(model.feature_names, columns=["feature_names"])
scores_frame = pd.DataFrame(model.feature_importances_, columns=["feature_importances"])
importance = pd.concat([names_frame, scores_frame], axis=1)
importance

In [None]:
# Bar chart of the feature importances (rounded to 3 decimals), with each bar
# labelled by its value, saved to disk as a PNG.
matplotlib.rcParams["figure.figsize"]=(20,7)
rounded_importances = np.round(importance["feature_importances"], 3)
ax = sns.barplot(x=rounded_importances, y=importance["feature_names"])
ax.set_title('Features importance for claims feature')
ax.bar_label(ax.containers[0]);
plt.savefig('C:/Users/Omar/Desktop/Omar_Files/Python_Analysis/Auto_Insurance/outcomes/featuresImportanceGraphs/claims_featureImportanc.png',dpi=500);

In [None]:
import pickle

# Persist the fitted final model so it can be reloaded for prediction later.
_model_path = "C:/Users/Omar/Desktop/Omar_Files/Python_Analysis/Auto_Insurance/outcomes/bestModels/claims_prediction_model.pkl"
with open(_model_path, "wb") as f:
    pickle.dump(model, f)

---