<a href="https://colab.research.google.com/github/pavi-1994/Cardio_vascular_risk_prediction/blob/main/cardivascular_risk_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem statement:
The dataset is from an ongoing cardiovascular study on residents of the town of Framingham,
Massachusetts. The classification goal is to predict whether the patient has a 10-year risk of
future coronary heart disease (CHD). The dataset provides the patients’ information. It includes
over 4,000 records and 15 attributes.

**Variables:**
Each attribute is a potential risk factor. There are demographic, behavioral, and medical risk
factors.

## Data description:

#### Demographic:
• Sex: male or female("M" or "F")

• Age: Age of the patient;(Continuous - Although the recorded ages have been truncated to
whole numbers, the concept of age is continuous)

#### Behavioral
• is_smoking: whether or not the patient is a current smoker ("YES" or "NO")

• Cigs Per Day: the number of cigarettes that the person smoked on average in one day.(can be
considered continuous as one can have any number of cigarettes, even half a cigarette.)

#### Medical( history)

• BP Meds: whether or not the patient was on blood pressure medication (Nominal)

• Prevalent Stroke: whether or not the patient had previously had a stroke (Nominal)

• Prevalent Hyp: whether or not the patient was hypertensive (Nominal)

• Diabetes: whether or not the patient had diabetes (Nominal)

#### Medical(current):

• Tot Chol: total cholesterol level (Continuous)

• Sys BP: systolic blood pressure (Continuous)

• Dia BP: diastolic blood pressure (Continuous)

• BMI: Body Mass Index (Continuous)

• Heart Rate: heart rate (Continuous - In medical research, variables such as heart rate, though in
fact discrete, are considered continuous because of the large number of possible values.)

• Glucose: glucose level (Continuous)


#### Predict variable (desired target)
• 10-year risk of coronary heart disease CHD(binary: “1”, means “Yes”, “0” means “No”) -
DV

In [None]:
#importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.simplefilter("ignore")

# to handle missing values for continuous value
from sklearn.impute import KNNImputer
#for categorical value
from sklearn.impute import SimpleImputer


#vif
from statsmodels.stats.outliers_influence import variance_inflation_factor
#train_test_split
from sklearn.model_selection import train_test_split
#scaling
from sklearn.preprocessing import StandardScaler,MinMaxScaler

#to deal with imbalanced data
from imblearn.over_sampling import SMOTE

#logistic regression
from sklearn.linear_model import LogisticRegression
#naive bayes classifier
from sklearn.naive_bayes import GaussianNB
## N nearest neigbour  classifier
from sklearn.neighbors import KNeighborsClassifier
#decision tree classifier
from sklearn.tree import DecisionTreeClassifier
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
#XGBRFClassifier
from xgboost import XGBRFClassifier
#support vector machine
from sklearn.svm import SVC

# for hyperparameter tunning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

#evaluation techiniques
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve,precision_score, classification_report
from sklearn.metrics import make_scorer, recall_score, f1_score, roc_auc_score,auc





In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Path of the cardiovascular-risk CSV on the mounted Drive.
file_path="/content/drive/MyDrive/almabetter/module 4/capstone_project_3/data_cardiovascular_risk.csv"

### Importing

In [None]:
# Load the CSV into a DataFrame and preview the first rows.
cardio_df=pd.read_csv(file_path)
cardio_df.head()

In [None]:
# Preview the last rows of the DataFrame.
cardio_df.tail()

In [None]:
# (number of rows, number of columns) of the dataset.
cardio_df.shape

In [None]:
# List the column names.
cardio_df.columns

In [None]:
# Summary of each column's dtype and non-null count.
cardio_df.info()

In [None]:
# Print the dtype of every column in cardio_df.
print(f'Feature datatype of cardio_df :\n\n {cardio_df.dtypes}')

In [None]:
# Descriptive statistics for all columns (numeric and non-numeric), transposed
# so that each row describes one feature.
cardio_df.describe(include="all").T

In [None]:
# Number of distinct values in each column.
cardio_df.nunique()


### Missing values

In [None]:
# Report missing values per column, largest first, as counts and as percentages.
missing_counts = cardio_df.isnull().sum().sort_values(ascending = False)
missing_percent = round(missing_counts/len(cardio_df)*100,2)

print('Missing Data Count')
print(missing_counts)

print('--'*50)
print('Missing Data Percentage')
print(f'{missing_percent}%')

### The missing-value percentages above, listed in descending order, show gaps in glucose, education, BPMeds, totChol, cigsPerDay, BMI and heartRate. Handling these missing values is important.

Before dealing with the missing values, we convert the string categorical columns to numeric values (label encoding).

In [None]:
# Encode the two string-valued binary columns as integers (0/1).
binary_encodings = {
    'sex': {'F':1,'M':0},
    'is_smoking': {'YES':1,'NO':0},
}
for column, mapping in binary_encodings.items():
    cardio_df[column] = cardio_df[column].replace(mapping)

## Create a list of categorical and continuous features:
    

In [None]:
# Columns that will be imputed, grouped by type: categorical ones get the
# most frequent value, continuous ones are imputed with KNN in the cells below.
categorical_features = ['education',"BPMeds"]
continuous_features = ['cigsPerDay','glucose','heartRate','BMI',"totChol" ]

print(categorical_features)
print(continuous_features)

### Missing value imputation (most-frequent value for categorical features, KNN imputer for continuous features)

In [None]:
# Fill missing categorical values with each column's most frequent value
# (SimpleImputer with strategy="most_frequent" -- this step does not use KNN).
imputer_categorical= SimpleImputer(strategy="most_frequent")
cardio_df[categorical_features]=imputer_categorical.fit_transform(cardio_df[categorical_features])
# Show the remaining null count per column.
cardio_df.isnull().sum()

In [None]:
# Fill missing continuous values with KNNImputer; with n_neighbors=1 each gap
# is imputed from its single nearest neighbour.
imputer = KNNImputer(n_neighbors=1)
cardio_df[continuous_features]=imputer.fit_transform(cardio_df[continuous_features])

Checking for remaining missing values after imputation:

In [None]:
# Re-run the missing-value report after imputation (counts, then percentages).
missing_counts = cardio_df.isnull().sum().sort_values(ascending = False)
missing_percent = round(missing_counts/len(cardio_df)*100,2)

print('Missing Data Count')
print(missing_counts)

print('--'*50)
print('Missing Data Percentage')
print(f'{missing_percent}%')

In [None]:
# Number of fully duplicated rows.
cardio_df.duplicated().sum()

### Knowing about target variable

**10-year risk of coronary heart disease CHD(binary: “1”, means “Yes”, “0” means “No”) - DV**

In [None]:
# Number of records in each class of the target column TenYearCHD.
cardio_df["TenYearCHD"].value_counts()

In [None]:
# Bar plot of the class counts of TenYearCHD.
cardio_df["TenYearCHD"].value_counts().plot(kind="bar")
# The figure is a bar chart, so label it as one (the title previously said "Pie Chart").
plt.title('Bar Chart for TenYearCHD')
plt.show()

In [None]:
# Pie plot of the class shares of TenYearCHD; both slices are slightly exploded
# and labelled with their percentage.
cardio_df["TenYearCHD"].value_counts().plot.pie(explode=[0.05,0.05], autopct='%1.1f%%',  startangle=90,shadow=True, figsize=(8,8))
plt.title('Pie Chart for TenYearCHD')
plt.show()

From the above analysis of the target variable "TenYearCHD" we observe that the dataset is imbalanced, so the class imbalance must be handled carefully.

In [None]:
# Rename the target column TenYearCHD to cardio_risk (in place); all later
# cells refer to the target by the new name.
cardio_df.rename(columns={"TenYearCHD":"cardio_risk"},inplace=True)

In [None]:
# Collect and print the names of all numeric columns.
cardio_numerics_col = cardio_df.select_dtypes(include=np.number).columns.tolist()
print("numerical columns \n")
print(cardio_numerics_col)




### 1. Sex

In [None]:
# Records per sex (encoded earlier as F=1, M=0).
cardio_df["sex"].value_counts()

In [None]:
# Pie plot of the share of each sex (F=1, M=0) in the dataset.
cardio_df["sex"].value_counts().plot.pie(explode=[0.05,0.05], autopct='%1.1f%%',  startangle=90,shadow=True, figsize=(8,8))
plt.title('Pie Chart for gender')
plt.show()

The dataset contains 43.3% male records; the remaining records belong to females.

In [None]:
# Record counts for every (sex, cardio_risk) combination, with one column per risk class.
cardio_df.groupby(["sex","cardio_risk"])["cardio_risk"].count().unstack('cardio_risk')

In [None]:
# Grouped bar chart of record counts per sex, split by cardio_risk class.

pd.crosstab(cardio_df['sex'],cardio_df['cardio_risk']).plot(kind='bar')

In comparison with males, females are less likely to be at cardiovascular risk.

### 2. 'is_smoking'

In [None]:
# Records per smoking status (encoded earlier as YES=1, NO=0).
cardio_df["is_smoking"].value_counts()

In [None]:
# Pie plot of the share of smokers (1) and non-smokers (0).
cardio_df["is_smoking"].value_counts().plot.pie(explode=[0.05,0.05], autopct='%1.1f%%',  startangle=90,shadow=True, figsize=(8,8))
plt.title('Pie Chart for is_smoking')
plt.show()

The dataset contains almost the same number of smokers and non-smokers.

In [None]:
# Grouped bar chart of record counts per smoking status, split by cardio_risk class.

pd.crosstab(cardio_df['is_smoking'],cardio_df['cardio_risk']).plot(kind='bar')

### Based on studies the smokers are 2 to 4 times more likely to get heart disease than nonsmokers.


**Based on our analysis of the cardiovascular risk dataset, smokers show a slightly higher cardiovascular risk than non-smokers.**

In [None]:
# Record counts for every (sex, is_smoking) pair, with one column per risk class.
cardio_df.groupby(["sex","is_smoking","cardio_risk"])["cardio_risk"].count().unstack('cardio_risk')

In [None]:
# One bar per (sex, is_smoking, cardio_risk) combination showing its record count.
cardio_df.groupby(["sex","is_smoking","cardio_risk"])["cardio_risk"].count().plot(kind='bar')

Here we can observe that males who smoke have a higher cardiovascular risk than females who smoke.

### cigsPerDay

In [None]:
# Histogram of the number of cigarettes smoked per day.
cardio_df["cigsPerDay"].plot(kind="hist")

In [None]:
# Bar plot of the mean cardio_risk (the observed risk rate) for each cigsPerDay
# value, split by sex.
# sns.factorplot was renamed to sns.catplot and is no longer available in recent
# seaborn releases; x and y must also be passed as keywords there.
g=sns.catplot(x='cigsPerDay',y='cardio_risk',kind='bar',
              data=cardio_df,hue='sex',aspect=4)

g=g.set_ylabels("probability of cardio_risk based on no of cigsPerDay")

We observe that people who don't smoke, or who smoke fewer than 6 cigarettes per day, have a lower chance of cardiovascular problems.
As the number of cigarettes per day increases, the cardiovascular risk also increases in males. Females have a lower risk in comparison with males.

### age

In [None]:
# Count of records at each age, with separate bars for the two cardio_risk classes.
plt.figure(figsize=[15,10])
plt.title("Age based analysis of cardio_vascular risk")
sns.countplot(x='age', hue='cardio_risk', data=cardio_df, palette='colorblind')


In [None]:
# Line plot of cigsPerDay against age, with one line per cardio_risk class.
fig,ax=plt.subplots(figsize=(18,8))
sns.lineplot(data=cardio_df,x='age',y='cigsPerDay',hue='cardio_risk',ax=ax)
ax.set(title=' Distribution cardio_risk over age and cigsPerDay ')

We observe that the number of people at risk of cardiovascular disease is lower than the number of people without that risk. However, people older than 40 have a higher chance of being at cardiovascular risk.

Age may matter for having a higher risk, but as the number of cigarettes per day decreases, the chance of cardiovascular risk also decreases.

Let's convert the age feature into bins.

In [None]:
# Bin age into four fixed 10-year ranges (fixed edges, not quantiles) and store
# the label in a new column. pd.cut intervals are right-closed by default, i.e.
# (30, 40], (40, 50], ...; an age of exactly 30 would fall outside the bins.
bins = [30, 40, 50, 60, 70]
labels = ["30-40age","40-50age","50-60age","60-70age"]
cardio_df['binned_age'] = pd.cut(cardio_df['age'], bins=bins, labels=labels)
cardio_df.head()


In [None]:
# Records per age bin.
cardio_df['binned_age'] .value_counts()

In [None]:
# Record counts for every (age bin, cardio_risk) combination, with one column per risk class.
cardio_df.groupby(["binned_age","cardio_risk"])["cardio_risk"].count().unstack('cardio_risk')

In [None]:
# Mean cardio_risk (the observed risk rate) per age bin.
# sns.factorplot is no longer available in recent seaborn releases: use catplot
# with keyword x/y, and `height` in place of the renamed `size` argument.
ax=sns.catplot(x='binned_age',y='cardio_risk',kind='bar', data=cardio_df,height=4,aspect=2)
ax=ax.set_ylabels("RISK Probability")


We observe from the above analysis that people older than 50 have a higher chance of getting cardiovascular disease.

In [None]:
# Record counts for every (age bin, sex) pair, with one column per risk class.
cardio_df.groupby(["binned_age","sex","cardio_risk"])["cardio_risk"].count().unstack('cardio_risk')

In [None]:
# Mean cardio_risk (the observed risk rate) per age bin, split by sex.
# sns.factorplot is no longer available in recent seaborn releases: use catplot
# with keyword x/y, and `height` in place of the renamed `size` argument.
ax=sns.catplot(x='binned_age',y='cardio_risk',kind='bar',hue="sex", data=cardio_df,height=4,aspect=2)
ax=ax.set_ylabels("RISK Probability")

Based on our observation, as people get older the risk of getting cardiovascular disease is higher in males in comparison with females.

### Education


In [None]:
# Records per education level.
cardio_df["education"].value_counts()

We observe that people who completed education level 1 are the most numerous, ahead of those who completed level 2 (intermediate), level 3 (graduate) or level 4 (postgraduate).

In [None]:
# Two views of education against cardio_risk, side by side.
f,ax=plt.subplots(1,2,figsize=(10,5))
# Left: record counts per education level, one bar per cardio_risk class.
cardio_df.groupby(["education","cardio_risk"])["cardio_risk"].count().unstack('cardio_risk').plot(kind="bar",ax=ax[0])
ax[0].set_title('is there any educational awareness')
ax[0].set_ylabel('count')
# Right: record counts per cardio_risk class, one bar per education level.
sns.countplot(x ='cardio_risk', hue ='education', data = cardio_df,ax=ax[1])
ax[1].set_title('is there any educational awareness')
ax[1].set_ylabel('count')



People's education level does not appear to have much effect on cardiovascular risk, so we can drop this feature.



### 'BPMeds'
Whether or not the patient was on blood pressure medication.

In [None]:
# Records per BPMeds value.
cardio_df["BPMeds"].value_counts()

In [None]:
# Share (pie, left) and absolute counts (bars, right) of records by BPMeds value.
f,ax=plt.subplots(1,2,figsize=(10,5))
cardio_df['BPMeds'].value_counts().plot.pie(autopct='%1.1f%%',ax=ax[0],shadow=True)
ax[0].set_title('BPMeds')
ax[0].set_ylabel('')
# Pass the column as keyword `x`: recent seaborn releases no longer accept it
# as the first positional argument.
sns.countplot(x='BPMeds',data=cardio_df,ax=ax[1])
ax[1].set_title('BPMeds')
plt.show()


In [None]:
# Record counts for every (BPMeds, cardio_risk) combination, with one column per risk class.
cardio_df.groupby(["BPMeds","cardio_risk"])["cardio_risk"].count().unstack('cardio_risk')

In [None]:
# Overlay BPMeds histograms for the at-risk (blue) and not-at-risk (red) groups.
plt.figure(figsize=(10,8))
for risk_class, legend_label, colour in ((1,"high_risk",'b'),(0,"no_risk",'r')):
    group_values = cardio_df.loc[cardio_df['cardio_risk']==risk_class,'BPMeds']
    ax=sns.distplot(group_values,bins=10,label=legend_label,color=colour,kde=False)
ax.legend()
ax.set_title('BPMeds')

#### People who are on blood pressure medication have a nearly 50% chance of getting cardiovascular disease

### **PrevalentStroke**

It indicates whether or not the patient had previously had a stroke.

In [None]:
# Records per prevalentStroke value.
cardio_df["prevalentStroke"].value_counts()

In [None]:
# Share (pie, left) and absolute counts (bars, right) of records by prevalentStroke value.
f,ax=plt.subplots(1,2,figsize=(10,5))
cardio_df['prevalentStroke'].value_counts().plot.pie(autopct='%1.1f%%',ax=ax[0],shadow=True)
ax[0].set_title('prevalentStroke')
ax[0].set_ylabel('')
# Pass the column as keyword `x`: recent seaborn releases no longer accept it
# as the first positional argument.
sns.countplot(x='prevalentStroke',data=cardio_df,ax=ax[1])
ax[1].set_title('prevalentStroke')
plt.show()


In [None]:
# Record counts for every (prevalentStroke, cardio_risk) combination, with one column per risk class.
cardio_df.groupby(["prevalentStroke","cardio_risk"])["cardio_risk"].count().unstack('cardio_risk')

In [None]:
# Overlay prevalentStroke histograms for the at-risk (blue) and not-at-risk (red) groups.
plt.figure(figsize=(10,8))
for risk_class, legend_label, colour in ((1,"high_risk",'b'),(0,"no_risk",'r')):
    group_values = cardio_df.loc[cardio_df['cardio_risk']==risk_class,'prevalentStroke']
    ax=sns.distplot(group_values,bins=10,label=legend_label,color=colour,kde=False)
ax.legend()
ax.set_title('prevalentStroke')

Even if a patient has had a stroke earlier, there is a 40-50% chance of cardiovascular disease risk.

We can drop this feature.

### PrevalentHyp

In [None]:
# Records per prevalentHyp value.
cardio_df["prevalentHyp"].value_counts()

In [None]:
# Share (pie, left) and absolute counts (bars, right) of records by prevalentHyp value.
f,ax=plt.subplots(1,2,figsize=(10,5))
cardio_df['prevalentHyp'].value_counts().plot.pie(autopct='%1.1f%%',ax=ax[0],shadow=True)
ax[0].set_title('prevalentHyp')
ax[0].set_ylabel('')
# Pass the column as keyword `x`: recent seaborn releases no longer accept it
# as the first positional argument.
sns.countplot(x='prevalentHyp',data=cardio_df,ax=ax[1])
ax[1].set_title('prevalentHyp')
plt.show()

In [None]:
# Record counts for every (prevalentHyp, cardio_risk) combination, with one column per risk class.
cardio_df.groupby(["prevalentHyp","cardio_risk"])["cardio_risk"].count().unstack('cardio_risk')

In [None]:
# Overlay prevalentHyp histograms for the at-risk (blue) and not-at-risk (red) groups.
plt.figure(figsize=(10,8))
for risk_class, legend_label, colour in ((1,"high_risk",'b'),(0,"no_risk",'r')):
    group_values = cardio_df.loc[cardio_df['cardio_risk']==risk_class,'prevalentHyp']
    ax=sns.distplot(group_values,bins=10,label=legend_label,color=colour,kde=False)
ax.legend()
ax.set_title('prevalentHyp')

## 'diabetes'

In [None]:
# Records per diabetes value.
cardio_df['diabetes'].value_counts()

In [None]:
# Share (pie, left) and absolute counts (bars, right) of records by diabetes value.
f,ax=plt.subplots(1,2,figsize=(10,5))
cardio_df['diabetes'].value_counts().plot.pie(autopct='%1.1f%%',ax=ax[0],shadow=True)
ax[0].set_title('diabetes')
ax[0].set_ylabel('')
# Pass the column as keyword `x`: recent seaborn releases no longer accept it
# as the first positional argument.
sns.countplot(x='diabetes',data=cardio_df,ax=ax[1])
ax[1].set_title('diabetes')
plt.show()

In [None]:
# Record counts for every (diabetes, cardio_risk) combination, with one column per risk class.
cardio_df.groupby(['diabetes',"cardio_risk"])["cardio_risk"].count().unstack('cardio_risk')

In [None]:
# Overlay diabetes histograms for the at-risk (blue) and not-at-risk (red) groups.
plt.figure(figsize=(10,8))
for risk_class, legend_label, colour in ((1,"high_risk",'b'),(0,"no_risk",'r')):
    group_values = cardio_df.loc[cardio_df['cardio_risk']==risk_class,'diabetes']
    ax=sns.distplot(group_values,bins=10,label=legend_label,color=colour,kde=False)
ax.legend()
ax.set_title('diabetes')

### totChol

In [None]:
# Frequency of each distinct totChol value.
cardio_df['totChol'].value_counts()

In [None]:
# Overlay totChol histograms (20 bins) for the at-risk (blue) and not-at-risk (red) groups.
plt.figure(figsize=(10,8))
for risk_class, legend_label, colour in ((1,"high_risk",'b'),(0,"no_risk",'r')):
    group_values = cardio_df.loc[cardio_df['cardio_risk']==risk_class,'totChol']
    ax=sns.distplot(group_values,bins=20,label=legend_label,color=colour,kde=False)
ax.legend()
ax.set_title('totChol')

In [None]:
# Box plot of totChol for each cardio_risk class.
plt.figure(figsize=(10,10))
ax = sns.boxplot(x="cardio_risk", y="totChol", data=cardio_df,palette="Set2")

### BMI

In [None]:
# Frequency of each distinct BMI value.
cardio_df['BMI'].value_counts()

In [None]:
# Overlay BMI histograms (20 bins) for the at-risk (blue) and not-at-risk (red) groups.
plt.figure(figsize=(10,8))
for risk_class, legend_label, colour in ((1,"high_risk",'b'),(0,"no_risk",'r')):
    group_values = cardio_df.loc[cardio_df['cardio_risk']==risk_class,'BMI']
    ax=sns.distplot(group_values,bins=20,label=legend_label,color=colour,kde=False)
ax.legend()
ax.set_title('BMI')


In [None]:
# Box plot of BMI for each cardio_risk class.
plt.figure(figsize=(10,10))
ax = sns.boxplot(x="cardio_risk", y="BMI", data=cardio_df,palette="Set2")

### HeartRate

In [None]:
# Frequency of each distinct heartRate value.
cardio_df['heartRate'].value_counts()

In [None]:
# Overlay heartRate histograms (20 bins) for the at-risk (blue) and not-at-risk (red) groups.
plt.figure(figsize=(10,8))
for risk_class, legend_label, colour in ((1,"high_risk",'b'),(0,"no_risk",'r')):
    group_values = cardio_df.loc[cardio_df['cardio_risk']==risk_class,'heartRate']
    ax=sns.distplot(group_values,bins=20,label=legend_label,color=colour,kde=False)
ax.legend()
ax.set_title('heartRate')


In [None]:
# Box plot of heartRate for each cardio_risk class.
plt.figure(figsize=(10,10))
ax = sns.boxplot(x="cardio_risk", y="heartRate", data=cardio_df,palette="Set2")

### Glucose

In [None]:
# Frequency of each distinct glucose value.
cardio_df['glucose'].value_counts()

In [None]:
# Overlay glucose histograms (20 bins) for the at-risk (blue) and not-at-risk (red) groups.
plt.figure(figsize=(10,8))
for risk_class, legend_label, colour in ((1,"high_risk",'b'),(0,"no_risk",'r')):
    group_values = cardio_df.loc[cardio_df['cardio_risk']==risk_class,'glucose']
    ax=sns.distplot(group_values,bins=20,label=legend_label,color=colour,kde=False)
ax.legend()
ax.set_title('glucose')


In [None]:
# Box plot of glucose for each cardio_risk class.
plt.figure(figsize=(10,10))
ax = sns.boxplot(x="cardio_risk", y="glucose", data=cardio_df,palette="Set1")

### sysBP

In [None]:
# Frequency of each distinct sysBP value.
cardio_df['sysBP'].value_counts()

In [None]:
# Overlay sysBP histograms (20 bins) for the at-risk (blue) and not-at-risk (red) groups.
plt.figure(figsize=(10,8))
for risk_class, legend_label, colour in ((1,"high_risk",'b'),(0,"no_risk",'r')):
    group_values = cardio_df.loc[cardio_df['cardio_risk']==risk_class,'sysBP']
    ax=sns.distplot(group_values,bins=20,label=legend_label,color=colour,kde=False)
ax.legend()
ax.set_title('sysBP')

In [None]:
# Box plot of sysBP for each cardio_risk class.
plt.figure(figsize=(10,10))
ax = sns.boxplot(x="cardio_risk", y="sysBP", data=cardio_df,palette="Set3")

### diaBP

In [None]:
# Frequency of each distinct diaBP value.
cardio_df['diaBP'].value_counts()

In [None]:
# Overlay diaBP histograms (20 bins) for the at-risk (blue) and not-at-risk (red) groups.
plt.figure(figsize=(10,8))
for risk_class, legend_label, colour in ((1,"high_risk",'b'),(0,"no_risk",'r')):
    group_values = cardio_df.loc[cardio_df['cardio_risk']==risk_class,'diaBP']
    ax=sns.distplot(group_values,bins=20,label=legend_label,color=colour,kde=False)
ax.legend()
ax.set_title('diaBP')

In [None]:
# Box plot of diaBP for each cardio_risk class.
plt.figure(figsize=(10,10))
ax = sns.boxplot(x="cardio_risk", y="diaBP", data=cardio_df,palette="Set3")

In [None]:
# Drop the helper column binned_age (created above for plotting) and the id
# column, in place; neither is used in the steps that follow.
cardio_df.drop(columns=["binned_age","id"],axis=1,inplace=True)

### Outlier/Anomaly detection

In [None]:
# Split the columns by cardinality: columns with at most 10 distinct values are
# treated as categorical, everything else (apart from "id") as continuous.
categorical_features = [i for i in cardio_df.columns if cardio_df[i].nunique()<=10]
continuous_features = [i for i in cardio_df.columns if i not in categorical_features and i!="id"]
print("continuous feature columns \n")
print(continuous_features)


In [None]:
# For every continuous feature draw a box plot (left) and a distribution plot
# (right); dashed vertical lines mark the mean (red) and the median (blue).
for var in continuous_features:
    values = cardio_df[var]
    plt.figure(figsize=(15,6))

    plt.subplot(1, 2, 1)
    box_ax = sns.boxplot(y=values)
    box_ax.set_title(var)
    box_ax.set_ylabel(var)

    plt.subplot(1, 2, 2)
    dist_ax = sns.distplot(values)
    for marker_value, colour in ((values.mean(), 'red'), (values.median(), 'blue')):
        plt.axvline(marker_value, color=colour, linestyle='dashed', linewidth=2)
    dist_ax.set_title(var)
    dist_ax.set_ylabel('count')
    dist_ax.set_xlabel(var)

    plt.show()

In [None]:
# Print the skewness (rounded to 3 decimals) of every continuous feature.
print("skewness of the numerical column distribution is as follows:\n")
for var in continuous_features:
    skewness = round(cardio_df[var].skew(),3)
    print(f'{var} has skewness ...........  {skewness}')

Here we observe that the glucose feature suffers from skewness caused by the presence of outliers. We cannot drop these outliers, because some people may genuinely have a high glucose level, which may increase their cardiovascular risk.

feature with outliers:

In [None]:
# Continuous features whose outliers are treated in the next cell.
feature_with_outlier=["BMI","heartRate","glucose","diaBP","sysBP","totChol","cigsPerDay"]

In [None]:
# Treat outliers with the IQR rule: values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are replaced by the column median
# (rows are kept, not removed).
for col in feature_with_outlier:
    series = cardio_df[col]
    q1, q3, median = series.quantile([0.25,0.75,0.5])
    lower_limit = q1 - 1.5*(q3-q1)
    upper_limit = q3 + 1.5*(q3-q1)

    # A single mask covering both tails; everything else keeps its value.
    is_outlier = (series > upper_limit) | (series < lower_limit)
    cardio_df[col] = np.where(is_outlier, median, series)

In [None]:
# After outlier treatment, redraw a box plot (left) and a distribution plot
# (right) for every continuous feature; dashed vertical lines mark the mean
# (red) and the median (blue).
for var in continuous_features:
    values = cardio_df[var]
    plt.figure(figsize=(15,6))

    plt.subplot(1, 2, 1)
    box_ax = sns.boxplot(y=values)
    box_ax.set_title(var)
    box_ax.set_ylabel(var)

    plt.subplot(1, 2, 2)
    dist_ax = sns.distplot(values)
    for marker_value, colour in ((values.mean(), 'red'), (values.median(), 'blue')):
        plt.axvline(marker_value, color=colour, linestyle='dashed', linewidth=2)
    dist_ax.set_title(var)
    dist_ax.set_ylabel('count')
    dist_ax.set_xlabel(var)

    plt.show()

In [None]:
# Print the skewness (rounded to 3 decimals) of every continuous feature after
# outlier treatment.
print("skewness of the numerical column distribution is as follows:\n")
for var in continuous_features:
    skewness = round(cardio_df[var].skew(),3)
    print(f'{var} has skewness ...........  {skewness}')

Checking multicollinearity among the independent features and their relationship with the target variable

1. Using a pair plot

In [None]:
# Scatter-plot matrix of every pair of columns in cardio_df.
sns.pairplot(cardio_df)

In [None]:
# Drop the education column in place.
cardio_df.drop(columns=["education"],axis=1,inplace=True)

2. Using the correlation matrix

In [None]:
# Pairwise correlation between all columns of cardio_df, rounded to 3 decimals.
corr_matrix= round(cardio_df.corr(),3)
corr_matrix

In [None]:
# Heatmap of the correlation matrix.
sns.set(rc={'figure.figsize':(20,14)})
# The matrix is symmetric, so hide the redundant upper triangle. The mask was
# previously computed but never passed to the heatmap. k=1 keeps the diagonal visible.
matrix = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
sns.heatmap(corr_matrix, annot=True, cmap="Greens", mask=matrix)

In [None]:
# Walk the lower triangle of the correlation matrix and report every pair of
# columns whose absolute correlation exceeds 0.7; both names of each such pair
# are collected in correlated_features.

correlated_features = set()
column_names = corr_matrix.columns
for row_pos in range(len(column_names)):
    for col_pos in range(row_pos):
        strength = abs(corr_matrix.iloc[row_pos, col_pos])
        if not strength > 0.7:
            continue
        colname1 = column_names[row_pos]
        colname2 = column_names[col_pos]
        print(strength, "--", row_pos, '--', col_pos, '--', colname1, '--', colname2)
        correlated_features.update((colname1, colname2))

#### **👉 Here we can see that glucose and heartRate are not significantly correlated with cardio risk, so we can drop these features.**
### **👉 cigsPerDay (number of cigarettes per day) and is_smoking (whether the patient smokes) are highly correlated with each other.**


### **👉 "diaBP" and "sysBP" are highly correlated**
where
diaBP is Diastolic Pressure
sysBP is Systolic Pressure


### What is VIF?
A variance inflation factor (VIF) detects multicollinearity in regression analysis. Multicollinearity is when there’s correlation between predictors (i.e. independent variables) in a model; its presence can adversely affect your regression results. The VIF estimates how much the variance of a regression coefficient is inflated due to multicollinearity in the model.

A rule of thumb for interpreting the variance inflation factor:

👉 1 = not correlated.

👉 Between 1 and 5 = moderately correlated.

👉 Greater than 5 = highly correlated.


From the above analysis, cigsPerDay and is_smoking are highly correlated with each other, and diaBP and sysBP are also highly correlated with each other.

In [None]:
#Multicollinearity

def calc_vif(X):
    """Return a table of variance inflation factors for the columns of ``X``.

    ``X`` must provide ``.columns``, ``.values`` and ``.shape`` (e.g. a pandas
    DataFrame). The result is a DataFrame with the columns "variables" and
    "VIF", sorted by VIF in descending order and re-indexed from 0.
    """
    design_matrix = X.values
    scores = [
        variance_inflation_factor(design_matrix, position)
        for position in range(X.shape[1])
    ]
    table = pd.DataFrame({"variables": X.columns, "VIF": scores})
    ranked = table.sort_values(by='VIF',ascending=False)
    return ranked.reset_index(drop=True)

In [None]:
# Compute VIF for the numeric columns that remain after leaving out the target
# and the features excluded from the final feature list.
vif_excluded = ["cardio_risk"," ","diaBP","sysBP","glucose","heartRate","totChol","BMI"]
vif_columns = [name for name in cardio_df.describe().columns if name not in vif_excluded]
calc_vif(cardio_df[vif_columns])

In [None]:
#correlation matrix for each numerical feature using ".corr()"
#corr_matrix= round(cardio_df[[i for i in cardio_df.describe().columns if i not in ["glucose","heartRate","diaBP","sysBP","40-50age","BMI"]]].corr(),3)
#corr_matrix

In [None]:
#visualization using heatmap
#sns.set(rc={'figure.figsize':(20,14)})
#matrix = np.triu(corr_matrix)
#sns.heatmap(corr_matrix, annot=True, cmap="Greens")

In [None]:
# Independent features kept for modelling: every numeric column except the
# target and the columns listed below.

features=[i for i in cardio_df.describe().columns if i not in["cardio_risk","diaBP","sysBP","glucose","heartRate","totChol","BMI"]]
print(features)

# Logistic regression

### Data preprocessing for logistic model


In [None]:
# Separate the independent variables (X) from the dependent variable (y).

X=cardio_df[features]
y=cardio_df["cardio_risk"]

In [None]:
# 80/20 train/test split with a fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=4)
print((X_train.shape,  y_train.shape),(y_test.shape,X_test.shape))

**What is Standardization?**

Standardization is another scaling technique where the values are centered around the mean with a unit standard deviation. This means that the mean of the attribute becomes zero and the resultant distribution has a unit standard deviation.

In [None]:
# Standardize the features: the scaler is fitted on the training split only and
# the same fitted scaler is then applied to the test split.
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [None]:
# Dictionary collecting each model's evaluation metrics, keyed by model name.
model_comparison={}

###  Synthetic Minority Oversampling Technique (SMOTE)
This technique generates synthetic data for the minority class.

SMOTE (Synthetic Minority Oversampling Technique) works by randomly picking a point from the minority class and computing the k-nearest neighbors for this point. The synthetic points are added between the chosen point and its neighbors.



In [None]:
# Class counts in the training target before resampling.
print("Before Handling Class Imbalace:")
y_train.value_counts()

In [None]:
# Oversample with SMOTE (fixed random_state for reproducibility).
smote = SMOTE(random_state=42)

# Resample the training split only; the test split is not resampled.
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [None]:
# Class counts in the training target after SMOTE resampling.
print("after  Handling Class Imbalace:")
y_smote.value_counts()

## **Building logistic regression model** 

In [None]:
# Fit a baseline logistic regression on the SMOTE-resampled training data and
# predict the classes of the test split.
log_reg_clf_model= LogisticRegression(fit_intercept=True, max_iter=10000)
log_reg_clf_model.fit(X_smote,y_smote)
y_pred = log_reg_clf_model.predict(X_test)

### Since the SMOTE-resampled training set is balanced, accuracy is reported alongside precision, recall, F1 and ROC AUC

In [None]:
# Evaluate the baseline logistic regression on the SMOTE-resampled training
# data and on the test split. Predictions are computed once per split and
# reused for every metric.
train_pred = log_reg_clf_model.predict(X_smote)
# ROC AUC measures how well the model ranks samples, so it is computed from the
# predicted probability of the positive class rather than from hard 0/1 labels.
train_proba = log_reg_clf_model.predict_proba(X_smote)[:, 1]
test_proba = log_reg_clf_model.predict_proba(X_test)[:, 1]

#for train data
print(f"Using logistic regression we get an accuracy of train set{round(accuracy_score(y_smote, train_pred)*100,2)}%")
print(f"Using logistic regression we get an precision of train set {round(precision_score(y_smote, train_pred)*100,2)}%")
print(f"Using logistic regression we get an recall score of train set {round(recall_score(y_smote, train_pred)*100,2)}%")
print(f"Using logistic regression we get an f1 of train set {round(f1_score(y_smote, train_pred)*100,2)}%")
print(f"Using logistic regression we get an roc_auc_score of train set {round(roc_auc_score(y_smote, train_proba)*100,2)}%")


print("\n")
#for test data (y_pred holds the test-split predictions made when the model was fitted)

print(f"Using logistic regression we get an accuracy of {round((accuracy_score(y_test, y_pred))*100,2)}%")
print(f"Using logistic regression we get an precision of test set {round((precision_score(y_test, y_pred))*100,2)}%")
print(f"Using logistic regression we get an recall score of test set {round((recall_score(y_test, y_pred))*100,2)}%")
print(f"Using logistic regression we get an f1 of test set {round((f1_score(y_test, y_pred))*100,2)}%")
print(f"Using logistic regression we get an roc_auc_score of test set {round(roc_auc_score(y_test, test_proba)*100,2)}%")


In [None]:
# Print the per-class precision/recall/f1 report for the test split and keep
# the same report as a DataFrame for plotting.
print(classification_report(y_test, y_pred))
report = pd.DataFrame(classification_report(y_pred=y_pred, y_true=y_test, output_dict=True))

In [None]:
 # Classification report
plt.figure(figsize=(18,3))
plt.subplot(1,3,1)
# Drop the last row and last column of the report, then transpose it for the heatmap.
report_view = report.iloc[:-1, :-1].T
sns.heatmap(report_view, annot=True, cmap='coolwarm')
plt.title('logistic regression Report')

### Feature importance

In [None]:
# Rank the features by importance and plot them.
# Models that expose `feature_importances_` use it directly; otherwise (as for
# LogisticRegression) fall back to the absolute value of the fitted coefficients.
# An explicit attribute check replaces the former bare `except`, which would
# also have hidden unrelated errors.
if hasattr(log_reg_clf_model, "feature_importances_"):
    importance = log_reg_clf_model.feature_importances_
else:
    importance = np.abs(log_reg_clf_model.coef_[0])
feature = features
# Positions sorted from most to least important. This is computed on both
# paths, so the plotting code below never sees an undefined `indices`.
indices = np.argsort(importance)[::-1]

plt.figure(figsize=(15,6))
#ploting feature importance
plt.bar(range(len(indices)),importance[indices])
plt.xticks(range(len(indices)), [feature[i] for i in indices])
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

## 👉 Hyperparameter tuning to choose best parameters

In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
# Base estimator for the grid search.
# The grid includes penalty='l1', which the default 'lbfgs' solver does not
# support; those candidates would fail to fit and score NaN. 'liblinear'
# supports both 'l1' and 'l2', so every grid point is actually evaluated.
log_reg_clf_model= LogisticRegression(solver='liblinear', max_iter=10000)
# Search penalty type, regularization strength C and class weighting with
# 10-fold cross-validation, selecting by ROC AUC.
params = {'penalty':['l1','l2'],
         'C' : [0.000001,0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000,10000,100000],
         'class_weight':['balanced',None]}
logistic_clf = GridSearchCV(estimator =log_reg_clf_model,param_grid=params,cv=10, scoring='roc_auc')

In [None]:
# Run the grid search on the SMOTE-resampled training data.
logistic_clf.fit(X_smote,y_smote)

In [None]:
# Print the best cross-validated score and the hyperparameters that achieved it.
# The search is configured with scoring='roc_auc', so best_score_ is a ROC AUC
# value, not an accuracy; the message is worded accordingly.
print('We can get ROC AUC of',logistic_clf.best_score_,'using',logistic_clf.best_params_)

### **Fitting the final model with the best parameters obtained from logistic_clf**

In [None]:
# Final logistic regression model.
# NOTE(review): C, class_weight and penalty are hard-coded here -- presumably
# copied from an earlier run's logistic_clf.best_params_. They are not read
# from the search object, so re-check them whenever the grid search changes.
logistic_clf_model=LogisticRegression(C= 0.01, class_weight= 'balanced', penalty='l2',fit_intercept=True, max_iter=10000)
#training the classifier
logistic_clf_model.fit(X_smote,y_smote)

In [None]:
# Predict the classes of the test split with the final model.
y_pred = logistic_clf_model.predict(X_test)

## **Classification Evaluation Metrics for logistic regression**


In [None]:
# Evaluation metrics for the tuned logistic regression on the SMOTE-resampled
# training data and on the test set.
# Training predictions are computed once and reused instead of calling
# predict() for every metric.
train_pred = logistic_clf_model.predict(X_smote)

# NOTE: roc_auc_score is fed hard class labels here (as in the original cell),
# so the value differs from the AUC in the ROC-curve cell, which uses
# predict_proba.
metric_fns = [("accuracy", accuracy_score),
              ("precision", precision_score),
              ("recall score", recall_score),
              ("f1", f1_score),
              ("roc_auc_score", roc_auc_score)]

# Scores as percentages rounded to two decimals
train_scores = [round(fn(y_smote, train_pred) * 100, 2) for _, fn in metric_fns]
test_scores = [round(fn(y_test, y_pred) * 100, 2) for _, fn in metric_fns]

#for train data
for (name, _), score in zip(metric_fns, train_scores):
    print(f"Using logistic regression we get an {name} of train set {score}%")

print("\n")
#for test data
for (name, _), score in zip(metric_fns, test_scores):
    print(f"Using logistic regression we get an {name} of test set {score}%")

# Row layout: five train metrics, an empty spacer, five test metrics
model_comparison['Logistic Regression'] = train_scores + [""] + test_scores




### classification_report of logistic model

In [None]:
# Text report first, then the same report as a DataFrame for plotting
print(classification_report(y_test, y_pred))
report = pd.DataFrame(
    classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
)
print("\n")

# Heatmap of the report with its last row and last column trimmed off
plt.figure(figsize=(18, 3))
plt.subplot(1, 3, 1)
sns.heatmap(report.iloc[:-1, :-1].T, annot=True, cmap='coolwarm')
plt.title('logistic regression Report')

### Confusion matrix

In [None]:
# Confusion matrix of the logistic model on the test set, drawn as a heatmap
cm = confusion_matrix(y_test, y_pred)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")


### ROC curve and feature importance

In [None]:
# ROC curve and AUC for the tuned logistic regression
# column 1 of predict_proba: probability of the positive outcome
probs = logistic_clf_model.predict_proba(X_test)[:, 1]
log_auc = roc_auc_score(y_test, probs)

# points of the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, probs)

# dashed diagonal as reference line, then the model's curve
sns.set_style('whitegrid')
plt.figure(figsize=(10, 6))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(fpr, tpr, marker='.')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title(f"AUC = {round(log_auc, 3)}")
plt.show()

In [None]:
# Feature importance for the tuned logistic regression.
# Estimators with `feature_importances_` use it directly; linear models fall
# back to the absolute value of their fitted coefficients.
if hasattr(logistic_clf_model, "feature_importances_"):
    importance = logistic_clf_model.feature_importances_
else:
    importance = np.abs(logistic_clf_model.coef_[0])
feature = features

# Rank on every path. Previously `indices` was only assigned inside the
# fallback branch, hidden behind a bare `except:` that would also swallow
# unrelated errors.
indices = np.argsort(importance)[::-1]

plt.figure(figsize=(15, 6))
# plotting feature importance, largest first
plt.bar(range(len(indices)), importance[indices])
plt.xticks(range(len(indices)), [feature[i] for i in indices])
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

In [None]:
# Show the column labels of cardio_df (displayed as the cell's output)
cardio_df.columns

## Decision Tree Model

In [None]:
# Split cardio_df into predictors (X) and the target column (y)
features = cardio_df.columns
# every column except the target "cardio_risk"
independent_variables = [col for col in features if col != "cardio_risk"]
# predictors
X = cardio_df[independent_variables]
# target
y = cardio_df["cardio_risk"]

In [None]:
# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3
)
# shapes: (train X, train y) then (test y, test X)
print(
    (X_train.shape, y_train.shape),
    (y_test.shape, X_test.shape),
)

In [None]:
# Resample only the training split with SMOTE; the test split is left untouched
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# Candidate hyperparameters for the decision tree
param_grid = dict(
    max_depth=range(1, 6, 1),
    min_samples_leaf=range(1, 50, 2),
    min_samples_split=range(2, 50, 2),
    criterion=["entropy", "gini"],
)

# number of cross-validation folds
n_folds = 5

# Grid search over the tree, candidates ranked by ROC AUC
model_tree = DecisionTreeClassifier()
grid_search = GridSearchCV(
    estimator=model_tree,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=n_folds,
    verbose=1,
)


In [None]:

# Run the decision-tree grid search on the SMOTE-resampled training data
grid_search.fit(X=X_smote, y=y_smote)

In [None]:
# Per-candidate cross-validation results as a table; show the first five rows
cv_results = pd.DataFrame(data=grid_search.cv_results_)
cv_results.head(5)

In [None]:
# Print the best mean cross-validated score and the refitted best estimator.
# The search is configured with scoring='roc_auc', so the score is ROC AUC,
# not accuracy.
print("best roc_auc", grid_search.best_score_)
print(grid_search.best_estimator_)

In [None]:
# Decision tree with hand-entered hyperparameters, trained on the
# SMOTE-resampled data and then used to label the test rows
tree_clf_model = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=21,
    min_samples_split=44,
)
y_pred = tree_clf_model.fit(X_smote, y_smote).predict(X_test)

## **Classification Evaluation Metrics for Decision Tree**


In [None]:
# Evaluation metrics for the decision tree on the SMOTE-resampled training
# data and on the test set.
# Training predictions are computed once and reused instead of calling
# predict() for every metric.
train_pred = tree_clf_model.predict(X_smote)

# NOTE: roc_auc_score is fed hard class labels here (as in the original cell),
# so the value differs from the AUC in the ROC-curve cell, which uses
# predict_proba.
metric_fns = [("accuracy", accuracy_score),
              ("precision", precision_score),
              ("recall score", recall_score),
              ("f1", f1_score),
              ("roc_auc_score", roc_auc_score)]

# Scores as percentages rounded to two decimals
train_scores = [round(fn(y_smote, train_pred) * 100, 2) for _, fn in metric_fns]
test_scores = [round(fn(y_test, y_pred) * 100, 2) for _, fn in metric_fns]

#for train data
for (name, _), score in zip(metric_fns, train_scores):
    print(f"Using DecisionTreeClassifier we get an {name} of train set {score}%")

print("\n")
#for test data
for (name, _), score in zip(metric_fns, test_scores):
    print(f"Using DecisionTreeClassifier we get an {name} of test set {score}%")

# Row layout: five train metrics, an empty spacer, five test metrics
model_comparison['DecisionTreeClassifier '] = train_scores + [""] + test_scores





In [None]:
# Text report first, then the same report as a DataFrame for plotting
print("classification report")
print(classification_report(y_test, y_pred))
report = pd.DataFrame(
    classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
)
print("\n\n")

# Heatmap of the report with its last row and last column trimmed off
plt.figure(figsize=(18, 3))
plt.subplot(1, 3, 1)
sns.heatmap(report.iloc[:-1, :-1].T, annot=True, cmap='coolwarm')
plt.title('decision tree Report')

In [None]:
# Confusion matrix of the decision tree on the test set, drawn as a heatmap
cm = confusion_matrix(y_test, y_pred)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")


In [None]:
# ROC curve and AUC for the decision tree
# column 1 of predict_proba: probability of the positive outcome
probs = tree_clf_model.predict_proba(X_test)[:, 1]
tree_auc = roc_auc_score(y_test, probs)

# points of the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, probs)

# dashed diagonal as reference line, then the model's curve
sns.set_style('whitegrid')
plt.figure(figsize=(10, 6))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(fpr, tpr, marker='.')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title(f"AUC = {round(tree_auc, 3)}")
plt.show()

In [None]:

# Feature importance for the decision tree.
# Use the estimator's own importances when it has them, otherwise fall back
# to the absolute value of linear coefficients.
if hasattr(tree_clf_model, "feature_importances_"):
    importance = tree_clf_model.feature_importances_
else:
    importance = np.abs(tree_clf_model.coef_[0])

# Label bars with the columns X was built from. `features` holds every column
# of cardio_df, including the target, so it is not guaranteed to line up with
# the model's inputs.
feature = independent_variables

# Rank on every path. Previously `indices` was only assigned in the fallback
# branch, so for this model the plot silently reused the ranking left over
# from an earlier cell.
indices = np.argsort(importance)[::-1]

plt.figure(figsize=(15, 6))
# plotting feature importance, largest first
plt.bar(range(len(indices)), importance[indices])
plt.xticks(range(len(indices)), [feature[i] for i in indices])
plt.title('Feature Importance')
plt.tight_layout()
plt.show()


## Random Forest Classifier

In [None]:

# Candidate hyperparameters for the random forest
param_grid = dict(
    max_depth=range(4, 10, 1),
    min_samples_leaf=range(100, 400, 100),
    min_samples_split=range(200, 500, 100),
    n_estimators=[100, 200, 300],
    max_features=[2, 10],
)
# Seeded base estimator
rf = RandomForestClassifier(random_state=2)
# 3-fold grid search ranked by ROC AUC, using all cores and keeping train scores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)

In [None]:
# Run the random-forest grid search on the SMOTE-resampled training data
grid_search.fit(X=X_smote, y=y_smote)


In [None]:
# Print the best mean cross-validated score and the parameters that produced it.
# The search is configured with scoring='roc_auc', so the score is ROC AUC,
# not accuracy.
print('We can get ROC AUC of', grid_search.best_score_, 'using', grid_search.best_params_)

### **Fitting the final model with the best parameters obtained from grid search.**


In [None]:
# Random forest with hand-entered hyperparameters.
# random_state=2 matches the seed of the estimator used in the grid search,
# so the final forest (and every metric derived from it) is repeatable
# between runs.
random_clf_model = RandomForestClassifier(bootstrap=True,
                                  max_depth=9,
                                  max_features=2,
                                  min_samples_leaf=100,
                                  min_samples_split=200,
                                  n_estimators=300,
                                  random_state=2)

In [None]:
# Train on the SMOTE-resampled data, then label the test rows
y_pred = random_clf_model.fit(X_smote, y_smote).predict(X_test)

## **Classification Evaluation Metrics for Random Forest**


In [None]:
# Evaluation metrics for the random forest on the SMOTE-resampled training
# data and on the test set.
# Training predictions are computed once and reused instead of calling
# predict() for every metric.
train_pred = random_clf_model.predict(X_smote)

# NOTE: roc_auc_score is fed hard class labels here (as in the original cell),
# so the value differs from the AUC in the ROC-curve cell, which uses
# predict_proba.
metric_fns = [("accuracy", accuracy_score),
              ("precision", precision_score),
              ("recall score", recall_score),
              ("f1", f1_score),
              ("roc_auc_score", roc_auc_score)]

# Scores as percentages rounded to two decimals
train_scores = [round(fn(y_smote, train_pred) * 100, 2) for _, fn in metric_fns]
test_scores = [round(fn(y_test, y_pred) * 100, 2) for _, fn in metric_fns]

#for train data
for (name, _), score in zip(metric_fns, train_scores):
    print(f"Using Random forest Classifier we get an {name} of train set {score}%")

print("\n")
#for test data
for (name, _), score in zip(metric_fns, test_scores):
    print(f"Using Random forest Classifier we get an {name} of test set {score}%")

# Row layout: five train metrics, an empty spacer, five test metrics
model_comparison['RandomforestClassifier '] = train_scores + [""] + test_scores




In [None]:
# Text report first, then the same report as a DataFrame for plotting
print(classification_report(y_test, y_pred))
report = pd.DataFrame(
    classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
)
print("\n")

# Heatmap of the report with its last row and last column trimmed off
plt.figure(figsize=(18, 3))
plt.subplot(1, 3, 1)
sns.heatmap(report.iloc[:-1, :-1].T, annot=True, cmap='coolwarm')
plt.title('Random forest Report')

#### Confusion matrix

In [None]:
# Confusion matrix of the random forest on the test set, drawn as a heatmap
cm = confusion_matrix(y_test, y_pred)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")

### ROC curve

In [None]:

# ROC curve and AUC for the random forest
# column 1 of predict_proba: probability of the positive outcome
probs = random_clf_model.predict_proba(X_test)[:, 1]
random_auc = roc_auc_score(y_test, probs)

# points of the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, probs)

# dashed diagonal as reference line, then the model's curve
sns.set_style('whitegrid')
plt.figure(figsize=(10, 6))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(fpr, tpr, marker='.')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title(f"AUC = {round(random_auc, 3)}")
plt.show()

### Feature importance

In [None]:
# Feature importance for the random forest.
# Use the estimator's own importances when it has them, otherwise fall back
# to the absolute value of linear coefficients.
if hasattr(random_clf_model, "feature_importances_"):
    importance = random_clf_model.feature_importances_
else:
    importance = np.abs(random_clf_model.coef_[0])

# Label bars with the columns X was built from. `features` holds every column
# of cardio_df, including the target, so it is not guaranteed to line up with
# the model's inputs.
feature = independent_variables

# Rank on every path. Previously `indices` was only assigned in the fallback
# branch, so for this model the plot silently reused the ranking left over
# from an earlier cell.
indices = np.argsort(importance)[::-1]

plt.figure(figsize=(15, 6))
# plotting feature importance, largest first
plt.bar(range(len(indices)), importance[indices])
plt.xticks(range(len(indices)), [feature[i] for i in indices])
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

## XGBRFClassifier

In [None]:
# XGBoost random-forest classifier used as the base of the grid search.
# `verbosity=0` replaces the `silent` flag that XGBoost dropped in favour of
# `verbosity`.
Xgb_clf_model = XGBRFClassifier(verbosity=0, random_state=3)

# Hyperparameter grid.
# `min_samples_split` / `min_samples_leaf` are scikit-learn tree options, not
# XGBoost parameters; XGBoost ignores them, so searching over them only
# multiplied the number of fits. They are left out of the grid.
grid = {'n_estimators': [150],
        'max_depth': [8, 10],
        'eta': [0.05, 0.08, 0.1]}

# 5-fold grid search, candidates ranked by ROC AUC
xgb = GridSearchCV(Xgb_clf_model, param_grid=grid, scoring='roc_auc', cv=5)

In [None]:
# Run the XGBoost grid search on the SMOTE-resampled training data
xgb.fit(X=X_smote, y=y_smote)

In [None]:
# Print the best mean cross-validated score found by the `xgb` grid search
# and the estimator refitted with the winning parameters.
print("best roc_score", xgb.best_score_)
print(xgb.best_estimator_)

### Fitting the final model with the best parameters obtained from grid search.

In [None]:
# XGBoost random-forest classifier with hand-entered hyperparameters.
# `silent` is replaced by `verbosity=0`, and the scikit-learn-only
# `min_samples_leaf` / `min_samples_split` arguments are dropped because
# XGBoost does not use them.
XGB_clf_model = XGBRFClassifier(eta=0.05, max_depth=10, n_estimators=150, random_state=3, verbosity=0)

In [None]:
# Train on the SMOTE-resampled data, then label the test rows
y_pred = XGB_clf_model.fit(X_smote, y_smote).predict(X_test)

### XGB classifier evaluation metric


In [None]:
# Evaluation metrics for the XGBoost classifier on the SMOTE-resampled
# training data and on the test set.
# Training predictions are computed once and reused instead of calling
# predict() for every metric.
train_pred = XGB_clf_model.predict(X_smote)

# NOTE: roc_auc_score is fed hard class labels here (as in the original cell),
# so the value differs from the AUC in the ROC-curve cell, which uses
# predict_proba.
metric_fns = [("accuracy", accuracy_score),
              ("precision", precision_score),
              ("recall score", recall_score),
              ("f1", f1_score),
              ("roc_auc_score", roc_auc_score)]

# Scores as percentages rounded to two decimals
train_scores = [round(fn(y_smote, train_pred) * 100, 2) for _, fn in metric_fns]
test_scores = [round(fn(y_test, y_pred) * 100, 2) for _, fn in metric_fns]

#for train data
for (name, _), score in zip(metric_fns, train_scores):
    print(f"Using XGB Classifier we get an {name} of train set {score}%")

print("\n")
#for test data
for (name, _), score in zip(metric_fns, test_scores):
    print(f"Using XGB Classifier we get an {name} of test set {score}%")

# Row layout: five train metrics, an empty spacer, five test metrics
model_comparison['XGBClassifier '] = train_scores + [""] + test_scores




In [None]:
# Text report first, then the same report as a DataFrame for plotting
print(classification_report(y_test, y_pred))
report = pd.DataFrame(
    classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
)
print("\n")

# Heatmap of the report with its last row and last column trimmed off
plt.figure(figsize=(18, 3))
plt.subplot(1, 3, 1)
sns.heatmap(report.iloc[:-1, :-1].T, annot=True, cmap='coolwarm')
plt.title('XGB Report')


In [None]:
# Confusion matrix of the XGBoost classifier on the test set, drawn as a heatmap
cm = confusion_matrix(y_test, y_pred)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")

In [None]:
# ROC curve and AUC for the XGBoost classifier
# column 1 of predict_proba: probability of the positive outcome
probs = XGB_clf_model.predict_proba(X_test)[:, 1]
XGB_auc = roc_auc_score(y_test, probs)

# points of the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, probs)

# dashed diagonal as reference line, then the model's curve
sns.set_style('whitegrid')
plt.figure(figsize=(10, 6))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(fpr, tpr, marker='.')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title(f"AUC = {round(XGB_auc, 3)}")
plt.show()


In [None]:

# Feature importance for the XGBoost classifier.
# Use the estimator's own importances when it has them, otherwise fall back
# to the absolute value of linear coefficients.
if hasattr(XGB_clf_model, "feature_importances_"):
    importance = XGB_clf_model.feature_importances_
else:
    importance = np.abs(XGB_clf_model.coef_[0])

# Label bars with the columns X was built from. `features` holds every column
# of cardio_df, including the target, so it is not guaranteed to line up with
# the model's inputs.
feature = independent_variables

# Rank on every path. Previously `indices` was only assigned in the fallback
# branch, so for this model the plot silently reused the ranking left over
# from an earlier cell.
indices = np.argsort(importance)[::-1]

plt.figure(figsize=(15, 6))
# plotting feature importance, largest first
plt.bar(range(len(indices)), importance[indices])
plt.xticks(range(len(indices)), [feature[i] for i in indices])
plt.title('Feature Importance')
plt.tight_layout()
plt.show()



## Support vector machine

In [None]:
# SVM estimator used as the base of the grid search; probability=True so
# predict_proba is available.
svm_clf_model = SVC(random_state=0, probability=True)

# Hyperparameter grid
grid = {'kernel': ["linear", "rbf", "poly", "sigmoid"],
        'C': [0.1, 1, 10, 100],
        'max_iter': [1000]}

# 5-fold grid search. Without an explicit `scoring`, GridSearchCV uses the
# estimator's default score (accuracy for classifiers). Every other search in
# this notebook ranks by ROC AUC, and the next cell labels the result
# "best roc_score", so rank by ROC AUC here too.
svc_search = GridSearchCV(svm_clf_model, param_grid=grid, scoring='roc_auc', cv=5)


In [None]:
# Run the SVM grid search on the SMOTE-resampled training data
svc_search.fit(X=X_smote, y=y_smote)

In [None]:
# Print the best mean cross-validated score found by `svc_search` (in whatever
# metric the search's `scoring` is set to) and the estimator refitted with the
# winning parameters.
print("best roc_score", svc_search.best_score_)
print(svc_search.best_estimator_)

In [None]:
# SVM classifier with hand-entered hyperparameters
svc_clf_model = SVC(
    kernel='poly',
    C=0.1,
    max_iter=1000,
    probability=True,
    random_state=0,
)

In [None]:
# Train on the SMOTE-resampled data, then label the test rows
y_pred = svc_clf_model.fit(X_smote, y_smote).predict(X_test)

### Evaluation metric for SVM

In [None]:
# Evaluation metrics for the SVM classifier on the SMOTE-resampled training
# data and on the test set.
# Training predictions are computed once and reused instead of calling
# predict() for every metric.
train_pred = svc_clf_model.predict(X_smote)

# NOTE: roc_auc_score is fed hard class labels here (as in the original cell),
# so the value differs from the AUC in the ROC-curve cell, which uses
# predict_proba.
metric_fns = [("accuracy", accuracy_score),
              ("precision", precision_score),
              ("recall score", recall_score),
              ("f1", f1_score),
              ("roc_auc_score", roc_auc_score)]

# Scores as percentages rounded to two decimals
train_scores = [round(fn(y_smote, train_pred) * 100, 2) for _, fn in metric_fns]
test_scores = [round(fn(y_test, y_pred) * 100, 2) for _, fn in metric_fns]

#for train data
for (name, _), score in zip(metric_fns, train_scores):
    print(f"Using SVM Classifier we get an {name} of train set {score}%")

print("\n")
#for test data
for (name, _), score in zip(metric_fns, test_scores):
    print(f"Using SVM Classifier we get an {name} of test set {score}%")

# Row layout: five train metrics, an empty spacer, five test metrics
model_comparison['SVMClassifier '] = train_scores + [""] + test_scores




In [None]:
# Text classification report for the SVM predictions, then the same report
# as a DataFrame for plotting
print(classification_report(y_test, y_pred))
report = pd.DataFrame(classification_report(y_pred=y_pred, y_true=y_test, output_dict=True))
print("\n")

# Heatmap of the report with its last row and last column trimmed off
plt.figure(figsize=(18, 3))
plt.subplot(1, 3, 1)
sns.heatmap(report.iloc[:-1, :-1].T, annot=True, cmap='coolwarm')
# The title previously read 'XGB Report', copied from the XGBoost section
plt.title('SVM Report')

In [None]:
# Confusion matrix of the SVM classifier on the test set, drawn as a heatmap
cm = confusion_matrix(y_test, y_pred)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")

In [None]:
# ROC curve and AUC for the SVM classifier
# column 1 of predict_proba: probability of the positive outcome
probs = svc_clf_model.predict_proba(X_test)[:, 1]
svc_auc = roc_auc_score(y_test, probs)

# points of the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, probs)

# dashed diagonal as reference line, then the model's curve
sns.set_style('whitegrid')
plt.figure(figsize=(10, 6))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(fpr, tpr, marker='.')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title(f"AUC = {round(svc_auc, 3)}")
plt.show()


## Model comparison

In [None]:
# Build the comparison table: one row per model, one column per metric.
# The "-------" column is the empty spacer between train and test metrics.
Model_comparision = pd.DataFrame(model_comparison).T
Model_comparision.columns = [
    'accuracy_score_train_set',
    "precision_score_train_set",
    "recall_score_train_set",
    "f1_score_train_set",
    "roc_auc_score_train_set",
    "-------",
    "accuracy_score_test_set",
    "precision_score_test_set",
    "recall_score_test_set",
    "f1_score_test_set",
    "roc_auc_score_test_set",
]

In [None]:
# Show train and test recall side by side for every model
Model_comparision.loc[:, ["recall_score_train_set", "recall_score_test_set"]]