<a href="https://colab.research.google.com/github/nuraishasb/chasingflights/blob/main/Python_SupervisedML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1 | Introduction

In this report, we experiment with machine learning methods, mainly Supervised Learning, on the medical insurance data set (`insurance.csv`). We begin with the Regression analysis, which predicts `charges`, before moving on to the Classification analysis, which predicts `smoker`.

# 2 | Regression Task


In [None]:
# Data visualisation
import pandas as pd # analyze data
import numpy as np # work with arrays
import scipy.stats as stats # for statistical procedures such as t-test etc.
import seaborn as sns # data visualisation library
import matplotlib.pyplot as plt

#libraries used for handling missing and non-numeric values - part of data cleaning
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler,MinMaxScaler

#Pipeline : Chains all steps of the workflow for a more streamlined procedure.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


#Feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression,chi2,f_classif

#Training data
from sklearn.model_selection import train_test_split #Split arrays or matrices into random train and test subsets.
from sklearn.model_selection import cross_val_score # Evaluate a score by cross-validation.
from sklearn.model_selection import StratifiedShuffleSplit #Random permutation cross-validator
from sklearn.model_selection import GridSearchCV # Exhaustive search over specified parameter values for an estimator

#linear regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
#Decision tree and SVM
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC

#R2 square and mean squared error
from sklearn import metrics
from sklearn.metrics import r2_score,mean_squared_error

#KNN classifier
from sklearn.neighbors import KNeighborsClassifier

# Gaussian Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB

# Logistic Regression
from sklearn.linear_model import LogisticRegression

#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

#SVM
from sklearn.svm import SVC

### 2.1: EDA

We use exploratory data analysis (EDA) to investigate data sets and summarize their main characteristics, which can be done by employing data visualization methods.

In [None]:
# Mount Google Drive (Colab only) and read the insurance CSV into `df`.
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

insurance_csv = '/content/drive/My Drive/DAC/insurance.csv'
df = pd.read_csv(insurance_csv)

In [None]:
# checking for duplicates + remove duplicates
# (the count is printed before and after so the effect of the removal is visible)
print(df.duplicated().sum())

df = df.drop_duplicates()

print(df.duplicated().sum())

# check for null values
# (printed explicitly: a notebook cell only auto-displays its last expression)
print(df.isnull().sum())

# remove all the outliers from the data.
# A value is an outlier when it is below Q1-(1.5*IQR) or above Q3+(1.5*IQR).
# Quantiles and the </> comparisons are only meaningful for numeric columns, so
# the fences are computed on those alone; the text columns that get encoded
# later ('sex', 'region', 'smoker') would otherwise raise a TypeError.
numeric_cols = df.select_dtypes(include='number').columns

Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = (df[numeric_cols] < lower_bound) | (df[numeric_cols] > upper_bound)

# Keep only the rows without any outlier value and without missing values.
# .copy() makes `data` an independent frame, so later cells can assign encoded
# columns into it without touching `df`.
data = df.loc[~outliers.any(axis=1)].dropna().copy()

# Pass the frame by keyword so it is read as wide-form data on every seaborn version.
sns.boxplot(data=data)

### 2.2: Prediction of Insurance Charges

In [None]:
# assign all the categorical data with numerical values using OrdinalEncoder
# 'age' is already numeric and is deliberately left as-is: ordinal-encoding it
# would replace every age by its rank among the distinct ages, not its value.
categorical_cols = ['sex', 'region', 'smoker']

ordinal_encoder = OrdinalEncoder()
# Each column is encoded independently, so the codes match per-column encoding.
data[categorical_cols] = ordinal_encoder.fit_transform(data[categorical_cols])

# feature selection - split the data into features and targets. We want the target/dependent variable to be 'charges'.
target = data['charges']
features = data.loc[:, data.columns != 'charges']

# Score every feature against the target with the univariate F-test.
# k = 'all' keeps every feature, so this step reports rather than filters.
selector = SelectKBest(f_regression, k = 'all')
selector.fit(features, target)

supports = selector.get_support()

print(supports)
print(features.columns)

# Names of the features the selector kept (boolean mask -> column names)
selected_features = [feature for feature, support in zip(features.columns, supports) if support]

print('Selected features are: ', selected_features)

# split the feature and target into train and test (70% / 30%, fixed seed for reproducibility)
X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size = 0.3, random_state = 13)

#### 2.2.1: Linear Regression

The linear regression model is a basic and commonly used type of predictive analysis: it assumes a linear relationship between the independent variables and the dependent variable, and aims to find the best-fitting line that describes that relationship.

In [None]:
# train them in these 3 regressions and find their respective root mean squared error and coefficient of determination (r^2)
# simple linear regression
model = LinearRegression()
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

# Evaluate on the held-out test split
r2_lr = r2_score(Y_test, predictions)
mse_lr = mean_squared_error(Y_test, predictions)
rmse_lr = np.sqrt(mse_lr)  # root of the MSE, i.e. in the same units as the target

print('R2 score of the regression model is: ', r2_lr)
# Report the RMSE itself; the label previously printed the un-rooted MSE.
print('RMSE score of the regression model is: ', rmse_lr)

#does not take into account multicollinearity

# Actual vs predicted values: points close to the diagonal are good predictions.
plt.scatter(Y_test, predictions, alpha = 0.7)
plt.xlabel('Actual charges')
plt.ylabel('Predicted charges')
plt.title('Simple Linear Regression: actual vs predicted');

#### 2.2.2: Ridge Regression

Ridge regression is a regularised form of linear regression that is used to analyse data that suffers from multicollinearity. We fit scikit-learn's `Ridge` estimator with regularisation strength `alpha = 0.5`.

In [None]:
# ridge regression
# fit() returns the fitted estimator, so construction and fitting are chained.
model = Ridge(alpha = 0.5).fit(X_train, Y_train)
predictions = model.predict(X_test)

# Test-set metrics: MSE first, then its root, then R^2.
mse_rr = mean_squared_error(Y_test, predictions)
rmse_rr = np.sqrt(mse_rr)
r2_rr = r2_score(Y_test, predictions)

print('R2 score of the regression model is: ', r2_rr)
print('RMSE score of the regression model is: ', rmse_rr)

#### 2.2.3: Lasso Regression

Lasso regression is another regularised form of linear regression, similar to ridge, that is used to analyse data with multicollinearity present. We fit scikit-learn's `Lasso` estimator, also with `alpha = 0.5`.

In [None]:
# lasso regression
model = Lasso(alpha = 0.5)
model.fit(X_train, Y_train)

predictions = model.predict(X_test)
r2_lr = r2_score(Y_test, predictions)
mse_lr = mean_squared_error(Y_test, predictions)
rmse_lr = np.sqrt(mse_lr)

print('R2 score of the regression model is: ', r2_lr)
print('RMSE score of the regression model is: ', rmse_lr)

### 2.3: Final Analyisis

In [None]:
# creating a report
models = pd.DataFrame({
    'Model': ['Simple Linear Regression', 'Ridge Regression', 'Lasso Regression'],
    'R2 Score': [r2_lr, r2_rr, r2_lr],
    'RMSE Score': [rmse_lr, rmse_rr, rmse_lr]})
models

# 3 | Classification Task


In [None]:
# This time let “smokers” be the target, the rest as the features

target = data['smoker']
features = data.loc[:, data.columns != 'smoker']

# 'smoker' is a class label, so the features are scored with the ANOVA F-test
# for classification (f_classif) rather than the regression F-test.
# k = 'all' keeps every feature, so this step reports rather than filters.
selector = SelectKBest(f_classif, k = 'all')
selector.fit(features, target)

supports = selector.get_support()

print(supports)
print(features.columns)

# Names of the features the selector kept (boolean mask -> column names)
selected_features = [feature for feature, support in zip(features.columns, supports) if support]

print('Selected features are: ', selected_features)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    target,
    test_size = 0.3,
    random_state = 31,
)

In [None]:
# Classify the “smokers” using: (1) K Nearest Neighbour

# fit() returns the fitted estimator, so construction and fitting are chained.
KNN = KNeighborsClassifier(n_neighbors = 5).fit(X_train, Y_train)
Y_pred = KNN.predict(X_test)

# Test-set accuracy as a percentage, rounded to two decimals.
knn_hit_rate = metrics.accuracy_score(Y_test, Y_pred)
accuracy_KNN = round(knn_hit_rate * 100, 2)
print('Accuracy of KNN is ', accuracy_KNN)

In [None]:
# (2) Logistic Regression

# fit() returns the fitted estimator, so construction and fitting are chained.
LR = LogisticRegression().fit(X_train, Y_train)
Y_pred = LR.predict(X_test)

# Test-set accuracy as a percentage, rounded to two decimals.
lr_hit_rate = metrics.accuracy_score(Y_test, Y_pred)
accuracy_LR = round(lr_hit_rate * 100, 2)
print('Accuracy of LR is ', accuracy_LR)

In [None]:
# (3) Decision Tree Classifier

# A fixed random_state makes the fitted tree, and therefore the reported
# accuracy, identical on every run; the seed matches the one used for the
# train/test split in this section.
DTC = DecisionTreeClassifier(random_state = 31)

DTC.fit(X_train, Y_train)

Y_pred = DTC.predict(X_test)

# Test-set accuracy as a percentage, rounded to two decimals.
accuracy_DTC = round(metrics.accuracy_score(Y_test, Y_pred)*100, 2)
print('Accuracy of DTC is ', accuracy_DTC)

In [None]:
# (4) Support Vector Machine

# The fitted model gets its own name. Binding it to `SVC` would shadow the
# imported class, and re-running this cell would then fail with
# "'SVC' object is not callable".
svc_model = SVC()

svc_model.fit(X_train, Y_train)

Y_pred = svc_model.predict(X_test)

# Test-set accuracy as a percentage, rounded to two decimals.
accuracy_SVC = round(metrics.accuracy_score(Y_test, Y_pred)*100, 2)
print('Accuracy of SVC is ', accuracy_SVC)

In [None]:
# (5) Naive Bayes Classifier

# fit() returns the fitted estimator, so construction and fitting are chained.
Gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = Gaussian.predict(X_test)

# Test-set accuracy as a percentage, rounded to two decimals.
nb_hit_rate = metrics.accuracy_score(Y_test, Y_pred)
accuracy_Gaussian = round(nb_hit_rate * 100, 2)
print('Accuracy of Gaussian is ', accuracy_Gaussian)

In [None]:
# Produce a report for Step 5, sorting the accuracy values in descending order

models = pd.DataFrame({
    'Model':['KNN', 'Naive Bayes', 'Logistic Regression','Decision Tree Classifier', 'Support Vector Machines'],
    'Score':[accuracy_KNN, accuracy_Gaussian, accuracy_LR, accuracy_DTC, accuracy_SVC]
})

# Best model first; the index is rebuilt so it reflects the ranking.
models = models.sort_values(by = 'Score', ascending = False).reset_index(drop = True)

models