# ***Lung 🫁 cancer Prediction using Machine Learning Techniques*** 

---
---
We propose a machine-learning model to predict the level of lung cancer. Our goal is to build an efficient model that predicts a patient's lung cancer risk level from the available features, such as index, Patient Id, Age, Gender, Air Pollution, Alcohol use, Dust Allergy, Occupational Hazards, Genetic Risk, Chronic Lung Disease, Balanced Diet, Obesity, Smoking, Passive Smoker and so on.













---

> **Importing libraries**

---


In [None]:
# importing libraries
import pandas as pd # data processing
import numpy as np # linear algebra
import matplotlib.pyplot as plt # visualization
import graphviz

%matplotlib inline
from IPython.display import Image
from itertools import product
import seaborn as sns
# increases the size of sns plots
sns.set(rc={'figure.figsize':(8,6)})
from sklearn.linear_model import Lasso,LassoCV,LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.feature_selection import RFE 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score,GridSearchCV
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, roc_curve, auc, classification_report, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler,StandardScaler,MaxAbsScaler,RobustScaler
import warnings
warnings.filterwarnings('ignore')




---
**Mounting google drive**


---




In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files under that path (Colab-only: relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive')



---
**Accessing dataset available in the google drive and exploring the dataset.**

---





In [None]:
# Load the raw patient records from Drive into a pandas DataFrame.
df = pd.read_csv('/content/drive/MyDrive/CSE499/cancer-patient-data-sets.csv')
shape_report = 'Data Frame Shape: \n{}'.format(df.shape)
print(shape_report)
print('First few instances of the dataset: ')
# Last expression of the cell: renders the first five rows.
df.head()

In [None]:
# Display the column labels of the raw dataset.
df.columns

In [None]:
# Report how many distinct values each column holds. The values themselves
# are listed only for low-cardinality columns (fewer than 10 distinct values).
for column in df:
  distinct = df[column].unique()
  count = len(distinct)

  message = 'The number of values for feature {} :{}'.format(column, count)
  if count < 10:
    message = 'The number of values for feature {} :{} -- {}'.format(column, count, distinct)
  print(message)

---
---

# ***Data preprocessing*** 

---
---

We want to preprocess data with some tasks like checking null values, converting data types, watching the importance of features by Pearson correlation, the Lasso Regression model, and Recursive Feature Elimination.

In [None]:
# Count the missing (null) entries in each column.
df.isnull().sum()

In [None]:
# Show the pandas dtype of every column.
df.dtypes

In [None]:
# Collect the names of all columns stored with dtype "object"; these are the
# columns that the label-encoding step iterates over.
objectList = df.select_dtypes(include = "object").columns
print (objectList)



---

**We want to do label Encoding to convert categorical data to numeric data. So, all the instances will be numerical.**

---





In [None]:
# Label-encode every object-typed column in place so the frame is fully numeric.
# The same encoder instance is re-fitted for each column in turn.
encoder = LabelEncoder()

for obj in objectList:
    as_text = df[obj].astype(str)
    df[obj] = encoder.fit_transform(as_text)

print(df.info())

In [None]:
# Persist the label-encoded frame to Drive as CSV.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra leading column — confirm this is intended.
df.to_csv('/content/drive/MyDrive/CSE499/cancer-patient-data-sets(labelencoded).csv')

In [None]:
# Split the frame into the label column ('Level') and everything else.
target = df['Level']
attribute = df.drop(columns = ['Level'])
for label, part in (('Attribute Shape: ', attribute), ('Target Shape: ', target)):
    print(label, part.shape)

In [None]:
# Count the instances of each target class to check whether the classes are
# balanced or imbalanced.
target.value_counts()

In [None]:
# Display the first five rows of the attribute (feature) frame.
attribute.head()

In [None]:
# Display the first five values of the target series.
target.head()

In [None]:
# Train/test split: 70% of the rows for training and 30% for testing.
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(attribute, target, train_size = 0.7, test_size = 0.3, random_state = 0)

In [None]:
# Print the shapes of the feature and label parts of each split.
split_parts = (
    ('For training: ', X_train, y_train),
    ('\nFor testing: ', X_test, y_test),
)
for heading, features, labels in split_parts:
    print(heading)
    print('Attribute Shape: ', features.shape)
    print('Target Shape: ', labels.shape)

In [None]:
# Pearson correlation between every pair of columns (DataFrame.corr default),
# rendered as an annotated heatmap. `correlation` is reused by the next cell.
plt.figure(figsize=(25, 25))
correlation = df.corr()
sns.heatmap(correlation, annot=True, cmap=plt.cm.CMRmap_r)
plt.show()

In [None]:
# Rank the columns by the absolute value of their correlation with 'Level'.
# Position 0 of the sorted series is skipped so that the target's correlation
# with itself is not listed.
corr_abs = correlation['Level'].abs()
corr_abs_sorted = corr_abs.sort_values(ascending=False)
print('Most important features:\n', corr_abs_sorted.iloc[1:26])

In [None]:
# Fit LassoCV on the training split; it selects the regularisation strength
# (alpha) by 5-fold cross-validation.
# NOTE(review): Lasso is a regression model and y_train holds label-encoded
# class codes, so the codes are treated as numeric values — confirm this is
# acceptable for the feature-screening purpose here.
lasso_cv = LassoCV(cv=5)
lasso_cv.fit(X_train, y_train)

# Print the optimal value of alpha
print("Optimal alpha:", lasso_cv.alpha_)


In [None]:
# Fit a plain Lasso model on the training split, using the alpha that the
# cross-validated search selected.
lasso = Lasso(alpha=lasso_cv.alpha_)
lasso.fit(X_train, y_train)

In [None]:
# List each training column next to its fitted Lasso coefficient.
for name, weight in zip(X_train.columns, lasso.coef_):
    print(f"Feature : {name}, Coefficient values: {weight:.2f}")

In [None]:
# Bar chart of the Lasso coefficients, one bar per training column.
bar_positions = range(len(lasso.coef_))
tick_labels = [f"{X_train.columns[pos]}" for pos in bar_positions]

plt.figure(figsize=(25,5))
plt.bar(bar_positions, lasso.coef_)
plt.xticks(bar_positions, tick_labels, rotation=90)
plt.title("Lasso Regression Feature Importance")
plt.show()

In [None]:
# Create a logistic regression estimator (library defaults) that RFE will
# repeatedly fit while eliminating features.
estimator = LogisticRegression()

# Create a Recursive Feature Elimination (RFE) object
rfe = RFE(estimator)

# Define the hyperparameter grid: how many features RFE keeps and how many
# it drops per elimination step.
param_grid = {
    'n_features_to_select': [5, 10, 15],
    'step': [1, 2, 3]
}

# Perform a grid search over the RFE settings with 10-fold cross-validation
grid_search = GridSearchCV(rfe, param_grid, cv=10)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and corresponding score
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


In [None]:
# Create a Recursive Feature Elimination (RFE) object with the best
# hyperparameters. They are read from the grid search result instead of being
# typed in by hand, so this cell cannot drift out of sync with the search.
rfe = RFE(estimator, **grid_search.best_params_)

# Fit the RFE object to the data
rfe.fit(X_train, y_train)

In [None]:
# List the RFE rank assigned to every training column.
print("Feature rankings:")
for name, rank in zip(X_train.columns, rfe.ranking_):
    print(f"Feature :'{name}', ranking: {rank}")

In [None]:
# Bar chart of the RFE ranks, one bar per training column.
rank_positions = range(len(rfe.ranking_))
rank_labels = [f"{X_train.columns[pos]}" for pos in rank_positions]

plt.figure(figsize=(10, 5))
plt.bar(rank_positions, rfe.ranking_)
plt.xticks(rank_positions, rank_labels, rotation=90)
plt.title("RFE Feature Ranking")
plt.show()



---

**After data preprocessing, we want to remove the 'index' and 'Patient Id' features. Because these values are unique to each record, they act as identifiers rather than predictors: they can add noise and cause overfitting. Lack of meaningful variability is another issue here.**

---




In [None]:
# Build a new frame without the two identifier columns ('index' and
# 'Patient Id'); the original df is left untouched.
new_df = df.drop(columns = ['index','Patient Id'])
print('New Data Frame Shape: ', new_df.shape)

In [None]:
# Persist the filtered frame to Drive as CSV.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra leading column — confirm this is intended.
new_df.to_csv('/content/drive/MyDrive/CSE499/cancer-patient-data-sets(filtered).csv')



---

**Exploring new Dataset**

---





In [None]:
# Split the filtered frame into the label column ('Level') and the features.
target = new_df['Level']
attribute = new_df.drop(columns = ['Level'])
for label, part in (('Attribute Shape: ', attribute), ('Target Shape: ', target)):
    print(label, part.shape)

In [None]:
# Re-split the filtered data: 70% for training and 30% for testing, with a
# fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(attribute, target, train_size = 0.7, test_size = 0.3, random_state = 0)

In [None]:
# Print the shapes of the feature and label parts of each split.
split_parts = (
    ('For training: ', X_train, y_train),
    ('\nFor testing: ', X_test, y_test),
)
for heading, features, labels in split_parts:
    print(heading)
    print('Attribute Shape: ', features.shape)
    print('Target Shape: ', labels.shape)

---
---

# ***Decision Tree*** 

---
---

We want to run the "Decision Tree" classifier on the dataset. Initially, we will train the model without tuned hyperparameters. Later, we will train it again with the best hyperparameters.

In [None]:
# Baseline decision tree with library-default hyperparameters, fitted on the
# training split.
# NOTE(review): no random_state is passed, so results may vary between runs.
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)


---
**The Below code will generate the .dot file of the graph. Using this file, we can export the image that is available in the posted link:** 
*`https://dreampuf.github.io/GraphvizOnline`*

---

In [None]:
# Graph available in: https://dreampuf.github.io/GraphvizOnline

# Export the fitted tree as Graphviz DOT text, labelled with the real feature
# and class names, then save it to Drive and wrap it in a graphviz.Source.
feature_names = list(new_df.drop('Level', axis=1).columns)
# export_graphviz expects class names in ascending class order, which is the
# order of dtree.classes_ (not the order of first appearance in the data).
class_names = [str(label) for label in dtree.classes_]

# out_file=None makes export_graphviz return the DOT source as a string;
# with a file path it returns None, leaving nothing to build the Source from.
dot_data = tree.export_graphviz(dtree, out_file=None,
                                feature_names=feature_names,
                                class_names=class_names)
with open('/content/drive/MyDrive/CSE499/Decision tree.dot', 'w', encoding='utf-8') as dot_file:
    dot_file.write(dot_data)
graph = graphviz.Source(dot_data)


In [None]:
# Display a PNG of the tree stored on Drive. This cell only loads an existing
# file; it does not render the image itself.
Image(filename='/content/drive/MyDrive/CSE499/graphviz.png')


---

**Calculating the training and Testing Accuracy of the Model**

---

In [None]:
# Mean accuracy of the decision tree on the training split
print("Training Accuracy is: ", dtree.score(X_train, y_train))
# Mean accuracy of the decision tree on the held-out test split
print("Testing Accuracy is: ", dtree.score(X_test, y_test))

---

**After model training, we want to check the importance of each feature.**

---

In [None]:
# Finding importance of each feature

# Pair every feature column with the importance score of the fitted tree.
feature_columns = list(new_df.drop('Level', axis=1).columns)
for column, score in zip(feature_columns, dtree.feature_importances_):
  print('Importance of feature {}:, {:.3f}'.format(column, score))

# Build the table in one step. The table is created fresh on every run, so
# re-running the cell (or running it again for another model) never appends
# to rows left over from an earlier run.
final_feature_imp = pd.DataFrame({'Variable': feature_columns,
                                  'Feature Importance Score': dtree.feature_importances_})

# Ordering the data (reset_index keeps the original feature position in an
# 'index' column)
final_feature_imp = final_feature_imp.sort_values('Feature Importance Score', ascending = False).reset_index()
final_feature_imp

---

**We select to apply 10-fold and 5-fold cross-validation to see if we find different results**

---

In [None]:
# 10-fold cross-validation of the decision tree over the full attribute/target data
kfold_validation = KFold(n_splits = 10)
results = cross_val_score(estimator = dtree, X = attribute, y = target, cv = kfold_validation)
print(results)
mean_score, score_spread = results.mean(), results.std()
print ('\nResults = ', mean_score, '+/-', score_spread)

In [None]:
# 5-fold cross-validation of the decision tree over the full attribute/target data
kfold_validation = KFold(n_splits = 5)
results = cross_val_score(estimator = dtree, X = attribute, y = target, cv = kfold_validation)
print(results)
mean_score, score_spread = results.mean(), results.std()
print ('\nResults = ', mean_score, '+/-', score_spread)

---

**We will try to find the best hyperparameters. Using grid search, we will tune the Hyperparameters.**

---

In [None]:
# Hyperparameter grid for the decision tree.
# 'auto' is intentionally not listed under max_features: for
# DecisionTreeClassifier it was only an alias of 'sqrt', and it is rejected by
# scikit-learn releases that removed the deprecated value.
param_grid = {'criterion': ['gini', 'entropy'],
              'max_depth': [2, 4, 6, 8, 10],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4],
              'max_features': ['sqrt', 'log2']}

# Exhaustive search over the grid with 5-fold cross-validation on the training split
grid = GridSearchCV(dtree, param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best hyperparameters:", grid.best_params_)
print("Best score:", grid.best_score_)
# Score the refitted best model on the held-out test split (cell output)
best_model = grid.best_estimator_
best_model.score(X_test, y_test)


---

**We will apply these parameters to process the Model again. It is listed below:**


`{'criterion': 'gini', 'max_depth': 6, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2}`

---


In [None]:
# Decision Tree Model refitted with the tuned hyperparameters.
# max_features='sqrt' is what the former 'auto' value meant for a
# DecisionTreeClassifier; 'auto' itself is rejected by scikit-learn releases
# that removed the deprecated value.
dtree = DecisionTreeClassifier(criterion = 'gini', max_depth = 6, max_features = 'sqrt', min_samples_leaf = 1, min_samples_split = 2)
dtree.fit(X_train, y_train)

In [None]:
# Graph available in: https://dreampuf.github.io/GraphvizOnline
# Export the tuned tree as Graphviz DOT text, labelled with the real feature
# and class names, then save it to Drive and wrap it in a graphviz.Source.
feature_names = list(new_df.drop('Level', axis=1).columns)
# export_graphviz expects class names in ascending class order, which is the
# order of dtree.classes_ (not the order of first appearance in the data).
class_names = [str(label) for label in dtree.classes_]

# out_file=None makes export_graphviz return the DOT source as a string;
# with a file path it returns None, leaving nothing to build the Source from.
dot_data = tree.export_graphviz(dtree, out_file=None,
                                feature_names=feature_names,
                                class_names=class_names)
with open('/content/drive/MyDrive/CSE499/Decision tree(Another).dot', 'w', encoding='utf-8') as dot_file:
    dot_file.write(dot_data)
graph = graphviz.Source(dot_data)

In [None]:
# Display a PNG of the tuned tree stored on Drive. This cell only loads an
# existing file; it does not render the image itself.
Image(filename='/content/drive/MyDrive/CSE499/DT.png')

In [None]:
# Mean accuracy of the tuned decision tree on the training split
print("Training Accuracy is: ", dtree.score(X_train, y_train))
# Mean accuracy of the tuned decision tree on the held-out test split
print("Testing Accuracy is: ", dtree.score(X_test, y_test))


---

**Initially, Training and Testing Accuracy was 100%. After processing with the best Hyperparameters we got 97.42% for training and 96.00% for testing.**

---

In [None]:
# Finding importance of each feature

# Pair every feature column with the importance score of the fitted tree.
feature_columns = list(new_df.drop('Level', axis=1).columns)
for column, score in zip(feature_columns, dtree.feature_importances_):
  print('Importance of feature {}:, {:.3f}'.format(column, score))

# Build the table in one step. The table is created fresh on every run, so
# scores of a previously evaluated model that are still held in
# final_feature_imp are never mixed into this result.
final_feature_imp = pd.DataFrame({'Variable': feature_columns,
                                  'Feature Importance Score': dtree.feature_importances_})

# Ordering the data (reset_index keeps the original feature position in an
# 'index' column)
final_feature_imp = final_feature_imp.sort_values('Feature Importance Score', ascending = False).reset_index()
final_feature_imp

---

**We select to apply 10-fold and 5-fold cross-validation again  to see if we find different results**

---

In [None]:
# 10-fold cross-validation of the tuned tree over the full attribute/target data
kfold_validation = KFold(n_splits = 10)
results = cross_val_score(estimator = dtree, X = attribute, y = target, cv = kfold_validation)
print(results)
mean_score, score_spread = results.mean(), results.std()
print ('\nResults = ', mean_score, '+/-', score_spread)

In [None]:
# 5-fold cross-validation of the tuned tree over the full attribute/target data
kfold_validation = KFold(n_splits = 5)
results = cross_val_score(estimator = dtree, X = attribute, y = target, cv = kfold_validation)
print(results)
mean_score, score_spread = results.mean(), results.std()
print ('\nResults = ', mean_score, '+/-', score_spread)

---

**Developing a Function for generating Confusion Matrix**

---

In [None]:
# Confusion Matrix plotting helper (seaborn heatmap)
def plot_confusion_matrix(cm, classes=None, title='Confusion matrix'):
  """Draw `cm` as a seaborn heatmap on the current figure.

  The colour scale is fixed to the range 0..1. When `classes` is given it is
  used for the tick labels on both axes and every cell is annotated with its
  value; otherwise a plain heatmap is drawn.
  """
  heatmap_options = {'vmin': 0., 'vmax': 1.}
  if classes is not None:
    heatmap_options.update(xticklabels=classes, yticklabels=classes,
                           annot=True, annot_kws={'size':30})
  sns.heatmap(cm, **heatmap_options)

  plt.title(title)
  plt.ylabel('True label')
  plt.xlabel('Predicted label')

---
**Visualizing Confusion Matrix graph**

---

In [None]:
# Predict on the training split with the fitted decision tree
y_pred = dtree.predict(X_train)

# Confusion matrix of the training predictions (raw counts); the following
# cells display, normalise and plot it.
cmatrix = confusion_matrix(y_train, y_pred)

In [None]:
# Display the raw training confusion matrix.
cmatrix

In [None]:
# Normalise every row by its total so each cell is a fraction of the true
# class, then plot the result.
row_totals = cmatrix.sum(axis=1, keepdims=True)
cmatrix_norm = cmatrix / row_totals
plt.figure()
plot_confusion_matrix(cmatrix_norm, classes=dtree.classes_, title='Training confusion')


---
**Calculating some evaluation metric**

---

In [None]:
# Per-class False Positives (FP), False Negatives (FN), True Positives (TP)
# and True Negatives (TN), derived one-vs-rest from the confusion matrix:
# column sums count predictions of a class, row sums count true members.
FP = cmatrix.sum(axis=0) - np.diag(cmatrix)
FN = cmatrix.sum(axis=1) - np.diag(cmatrix)
TP = np.diag(cmatrix)
TN = cmatrix.sum() - (FP + FN + TP)

# precision or positive predictive value
precision = TP / (TP + FP)
print('Precision per class: ', precision)

# sensitivity, recall or true positive rate
recall = TP / (TP + FN)
print('Recall per class: ', recall)

# false positive rate
fpr = FP / (FP + TN)
print('False positive rate per class: ', fpr)

# false negative rate
fnr = FN / (TP + FN)
print('False negative rate per class: ', fnr)

# classification error
c_error = (FP + FN) / (TP + FP + FN + TN)
print('The classification error of each class: ' ,c_error)

# overall accuracy
accuracy = (TP + TN) / (TP + FP + FN + TN)
print('The accuracy of each class: ' ,accuracy)

# Macro averages: .mean() divides by the actual number of classes in the
# confusion matrix instead of a hard-coded 3.
print('\nAverage Recall : ' ,recall.mean())
print('Average Precision : ' ,precision.mean())
print('Average Miss Rate : ' ,fnr.mean())
print('Average Classification error : ' ,c_error.mean())
print('Average accuracy : ' ,accuracy.mean())

---
**We got results listed below:**

*   Average Recall :  97.05%
*   Average Precision :  97.66%
*   Average Miss Rate :  2.94%
*   Average Classification error :  1.71%
*   Average accuracy :  98.28%

---

---
---

# ***Random Forest*** 

---
---
We want to run the "Random Forest" classifier on the dataset. We calculate the best parameters using grid search and randomized search. Then we fit the model with these parameters to calculate the training and testing accuracy.

In [None]:
# Default forest instance (not referenced by the search below).
forest = RandomForestClassifier()

# Estimator handed to the grid search.
rf = RandomForestClassifier()

# Candidate values for each hyperparameter of the search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid with 5-fold cross-validation on the training split.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best hyperparameters: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))




In [None]:
# Build a fresh Random Forest from the winning grid-search settings.
# best_params_ holds exactly the four searched keys (n_estimators, max_depth,
# min_samples_split, min_samples_leaf), so it can be unpacked directly.
best_rf = RandomForestClassifier(**grid_search.best_params_)

best_rf.fit(X_train, y_train)


In [None]:
# Fit the forest on the training split and predict the test split.
best_rf.fit(X_train, y_train)
prediction_test = best_rf.predict(X=X_test)
# Training Accuracy Of Random Forest
print("Training Accuracy : ", best_rf.score(X_train, y_train))
# Test Accuracy Of Random Forest
print("Testing Accuracy : ", best_rf.score(X_test, y_test))


---
**Using grid search, we got the result of 100% for training and testing.**

---

In [None]:

# Define the hyperparameters and their distributions. scipy's randint(low,
# high) draws integers from low up to but not including high; max_depth is
# sampled from the explicit list.
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5)
}

# Create a Random Forest classifier
rf = RandomForestClassifier()

# Perform randomized search: 50 sampled settings, each scored with 5-fold
# cross-validation, using all available cores (n_jobs=-1).
n_iter_search = 50
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=n_iter_search, cv=5, n_jobs=-1)

# Fit the randomized search to the data
random_search.fit(X_train, y_train)

# Print out the best hyperparameters and the corresponding mean cross-validation score
print("Best hyperparameters: ", random_search.best_params_)
print("Best cross-validation score: {:.2f}".format(random_search.best_score_))




In [None]:
# Build a fresh Random Forest from the winning randomized-search settings.
# best_params_ holds exactly the four sampled keys (n_estimators, max_depth,
# min_samples_split, min_samples_leaf), so it can be unpacked directly.
best_rf = RandomForestClassifier(**random_search.best_params_)

best_rf.fit(X_train, y_train)
prediction_test = best_rf.predict(X=X_test)
# Accuracy of the forest on the training split
train_accuracy = best_rf.score(X_train, y_train)
print("Training Accuracy : ", train_accuracy)
# Accuracy of the forest on the held-out test split
test_accuracy = best_rf.score(X_test, y_test)
print("Testing Accuracy : ", test_accuracy)



---
**Using Randomized search, we also got the result of 100% for training and testing.**

---

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 300, num = 10)]
# Number of features to consider at every split.
# 'auto' is intentionally not listed: for RandomForestClassifier it was only
# an alias of 'sqrt', and it is rejected by scikit-learn releases that removed
# the deprecated value.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 100,5)]
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5, 7, 9, 10, 11, 14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 7, 8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,'min_samples_leaf': min_samples_leaf,
'criterion':['entropy','gini']
}
print(random_grid)

In [None]:
# Randomized search (100 sampled settings, 10-fold CV, fixed seed) over the
# random_grid built in the previous cell. The search must sample from
# random_grid; passing param_dist would silently repeat the earlier search
# space and leave random_grid unused.
rand_forest = RandomForestClassifier()
rand_forest_randomcv = RandomizedSearchCV(estimator=rand_forest, param_distributions=random_grid, n_iter=100, cv=10, verbose=2, random_state=100, n_jobs=-1)
# fit the randomized model
rand_forest_randomcv.fit(X_train,y_train)

In [None]:
# Display the best hyperparameter setting found by the randomized search.
rand_forest_randomcv.best_params_

In [None]:
# Take the best estimator found by the randomized search (already refitted on
# the training split by the search) and evaluate it on the test split.
# A pasted copy of an estimator repr that used to sit here was removed: it
# only constructed a throw-away RandomForestClassifier and had no effect.
best_random_grid = rand_forest_randomcv.best_estimator_
y_pred=best_random_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: \n{}".format(classification_report(y_test,y_pred)))

In [None]:
from itertools import product

# Manual sweep over forest size, max_features and max_depth.
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200, 300, 500]
# 'auto' is intentionally not listed: for RandomForestClassifier it was only
# an alias of 'sqrt' (so it duplicated those runs), and it is rejected by
# scikit-learn releases that removed the deprecated value.
max_features = ['sqrt', 'log2']
max_depths = [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15]

# One-vs-rest ROC AUC of every fitted forest, in iteration order.
train_results = []
test_results = []

# to iterate through all possible combinations
for feature, depth in product(max_features, max_depths):
    for estimator in n_estimators:
        # Fixed random_state keeps each configuration reproducible.
        tunned_forest = RandomForestClassifier(n_estimators=estimator,
                                                criterion='entropy',
                                                max_features=feature,
                                                max_depth=depth,
                                                n_jobs=1,
                                                random_state=30)

        tunned_forest.fit(X_train, y_train)
        prediction_train = tunned_forest.predict(X=X_train)
        roc_auc_train = roc_auc_score(y_train, tunned_forest.predict_proba(X_train), multi_class='ovr')
        train_results.append(roc_auc_train)

        prediction_test = tunned_forest.predict(X=X_test)
        roc_auc_test = roc_auc_score(y_test, tunned_forest.predict_proba(X_test), multi_class='ovr')
        test_results.append(roc_auc_test)

        # Checking classification accuracy of each forest
        print('For n_estimators : ', estimator)
        print('Classification accuracy on Train set with max_features = {} and max_depth = {}: Accuracy: = {}'
              .format(feature, depth, accuracy_score(y_train, prediction_train)))

        print('Classification accuracy on test set with max_features = {} and max_depth = {}: Accuracy: = {}'
              .format(feature, depth, accuracy_score(y_test, prediction_test)))
        print()

        # Generating confusion matrix (counts and row-normalised) for the test split
        c_matrix = confusion_matrix(y_test, prediction_test)
        c_matrix_norm = c_matrix / c_matrix.sum(axis=1)[:, np.newaxis]


In [None]:

# Final Random Forest: 500 entropy-criterion trees, log2 feature sampling and
# depth capped at 15, fitted on the training split.
rand_forest = RandomForestClassifier(n_estimators=500, 
                                     criterion='entropy',
                                     max_features='log2',
                                     max_depth=15)
rand_forest.fit(X_train, y_train)
# Predictions for both splits; later cells reuse them for the classification
# report and the confusion matrices.
prediction_test = rand_forest.predict(X_test)
prediction_train = rand_forest.predict(X_train)

# Training Accuracy Of Random Forest
print("Training Accuracy : ", rand_forest.score(X_train, y_train))

# Test Accuracy Of Random Forest
print("Testing Accuracy : ", rand_forest.score(X_test, y_test))

In [None]:
# Classification report of the forest's predictions on the test split.
print(classification_report(y_test, prediction_test))

---
**We got no change in the result by using a different approach in the Randomized search! Now we will check the Importance of features and Visualize the bar graph.**

---


In [None]:
# Feature importance scores of the fitted forest, labelled with the training
# column names and sorted from most to least important.
feature_scores = pd.Series(rand_forest.feature_importances_, index=X_train.columns).sort_values(ascending=False)
feature_scores

In [None]:
# Horizontal seaborn bar plot: importance score on the x-axis, feature name
# on the y-axis.
sns.barplot(x=feature_scores, y=feature_scores.index)
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.show()



---


**Defining a function for generating a graph for the confusion matrix for training and testing accuracy**

---



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Plot a confusion matrix as an annotated image on the current figure.

    Parameters
    ----------
    cm : array-like
        Confusion matrix; rows are true labels, columns are predicted labels.
        May hold raw counts or already-normalised fractions.
    classes : sequence
        Tick labels for both axes, in the same order as the matrix rows.
    normalize : bool
        When True, every row of `cm` is divided by its row sum before plotting.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colour map passed to `plt.imshow`.
    """
    cm = np.asarray(cm)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # Choose the cell format from the data itself: the 'd' format code raises
    # ValueError for floats, so it is used only for integer matrices. This
    # keeps the function working when a caller passes a matrix that has
    # already been normalised while leaving normalize=False.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


In [None]:
# Confusion matrix (raw counts) of the forest's predictions on the training split.
c_matrix_rand = confusion_matrix(y_train, prediction_train)
plt.figure()
# Pass the counts and let the plotting helper normalise them. Handing it a
# pre-normalised float matrix with normalize=False makes the helper format
# floats with the integer 'd' code, which raises ValueError.
plot_confusion_matrix(c_matrix_rand, classes=rand_forest.classes_, normalize=True, title='Classification accuracy on Train set')

In [None]:
# Display the raw confusion-matrix counts computed in the previous cell.
c_matrix_rand

In [None]:
# Confusion matrix (raw counts) of the forest's predictions on the TEST split.
# This plot is titled "Test set", so it must be built from y_test and
# prediction_test rather than from the training labels and predictions.
c_matrix_rand = confusion_matrix(y_test, prediction_test)
plt.figure()
# Pass the counts and let the plotting helper normalise them. Handing it a
# pre-normalised float matrix with normalize=False makes the helper format
# floats with the integer 'd' code, which raises ValueError.
plot_confusion_matrix(c_matrix_rand, classes=rand_forest.classes_, normalize=True, title='Classification accuracy on Test set')



In [None]:
# Display the raw confusion-matrix counts computed in the previous cell.
c_matrix_rand

---
---

# ***KNN*** 

---
---
Now, we will run the KNN classifier in the dataset. For different values of k, we want to find the difference between the training and testing accuracy. Later, we will apply it to the scale of instances using MinMaxScaler, StandardScaler, MaxAbsScaler, and RobustScaler. With the scaled value, we want to find the difference between the training and testing accuracy. 



---


**Exploring the dataset. Dividing it into training and testing part.**

---



In [None]:
# Summary statistics (DataFrame.describe) for every column of the filtered dataset.
new_df.describe()

In [None]:
# Train/test split for KNN: 70% for training and 30% for testing.
# .values converts the frame and the series to plain NumPy arrays, so the
# splits below are arrays rather than DataFrames.
X = new_df.drop(columns = 'Level').values# Input features (attributes)
y = new_df['Level'].values # Target vector
print('X shape: {}'.format(np.shape(X)))
print('y shape: {}'.format(np.shape(y)))
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size=0.3, random_state=0)



---

**We will apply it to the model. We will define k values from 1 to 700 neighbors.**

---




In [None]:
# Sweep k from 1 to 700 on the unscaled features and record the accuracy of
# a brute-force Euclidean KNN on both splits.
numNeighbors = range(1,701)
trainAcc = []
testAcc = []
for k in numNeighbors:
  knn = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2, algorithm ='brute')
  knn.fit(X_train, y_train)
  y_predTrain = knn.predict(X_train)
  y_predTest = knn.predict(X_test)
  trainAcc.append(accuracy_score(y_train, y_predTrain))
  testAcc.append(accuracy_score(y_test, y_predTest))

# Accuracy curves: training in red circles, test in blue triangles.
plt.plot(numNeighbors, trainAcc, 'ro-', numNeighbors, testAcc, 'bv-')
plt.legend(['Training Accuracy','Test Accuracy'])
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')

# One line per k with both accuracies and their absolute gap in percent.
for neighbors, train_score, test_score in zip(numNeighbors, trainAcc, testAcc):
  print("K = ", neighbors, ", Training Accuracy = ", train_score, " Test Accuracy = ",
        test_score, " Difference = ", np.abs(train_score-test_score)*100, "%")
  #8



---

**For k = 8, training accuracy of 99.71% and testing accuracy of 99.66% had a difference of 0.047% approximately. We have ignored the k value of 1 & 2 since these can be noisy.**

---





---
**We will apply StandardScaler to scale data and train it to KNN classifier**

---




In [None]:

# Standardize the features using StandardScaler. The scaler is fitted on the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a KNN classifier on the scaled data (n_neighbors is left at the
# library default here; the next cell sweeps k explicitly).
knn = KNeighborsClassifier(metric='minkowski', p=2)
knn.fit(X_train_scaled, y_train)

In [None]:
# Sweep k from 1 to 700 on the scaled features and record the accuracy of a
# brute-force Euclidean KNN on both splits.
numNeighbors = range(1,701)
trainAcc = []
testAcc = []
for k in numNeighbors:
  knn = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2, algorithm ='brute')
  knn.fit(X_train_scaled, y_train)
  y_predTrain = knn.predict(X_train_scaled)
  y_predTest = knn.predict(X_test_scaled)
  trainAcc.append(accuracy_score(y_train, y_predTrain))
  testAcc.append(accuracy_score(y_test, y_predTest))

# Accuracy curves: training in red circles, test in blue triangles.
plt.plot(numNeighbors, trainAcc, 'ro-', numNeighbors, testAcc, 'bv-')
plt.legend(['Training Accuracy','Test Accuracy'])
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')

# One line per k with both accuracies and their absolute gap in percent.
for neighbors, train_score, test_score in zip(numNeighbors, trainAcc, testAcc):
  print("K = ", neighbors, ", Training Accuracy = ", train_score, " Test Accuracy = ",
        test_score, " Difference = ", np.abs(train_score-test_score)*100, "%")



---


**For k = 9, training accuracy of 99.42% and testing accuracy of 98.00% had a difference of 1.42% approximately. We have ignored the k value of (1-8) since these can be noisy.**

---





---
**Now, We will apply MinMaxScaler to scale data and train it to KNN classifier**

---




In [None]:
# Rescale the features with MinMaxScaler. The scaler is fitted on the
# training split only and then applied unchanged to the test split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a KNN classifier on the scaled data (n_neighbors is left at the
# library default here; the next cell sweeps k explicitly).
knn = KNeighborsClassifier(metric='minkowski', p=2)
knn.fit(X_train_scaled, y_train)

In [None]:
# Sweep k from 1 to 700 on the scaled features and record the accuracy of a
# brute-force Euclidean KNN on both splits.
numNeighbors = range(1,701)
trainAcc = []
testAcc = []
for k in numNeighbors:
  knn = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2, algorithm ='brute')
  knn.fit(X_train_scaled, y_train)
  y_predTrain = knn.predict(X_train_scaled)
  y_predTest = knn.predict(X_test_scaled)
  trainAcc.append(accuracy_score(y_train, y_predTrain))
  testAcc.append(accuracy_score(y_test, y_predTest))

# Accuracy curves: training in red circles, test in blue triangles.
plt.plot(numNeighbors, trainAcc, 'ro-', numNeighbors, testAcc, 'bv-')
plt.legend(['Training Accuracy','Test Accuracy'])
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')

# One line per k with both accuracies and their absolute gap in percent.
for neighbors, train_score, test_score in zip(numNeighbors, trainAcc, testAcc):
  print("K = ", neighbors, ", Training Accuracy = ", train_score, " Test Accuracy = ",
        test_score, " Difference = ", np.abs(train_score-test_score)*100, "%")



---


**We got the same result as with StandardScaler. For k = 3, the model seems overfitted.**

---



---



**After that, we will apply MaxAbsScaler to scale the data and train the KNN classifier on it.**

---



In [None]:
# Rescale the features with MaxAbsScaler. The scaler is fitted on the
# training split only and then applied unchanged to the test split.
scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a KNN classifier on the scaled data (n_neighbors is left at the
# library default here; the next cell sweeps k explicitly).
knn = KNeighborsClassifier(metric='minkowski', p=2)
knn.fit(X_train_scaled, y_train)

In [None]:
# Sweep k from 1 to 700 on the scaled features and record the accuracy of a
# brute-force Euclidean KNN on both splits.
numNeighbors = range(1,701)
trainAcc = []
testAcc = []
for k in numNeighbors:
  knn = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2, algorithm ='brute')
  knn.fit(X_train_scaled, y_train)
  y_predTrain = knn.predict(X_train_scaled)
  y_predTest = knn.predict(X_test_scaled)
  trainAcc.append(accuracy_score(y_train, y_predTrain))
  testAcc.append(accuracy_score(y_test, y_predTest))

# Accuracy curves: training in red circles, test in blue triangles.
plt.plot(numNeighbors, trainAcc, 'ro-', numNeighbors, testAcc, 'bv-')
plt.legend(['Training Accuracy','Test Accuracy'])
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')

# One line per k with both accuracies and their absolute gap in percent.
for neighbors, train_score, test_score in zip(numNeighbors, trainAcc, testAcc):
  print("K = ", neighbors, ", Training Accuracy = ", train_score, " Test Accuracy = ",
        test_score, " Difference = ", np.abs(train_score-test_score)*100, "%")



---


**No change for the MinMaxScaler!**

---



---



**Lastly, We will apply RobustScaler to scale data and train it to KNN classifier**

---



In [None]:
# Rescale the features with RobustScaler. The scaler is fitted on the
# training split only and then applied unchanged to the test split.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a KNN classifier on the scaled data (n_neighbors is left at the
# library default here; the next cell sweeps k explicitly).
knn = KNeighborsClassifier(metric='minkowski', p=2)
knn.fit(X_train_scaled, y_train)

In [None]:
# Sweep k from 1 to 700 on the scaled features and record the accuracy of a
# brute-force Euclidean KNN on both splits.
numNeighbors = range(1,701)
trainAcc = []
testAcc = []
for k in numNeighbors:
  knn = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2, algorithm ='brute')
  knn.fit(X_train_scaled, y_train)
  y_predTrain = knn.predict(X_train_scaled)
  y_predTest = knn.predict(X_test_scaled)
  trainAcc.append(accuracy_score(y_train, y_predTrain))
  testAcc.append(accuracy_score(y_test, y_predTest))

# Accuracy curves: training in red circles, test in blue triangles.
plt.plot(numNeighbors, trainAcc, 'ro-', numNeighbors, testAcc, 'bv-')
plt.legend(['Training Accuracy','Test Accuracy'])
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')

# One line per k with both accuracies and their absolute gap in percent.
for neighbors, train_score, test_score in zip(numNeighbors, trainAcc, testAcc):
  print("K = ", neighbors, ", Training Accuracy = ", train_score, " Test Accuracy = ",
        test_score, " Difference = ", np.abs(train_score-test_score)*100, "%")



---


**Same result!**

---



---
---

# ***Naive Bayes*** 

---
---

---
---

# ***Support Vector Machine*** 

---
---