#  AI FINAL PROJECT PHASE 3




## Data Pre-Processing ##





### Importing the Necessary Libraries ###

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Mounting Google Drive ###

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Reading Data from CSV ###

In [None]:
data = pd.read_csv('/content/NATOPS_sid20-C_TRAIN_su.csv')

In [None]:
# Checking if the Data is successfully Read
data.head()


Unnamed: 0,isTest,fea2,fea3,fea4,fea5,fea6,fea7,fea8,fea9,fea10,...,fea14,fea15,fea16,fea17,fea18,fea19,fea20,fea21,sid,class
0,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.0,0.0,0.137255,0.0,0.0,0.0,1,3
1,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.0,0.0,0.137255,0.0,0.0,0.0,1,3
2,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.0,0.0,0.137255,0.0,0.0,0.0,1,3
3,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.0,0.0,0.137255,0.0,0.0,0.0,1,3
4,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.0,0.0,0.137255,0.0,0.0,0.0,1,3


### Checking If there are any Null values in the Data feeded ###

In [None]:
missing_values = data.isna().sum()
print("Number of Values missing per column:\n", missing_values)

Number of Values missing per column:
 isTest    0
fea2      0
fea3      0
fea4      0
fea5      0
fea6      0
fea7      0
fea8      0
fea9      0
fea10     0
fea11     0
fea12     0
fea13     0
fea14     0
fea15     0
fea16     0
fea17     0
fea18     0
fea19     0
fea20     0
fea21     0
sid       0
class     0
dtype: int64


### Extracting the Required Data ###

In [None]:
# Extracting the Train Data and Test Data

#Train data is extracted,The value of is test coloumn is 0
train_data = data.query('isTest == 0')

#Test data is extracted,The value of is test coloumn is 0
test_data = data.query('isTest == 1')


In [None]:
# Separating features and target variable
X_train = train_data.drop(['isTest', 'sid', 'class'], axis=1)
y_train = train_data['class']
X_test = test_data.drop(['isTest', 'sid', 'class'], axis=1)
y_test = test_data['class']

### Performing Standardization ###

In [None]:
#Scaling the values using Standard Scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)
# The fit_transform method computes the mean and standard deviation of each feature in X_train
# and scales each feature by subtracting its mean and dividing by its standard deviation.
# This transforms X_train into a standardized version.

# Transform the test data using the same scaler
X_test_scaled = scaler.transform(X_test)
# The transform method applies the same mean and standard deviation obtained from the training data
# to scale the test data. This ensures that the scaling is consistent between training and test datasets.


### Checking the Shape Of the Processed Data ####

In [None]:
# Print the shape (dimensions) of the scaled training features
print("Training Features Shape:", X_train_scaled.shape)

# Print the shape (dimensions) of the scaled test features
print("Test Features Shape:", X_test_scaled.shape)

# Print the shape (dimensions) of the training labels (target variable)
print("Training Labels Shape:", y_train.shape)

# Print the shape (dimensions) of the test labels (target variable)
print("Test Labels Shape:", y_test.shape)


Training Features Shape: (9180, 20)
Test Features Shape: (9180, 20)
Training Labels Shape: (9180,)
Test Labels Shape: (9180,)


## Design Algorithm ##

### Importing the Necessary Libraries ###

In [None]:
# Import the KNeighborsClassifier class from scikit-learn, which is used for k-nearest neighbors classification
from sklearn.neighbors import KNeighborsClassifier

# Import the LogisticRegression class from scikit-learn, which is used for logistic regression classification
from sklearn.linear_model import LogisticRegression

# Import the classification_report and accuracy_score functions from scikit-learn metrics module
# These functions are used for evaluating the performance of classification models
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Import the MLPClassifier from scikit-learn for neural network modeling.
from sklearn.neural_network import MLPClassifier


# Import RandomForestClassifier from scikit-learn for ensemble learning using random forests.
from sklearn.ensemble import RandomForestClassifier



### Creating a KNN, Logistic ,MLP ,Random Forest Classifiers###

### Creating a KNN Classifiers ###

In [None]:
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors=3)

# Define Stratified K-Fold cross-validator
strat_kfold = StratifiedKFold(n_splits=5,shuffle=True, random_state=45)  # 5 folds

### Creating a Logistic Regression Classifier ###

In [None]:
# Create a Logistic Regression classifier
log_reg = LogisticRegression(max_iter=1000, solver='saga')

# Define Stratified K-Fold cross-validator
strat_kfold1 = StratifiedKFold(n_splits=5, shuffle=True, random_state=45)  # 5 folds


### Creating a MLP Classifier ###

In [None]:
# Create an MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100,50), max_iter=1000,random_state=50)

# Define Stratified K-Fold cross-validator
strat_kfold2 = StratifiedKFold(n_splits=5,shuffle=True, random_state=45)  # 5 folds



### Creating a Random Forest Classifier ###

In [None]:
# Create the Random Forest classifier
random_forest = RandomForestClassifier(random_state=42)

# Define Stratified K-Fold cross-validator
strat_kfold3 = StratifiedKFold(n_splits=5,shuffle=True, random_state=45)  # 5 folds

### Training the Classifiers ###

In [None]:
# Train the k-nearest neighbors classifier (knn) using the scaled training data
knn.fit(X_train_scaled, y_train)

# Perform Stratified K-Fold Cross-Validation
cross_val_scores = cross_val_score(knn, X_train_scaled, y_train, cv=strat_kfold)

# Train the logistic regression classifier (log_reg) using the scaled training data
log_reg.fit(X_train_scaled, y_train)

# Perform Stratified K-Fold Cross-Validation for Logistic Regression
cross_val_scores_log_reg = cross_val_score(log_reg, X_train_scaled, y_train, cv=strat_kfold1)

# Train the MLP classifier  using the scaled training data
mlp.fit(X_train_scaled, y_train)

# Perform Stratified K-Fold Cross-Validation
cross_val_scores_MLP = cross_val_score(mlp, X_train_scaled, y_train, cv=strat_kfold2)

# Train the classifier on the scaled training data
random_forest.fit(X_train_scaled, y_train)

# Perform Stratified K-Fold Cross-Validation
cross_val_scores_RFC = cross_val_score(random_forest, X_train_scaled, y_train, cv=strat_kfold3)


### Predicting the Trained Data ###

In [None]:
# Use the trained k-nearest neighbors model to make predictions on the scaled training data
train_predictions = knn.predict(X_train_scaled)

# Use the trained logistic regression model to make predictions on the scaled training data
train_predictions_log_reg = log_reg.predict(X_train_scaled)

# Use the trained MLP model to make predictions on the scaled training data
mlp_predictions = mlp.predict(X_train_scaled)

# Use the trained Random Forest model to make predictions on the scaled training data
rf_predictions = random_forest.predict(X_train_scaled)

### Evaluation of the Classifier ###

In [None]:
# Compute the accuracy of the k-nearest neighbors classifier on the training data
accuracy = accuracy_score(y_train, train_predictions)

# Generate a classification report for the k-nearest neighbors classifier on the training data
classification_report_result = classification_report(y_train, train_predictions)

# Compute the accuracy of the logistic regression classifier on the training data
accuracy_log_reg = accuracy_score(y_train, train_predictions_log_reg)

# Generate a classification report for the logistic regression classifier on the training data
classification_report_log_reg = classification_report(y_train, train_predictions_log_reg)

# Generate the accuracy of the MLP classifier on the Training data
accuracy_mlp = accuracy_score(y_train, mlp_predictions)

#Generate a classfication report for the MLP Classider on the training data
classification_report_mlp = classification_report(y_train, mlp_predictions)

# Generate the accuracy of the Random Forest classifier on the Training data
accuracy_rfc=accuracy_score(y_train,rf_predictions)

#Generate a classfication report for the Random Forest Classider on the training data
classification_report_rfc=classification_report(y_train,rf_predictions)


### Model Performance on Training data ###

In [None]:
# Output the results of the cross-validation
print("K-Nearest Neighbors\n")

# Print the training accuracy of the K-Nearest Neighbors classifier
print("Training Accuracy :", accuracy)

# Print the classification report for the K-Nearest Neighbors classifier
print("\nClassification Report:\n", classification_report_result)

print(f"\nAverage Stratified K-Fold Cross-Validation Scores: {cross_val_scores.mean()} \n")

print("Logistic Regression :\n")

# Print the training accuracy of the Logistic Regression classifier
print("Training Accuracy :", accuracy_log_reg)

# Print the classification report for the Logistic Regression classifier
print(f"\nClassification Report :\n {classification_report_log_reg} \n")

print("Average Stratified Cross-Validation Score (Logistic Regression):", cross_val_scores_log_reg.mean())

print("\nMLP CLassifier :\n")

#Print the Training accuracy of MLP Classifier
print("Training Accuracy:", accuracy_mlp)

#Print the Classification report for the MLP CLassifier
print("\nClassification Report:\n", classification_report_mlp)

print(f"\nAverage Stratified K-Fold Cross-Validation Scores: {cross_val_scores_MLP.mean()} \n")

print("\nRandom Forest Classifier : \n")

#Print the Training accuracy of Random Forest Classifier
print("Training Accuracy:", accuracy_rfc)

#Print the Classification report for the MLP CLassifier
print("\nClassification Report:\n", classification_report_rfc)

print(f"\nAverage Stratified K-Fold Cross-Validation Scores: {cross_val_scores_RFC.mean()} \n")


K-Nearest Neighbors

Training Accuracy : 0.9888888888888889

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1530
           1       1.00      0.93      0.97      1530
           2       0.97      1.00      0.98      1530
           3       1.00      1.00      1.00      1530
           4       1.00      1.00      1.00      1530
           5       1.00      1.00      1.00      1530

    accuracy                           0.99      9180
   macro avg       0.99      0.99      0.99      9180
weighted avg       0.99      0.99      0.99      9180


Average Stratified K-Fold Cross-Validation Scores: 0.9888888888888889 

Logistic Regression :

Training Accuracy : 0.7611111111111111

Classification Report :
               precision    recall  f1-score   support

           0       0.93      0.90      0.92      1530
           1       0.54      0.63      0.58      1530
           2       0.62      0.53      0.57     

## Model evaluation ##

### Importing the Necessary Libraries ###

In [None]:
# Import the accuracy_score function from scikit-learn metrics module
from sklearn.metrics import accuracy_score

# Import the classification_report function from scikit-learn metrics module
from sklearn.metrics import classification_report

# Import the confusion_matrix function from scikit-learn metrics module
from sklearn.metrics import confusion_matrix


from sklearn.neighbors import KNeighborsClassifier

from sklearn.neural_network import MLPClassifier


### Make predictions and Evaluation of Classifiers on the Test Data ###

In [None]:
# Use the trained KNN model to make predictions on the scaled test data
knn_predictions = knn.predict(X_test_scaled)

# Compute the accuracy of the KNN classifier on the test data
accuracy_knn = accuracy_score(y_test, knn_predictions)

# Generate a classification report for the KNN classifier on the test data
classification_report_knn = classification_report(y_test, knn_predictions)

# Calculate the confusion matrix for the KNN classifier's predictions on the test data
confusion_matrix_knn = confusion_matrix(y_test, knn_predictions)


In [None]:
# Use the trained Logistic Regression model to make predictions on the scaled test data
log_reg_predictions = log_reg.predict(X_test_scaled)

# Compute the accuracy of the Logistic Regression classifier on the test data
accuracy_log_reg = accuracy_score(y_test, log_reg_predictions)

# Generate a classification report for the Logistic Regression classifier on the test data
classification_report_log_reg = classification_report(y_test, log_reg_predictions)

# Calculate the confusion matrix for the Logistic Regression classifier's predictions on the test data
confusion_matrix_log_reg = confusion_matrix(y_test, log_reg_predictions)


In [None]:
#Use the Trained MLP model to make predictions on the scaled test data
mlp_predictions_test=mlp.predict(X_test_scaled)

#Compute the accuracy of the MLP classifier on the test data
accuracy_mlp_test = accuracy_score(y_test, mlp_predictions_test)

# Generate a classification report for the MLP classifier on the test data
classification_report_mlp_test = classification_report(y_test, mlp_predictions_test)

# Calculate the confusion matrix for the MLP classifier's predictions on the test data
confusion_matrix_mlp_test = confusion_matrix(y_test, mlp_predictions_test)


In [None]:
#Use the Trained RFC model to make predictions on the scaled test data
rfc_predictions_test=random_forest.predict(X_test_scaled)

#Compute the accuracy of the RFC classifier on the test data
accuracy_rfc_test = accuracy_score(y_test, rfc_predictions_test)

# Generate a classification report for the RFC classifier on the test data
classification_report_rfc_test = classification_report(y_test, rfc_predictions_test)

# Calculate the confusion matrix for the RFC classifier's predictions on the test data
confusion_matrix_rfc_test = confusion_matrix(y_test, rfc_predictions_test)

### Model Performance on Test Data ###

In [None]:
# Print the test accuracy of the KNN classifier
print("KNN Test Accuracy:", accuracy_knn)

# Print the classification report for the KNN classifier on the test data
print("\nKNN Classification Report:\n", classification_report_knn)

# Print the confusion matrix for the KNN classifier's predictions on the test data
print("\nKNN Confusion Matrix:\n", confusion_matrix_knn)


KNN Test Accuracy: 0.6

KNN Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.63      0.72      1530
           1       0.39      0.47      0.42      1530
           2       0.41      0.47      0.44      1530
           3       0.59      0.53      0.56      1530
           4       0.53      0.57      0.55      1530
           5       1.00      0.93      0.97      1530

    accuracy                           0.60      9180
   macro avg       0.63      0.60      0.61      9180
weighted avg       0.63      0.60      0.61      9180


KNN Confusion Matrix:
 [[ 969  408  153    0    0    0]
 [ 102  714  714    0    0    0]
 [ 102  714  714    0    0    0]
 [   0    0   51  816  663    0]
 [   0    0  102  561  867    0]
 [   0    0    0    0  102 1428]]


In [None]:
# Print the test accuracy of the Logistic Regression classifier
print("\nLogistic Regression Test Accuracy:", accuracy_log_reg)

# Print the classification report for the Logistic Regression classifier on the test data
print("\nLogistic Regression Classification Report:\n", classification_report_log_reg)

# Print the confusion matrix for the Logistic Regression classifier's predictions on the test data
print("\nLogistic Regression Confusion Matrix:\n", confusion_matrix_log_reg)



Logistic Regression Test Accuracy: 0.6666666666666666

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.80      0.84      1530
           1       0.47      0.63      0.54      1530
           2       0.48      0.33      0.39      1530
           3       0.57      0.67      0.62      1530
           4       0.63      0.57      0.60      1530
           5       1.00      1.00      1.00      1530

    accuracy                           0.67      9180
   macro avg       0.67      0.67      0.66      9180
weighted avg       0.67      0.67      0.66      9180


Logistic Regression Confusion Matrix:
 [[1224  204  102    0    0    0]
 [  51  969  459   51    0    0]
 [ 102  867  510   51    0    0]
 [   0    0    0 1020  510    0]
 [   0    0    0  663  867    0]
 [   0    0    0    0    0 1530]]


In [None]:
# Print the test accuracy of the MLP classifier
print("\nMLP Test Accuracy:", accuracy_mlp_test)

# Print the classification report for the MLP classifier on the test data
print("\nMLP Classification Report:\n", classification_report_mlp_test)

# Print the confusion matrix for the MLP classifier's predictions on the test data
print("\nMLP Confusion Matrix:\n", confusion_matrix_mlp_test)



MLP Test Accuracy: 0.6444444444444445

MLP Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.50      0.64      1530
           1       0.43      0.63      0.51      1530
           2       0.52      0.47      0.49      1530
           3       0.66      0.63      0.64      1530
           4       0.62      0.67      0.65      1530
           5       0.94      0.97      0.95      1530

    accuracy                           0.64      9180
   macro avg       0.67      0.64      0.65      9180
weighted avg       0.67      0.64      0.65      9180


MLP Confusion Matrix:
 [[ 765  561  102    0  102    0]
 [  51  969  510    0    0    0]
 [  51  663  714    0    0  102]
 [   0   51    0  969  510    0]
 [   0    0    0  510 1020    0]
 [   0    0   51    0    0 1479]]


In [None]:
# Print the test accuracy of the Random Forest classifier
print("\nRandom Forest Test Accuracy:", accuracy_rfc_test)

# Print the classification report for the Random Forest classifier on the test data
print("\nRandom Forest Classification Report:\n", classification_report_rfc_test)

# Print the confusion matrix for the Random Forest classifier's predictions on the test data
print("\nRFC Confusion Matrix:\n", confusion_matrix_rfc_test)


Random Forest Test Accuracy: 0.6555555555555556

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.80      0.80      1530
           1       0.42      0.50      0.45      1530
           2       0.50      0.40      0.44      1530
           3       0.62      0.60      0.61      1530
           4       0.61      0.63      0.62      1530
           5       1.00      1.00      1.00      1530

    accuracy                           0.66      9180
   macro avg       0.66      0.66      0.66      9180
weighted avg       0.66      0.66      0.66      9180


RFC Confusion Matrix:
 [[1224  255   51    0    0    0]
 [ 204  765  561    0    0    0]
 [ 102  816  612    0    0    0]
 [   0    0    0  918  612    0]
 [   0    0    0  561  969    0]
 [   0    0    0    0    0 1530]]


## **Final Analysis** ##

Based on the provided test accuracy, classification reports, and confusion matrices for the K-Nearest Neighbors (KNN), Logistic Regression, Multi-Layer Perceptron (MLP), and Random Forest Classifier (RFC) models, here's a detailed analysis and comparison of their performances:

 **K-Nearest Neighbors (KNN)**

- **Test Accuracy:** 60%
- **Precision & Recall:**
  - High precision (100%) for class 5 but low for classes 1 and 2 (around 39-41%).
  - High recall for class 5 (93%) but lower for class 0 (63%).
- **F1-Score:**
  - Excellent for class 5 (0.97), indicating strong performance.
  - Lower F1-scores for classes 1 and 2 (around 0.42-0.44).
- **Confusion Matrix Observations:**
  - Noticeable misclassification between classes 1 and 2.
  - Class 5 is very well identified with minimal errors.

**Logistic Regression**

- **Test Accuracy:** 66.67%
- **Precision & Recall:**
  - High precision for class 0 (89%) and class 5 (100%), but lower for classes 1 and 2.
  - High recall for class 5 (100%) and good for class 3 (67%).
- **F1-Score:**
  - Excellent for class 5 (1.00), demonstrating outstanding performance.
  - Lower scores for classes 1 and 2, indicating weaker performance.
- **Confusion Matrix Observations:**
  - Some confusion between classes 1 and 2.
  - Class 5 is perfectly classified, indicating clear feature distinction.

**Multi-Layer Perceptron (MLP)**

- **Test Accuracy:** 64.44%
- **Precision & Recall:**
  - High precision for class 5 (94%) but lower for class 1 (43%).
  - High recall for class 5 (97%), but lower for class 0 (50%).
- **F1-Score:**
  - Excellent for class 5 (0.95), showing very strong performance.
  - Lower for class 0 (0.64), indicating room for improvement.
- **Confusion Matrix Observations:**
  - Class 0 frequently misclassified as class 1.
  - Class 5 is accurately identified with very few errors.

**Random Forest Classifier (RFC)**

- **Test Accuracy:** 65.56%
- **Precision & Recall:**
  - Good precision for class 0 (80%) and excellent for class 5 (100%).
  - Recall is also strong for class 5 (100%), but moderate for others.
- **F1-Score:**
  - High for class 5 (1.00), indicating exceptional performance.
  - Moderate for other classes, with some variation.
- **Confusion Matrix Observations:**
  - Some misclassifications between classes 1, 2, and 3.
  - Class 5 is perfectly predicted.

**Overall Evaluation**

- **Best Overall Performer:** Logistic Regression stands out with the highest overall accuracy and a balanced performance across most classes, particularly excelling in class 5.
- **Class 5 Prediction:** All models perform exceptionally well in predicting class 5, suggesting distinct and clear features for this class.
- **Challenges with Classes 1 and 2:** All models show difficulties in accurately predicting classes 1 and 2, indicating possible overlapping features or insufficient differentiation.
- **Model Selection Considerations:**
  - If predicting class 5 accurately is a priority, Logistic Regression and MLP are strong choices.
  - For a balanced performance across various classes, Logistic Regression appears most consistent.
  - KNN, despite its lower overall accuracy, might still offer valuable insights, particularly in multi-model ensembles.

In conclusion, Logistic Regression seems to be the most suitable model considering its overall accuracy and balanced class performance. However, the choice should also be influenced by the specific requirements of your application, such as the importance of accurately predicting certain classes or the need for a balance across all classes.