In [111]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm
from scipy import stats
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [113]:
# Load the CSV file into a DataFrame
file_path = 'C:\\Users\\seanm\\Downloads\\letters.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
print(df.head())

   label  pixel43  pixel44  pixel92  pixel124  pixel125  pixel126  pixel127  \
0      1        0        0        0         0         0         0         0   
1      0        0        0        0       137       137       192        86   
2      1        0        0        0         3       141       139         3   
3      4        0        0        0         0         0         0         0   
4      0        0        0        0       155       254       254       254   

   pixel128  pixel129  ...  pixel329  pixel351  pixel410  pixel411  pixel412  \
0         0         0  ...         0       254         0         0         0   
1        72         1  ...       254         0         0        75       254   
2         0         0  ...         0       184         0         0         0   
3         0         0  ...         0         0        94       255        69   
4       157        30  ...       253         0         0         0       223   

   pixel413  pixel414  pixel415  pixel416  p

In [115]:
#Summary Statistics for dataset
# Get descriptive statistics
summary_stats = df.describe()
print(summary_stats)

# Get frequency count of a specific categorical column
frequency_count = df['label'].value_counts()
print(frequency_count)

# Get frequency count of a specific categorical column
frequency_count = df['pixel43'].value_counts()
print(frequency_count)

# Get the correlation matrix
correlation_matrix = df.corr()
print(correlation_matrix)

              label       pixel43       pixel44       pixel92      pixel124  \
count  42000.000000  42000.000000  42000.000000  42000.000000  42000.000000   
mean       4.456643      0.171357      0.164476      1.192833     28.043952   
std        2.887730      5.726352      5.515774     14.692403     70.505431   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        2.000000      0.000000      0.000000      0.000000      0.000000   
50%        4.000000      0.000000      0.000000      0.000000      0.000000   
75%        7.000000      0.000000      0.000000      0.000000      0.000000   
max        9.000000    255.000000    255.000000    255.000000    255.000000   

           pixel125      pixel126      pixel127      pixel128      pixel129  \
count  42000.000000  42000.000000  42000.000000  42000.000000  42000.000000   
mean      36.084976     42.713952     46.092310     44.542452     38.948524   
std       78.631145     84.390533     87.287033    

In [117]:
#Building training and testing sets for the dataset
from sklearn.model_selection import train_test_split

#'label' is your target variable
y = df['label']

#Use all other columns as features
x = df.drop('label', axis=1)

# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [119]:
# Check for missing values in each column
missing_values = df.isnull().sum()

# Display columns with missing values
print(missing_values[missing_values > 0])

Series([], dtype: int64)


In [121]:
#Checking for blank strings
blank_values = (df == '').sum()
print(blank_values[blank_values > 0])

Series([], dtype: int64)


In [123]:
#Checking for Non-Numeric Values
numeric_columns = df.select_dtypes(include=['number']).columns
non_numeric_values = df[numeric_columns].apply(lambda col: pd.to_numeric(col, errors='coerce').isnull().sum())
print(non_numeric_values[non_numeric_values > 0])

Series([], dtype: int64)


In [125]:
#Features before mean normalization
unscaled_features = x_train
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train_array = sc.fit_transform(x_train.values)
#Assign the scaled data to a DataFrame & use the index and columns arguments to keep your original indices and column names:
x_train = pd.DataFrame(x_train_array, index=x_train.index, columns=x_train.columns)
x_test_array = sc.transform(x_test.values)
x_test = pd.DataFrame(x_test_array, index=x_test.index, columns=x_test.columns)

In [143]:
#Part 1: Build KNN Model
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Build the KNN model
knn = KNeighborsClassifier(n_neighbors=5) 
knn.fit(x_train_scaled, y_train)

# Predict and evaluate
y_pred = knn.predict(x_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"KNN Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

KNN Accuracy: 0.6460
Confusion Matrix:
 [[762   2  18   4  11   8  27  17   5   7]
 [  0 921   2   3   7  11   4   4  14   5]
 [ 49  33 504  71  23   8  15  35  56  23]
 [ 12  24 123 446  22  53   6  29  95  24]
 [ 25  63  13  10 428  31  21 133  15  63]
 [  6  27  25 102  29 417  37  40  28  33]
 [ 28  15  16  11  15  15 716   0   5   0]
 [ 10   9  18  21  78  45   0 516  19 198]
 [ 19  80  81  99  27  38  23  19 377  26]
 [ 25  12   8  32  63  30   0 310  28 339]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.89      0.85       861
           1       0.78      0.95      0.85       971
           2       0.62      0.62      0.62       817
           3       0.56      0.53      0.55       834
           4       0.61      0.53      0.57       802
           5       0.64      0.56      0.60       744
           6       0.84      0.87      0.86       821
           7       0.47      0.56      0.51       914
           8      

In [129]:
#Part 2: Build neural networks
from sklearn.neural_network import MLPClassifier

#Initializing the multilayer perception
mlp = MLPClassifier(100,solver='sgd',learning_rate_init= 0.01, max_iter=10000)

mlp.fit(x_train, y_train)

mlp.score(x_test, y_test)

0.6942857142857143

In [130]:
from sklearn.neural_network import MLPClassifier

#Initializing the multilayer perception
mlp = MLPClassifier(100,solver='sgd',learning_rate_init= 0.01, max_iter=15000)

mlp.fit(x_train, y_train)

mlp.score(x_test, y_test)

0.6989285714285715

In [131]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the neural network model
mlp = MLPClassifier(hidden_layer_sizes=(100,), solver='sgd', learning_rate_init=0.01, max_iter=10000, random_state=42)

# Train the model
mlp.fit(x_train_scaled, y_train)

# Predict and evaluate
y_pred = mlp.predict(x_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Neural Network Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Neural Network Accuracy: 0.6963
Confusion Matrix:
 [[747   1  30   6  16  11  20  10  11   9]
 [  0 905   6   2  11   7   6   0  24  10]
 [ 42  16 512  84  16  12  10  31  66  28]
 [  4  11  72 514   9  55   8  26  95  40]
 [ 14  38   3   5 466  22  19 151  18  66]
 [  3  11   9  80  17 461  40  37  39  47]
 [ 18   9   5  10  17  14 742   0   6   0]
 [  4   6   8  13  19  50   1 616  17 180]
 [ 12  40  42  90  14  31  15  25 486  34]
 [ 15   8   6  15  13  31   0 330  29 400]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.87      0.87       861
           1       0.87      0.93      0.90       971
           2       0.74      0.63      0.68       817
           3       0.63      0.62      0.62       834
           4       0.78      0.58      0.67       802
           5       0.66      0.62      0.64       744
           6       0.86      0.90      0.88       821
           7       0.50      0.67      0.58       914
       

In [109]:
#Part 3: Compare and contrast both models.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Build and evaluate KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train_scaled, y_train)
y_pred_knn = knn.predict(x_test_scaled)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)
class_report_knn = classification_report(y_test, y_pred_knn)

# Build and evaluate Neural Network model
mlp = MLPClassifier(hidden_layer_sizes=(100,), solver='sgd', learning_rate_init=0.01, max_iter=10000, random_state=42)
mlp.fit(x_train_scaled, y_train)
y_pred_nn = mlp.predict(x_test_scaled)
accuracy_nn = accuracy_score(y_test, y_pred_nn)
conf_matrix_nn = confusion_matrix(y_test, y_pred_nn)
class_report_nn = classification_report(y_test, y_pred_nn)

# Print results for KNN
print("KNN Model Performance:")
print(f"Accuracy: {accuracy_knn:.4f}")
print("Confusion Matrix:\n", conf_matrix_knn)
print("Classification Report:\n", class_report_knn)

# Print results for Neural Network
print("\nNeural Network Model Performance:")
print(f"Accuracy: {accuracy_nn:.4f}")
print("Confusion Matrix:\n", conf_matrix_nn)
print("Classification Report:\n", class_report_nn)

# Compare both models
print("\nComparison of Models:")
print(f"KNN Accuracy: {accuracy_knn:.4f}")
print(f"Neural Network Accuracy: {accuracy_nn:.4f}")

KNN Model Performance:
Accuracy: 0.6460
Confusion Matrix:
 [[762   2  18   4  11   8  27  17   5   7]
 [  0 921   2   3   7  11   4   4  14   5]
 [ 49  33 504  71  23   8  15  35  56  23]
 [ 12  24 123 446  22  53   6  29  95  24]
 [ 25  63  13  10 428  31  21 133  15  63]
 [  6  27  25 102  29 417  37  40  28  33]
 [ 28  15  16  11  15  15 716   0   5   0]
 [ 10   9  18  21  78  45   0 516  19 198]
 [ 19  80  81  99  27  38  23  19 377  26]
 [ 25  12   8  32  63  30   0 310  28 339]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.89      0.85       861
           1       0.78      0.95      0.85       971
           2       0.62      0.62      0.62       817
           3       0.56      0.53      0.55       834
           4       0.61      0.53      0.57       802
           5       0.64      0.56      0.60       744
           6       0.84      0.87      0.86       821
           7       0.47      0.56      0.51       914

In [145]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# K-Nearest Neighbors Model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train_scaled, y_train)
knn_predictions = knn.predict(x_test_scaled)

# Neural Network Model
mlp = MLPClassifier(hidden_layer_sizes=(100,), solver='adam', learning_rate_init=0.001, max_iter=1000, random_state=42)
mlp.fit(x_train_scaled, y_train)
mlp_predictions = mlp.predict(x_test_scaled)

# Evaluation Metrics
def evaluate_model(y_true, y_pred, model_name):
    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred, average='weighted'):.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))

# Evaluate KNN
evaluate_model(y_test, knn_predictions, "KNN")

# Evaluate Neural Network
evaluate_model(y_test, mlp_predictions, "Neural Network")



KNN Performance:
Accuracy: 0.6460
Precision: 0.6398
Recall: 0.6460
F1 Score: 0.6399
Confusion Matrix:
 [[762   2  18   4  11   8  27  17   5   7]
 [  0 921   2   3   7  11   4   4  14   5]
 [ 49  33 504  71  23   8  15  35  56  23]
 [ 12  24 123 446  22  53   6  29  95  24]
 [ 25  63  13  10 428  31  21 133  15  63]
 [  6  27  25 102  29 417  37  40  28  33]
 [ 28  15  16  11  15  15 716   0   5   0]
 [ 10   9  18  21  78  45   0 516  19 198]
 [ 19  80  81  99  27  38  23  19 377  26]
 [ 25  12   8  32  63  30   0 310  28 339]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.89      0.85       861
           1       0.78      0.95      0.85       971
           2       0.62      0.62      0.62       817
           3       0.56      0.53      0.55       834
           4       0.61      0.53      0.57       802
           5       0.64      0.56      0.60       744
           6       0.84      0.87      0.86       821
        