# MNIST

In [1]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

mnist = datasets.load_digits()

X = pd.DataFrame(np.array(mnist.data))

y = pd.DataFrame(mnist.target)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

print("KNN Score: %0.4f"%model.score(X_test, y_test))

  return self._fit(X, y)


KNN Score: 0.9933


By applying .values.ravel() to y_train and y_test, you ensure that they are converted to 1-dimensional arrays, which should resolve the warning message.

In [2]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the MNIST dataset
mnist = datasets.load_digits()

# Convert data into Pandas DataFrames
X = pd.DataFrame(np.array(mnist.data))
y = pd.DataFrame(mnist.target)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Convert y_train and y_test to 1-dimensional arrays
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and train the K-Nearest Neighbors classifier
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# Print the accuracy score of the KNN model on the test set
print("KNN Score: %0.4f" % model.score(X_test, y_test))


KNN Score: 0.9933


In [3]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load the MNIST dataset
mnist = datasets.load_digits()

# Convert data into Pandas DataFrames
X = pd.DataFrame(np.array(mnist.data))
y = pd.DataFrame(mnist.target)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create an SVM classifier model
svm_model = SVC(kernel='linear', C=1.0)

# Train the SVM model
svm_model.fit(X_train, y_train.values.ravel())

# Evaluate the SVM model
svm_accuracy = svm_model.score(X_test, y_test)
print("SVM Accuracy: %.4f" % svm_accuracy)


SVM Accuracy: 0.9822


In this code, we use a Support Vector Machine (SVM) classifier instead of the k-nearest neighbors algorithm. SVMs are another popular choice for classification tasks. We use a linear kernel and set the regularization parameter C to 1.0. After training the SVM model, we evaluate its accuracy on the test data and print the result.You can experiment with different kernels and parameters to see how they affect the SVM's performance on the MNIST dataset.

In [4]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the MNIST dataset
mnist = datasets.load_digits()

# Convert data into Pandas DataFrames
X = pd.DataFrame(np.array(mnist.data))
y = pd.DataFrame(mnist.target)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create a Random Forest classifier model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest model
rf_model.fit(X_train, y_train.values.ravel())

# Evaluate the Random Forest model
rf_accuracy = rf_model.score(X_test, y_test)
print("Random Forest Accuracy: %.4f" % rf_accuracy)


Random Forest Accuracy: 0.9711


In this code, we're using a Random Forest classifier, which is an ensemble learning method based on decision trees. We specify the number of trees in the forest using the n_estimators parameter (set to 100 in this example). After training the Random Forest model, we evaluate its accuracy on the test data and print the result.

Random Forest classifiers often perform well on a variety of datasets, including MNIST, due to their ability to handle high-dimensional data and capture complex relationships between features.

In [5]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

# Load the MNIST dataset
mnist = datasets.load_digits()

# Convert data into Pandas DataFrames
X = pd.DataFrame(np.array(mnist.data))
y = pd.DataFrame(mnist.target)

# Normalize the features
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.25, random_state=42)

# Create a Multinomial Naive Bayes classifier model
nb_model = MultinomialNB()

# Train the Multinomial Naive Bayes model
nb_model.fit(X_train, y_train.values.ravel())

# Evaluate the Multinomial Naive Bayes model
nb_accuracy = nb_model.score(X_test, y_test)
print("Multinomial Naive Bayes Accuracy: %.4f" % nb_accuracy)


Multinomial Naive Bayes Accuracy: 0.9111


In this code, we're using a Multinomial Naive Bayes classifier, which is suitable for classification with discrete features (like word counts for text classification). Even though the MNIST dataset contains pixel values, we can normalize them to a range of [0, 1] and still apply the Multinomial Naive Bayes classifier effectively.

After splitting the data into training and testing sets, we create the Multinomial Naive Bayes model, train it with the training data, and evaluate its accuracy on the test data.

In [6]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# Load the MNIST dataset
mnist = datasets.load_digits()

# Convert data into Pandas DataFrames
X = pd.DataFrame(np.array(mnist.data))
y = pd.DataFrame(mnist.target)

# Normalize the features
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.25, random_state=42)

# Create a Logistic Regression classifier model
logreg_model = LogisticRegression(max_iter=1000, random_state=42)

# Train the Logistic Regression model
logreg_model.fit(X_train, y_train.values.ravel())

# Evaluate the Logistic Regression model
logreg_accuracy = logreg_model.score(X_test, y_test)
print("Logistic Regression Accuracy: %.4f" % logreg_accuracy)


Logistic Regression Accuracy: 0.9689


In this code, we're using logistic regression, which is a linear classification algorithm. We normalize the features using Min-Max scaling, split the data into training and testing sets, create a logistic regression model, train it with the training data, and then evaluate its accuracy on the test data.

Logistic regression is widely used for binary classification tasks but can also be extended to multiclass classification, making it suitable for the MNIST dataset, which has multiple classes (digits 0 through 9).







In [7]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the MNIST dataset
mnist = datasets.load_digits()

# Convert data into Pandas DataFrames
X = pd.DataFrame(np.array(mnist.data))
y = pd.DataFrame(mnist.target)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create a Decision Tree classifier model
dt_model = DecisionTreeClassifier(random_state=42)

# Train the Decision Tree model
dt_model.fit(X_train, y_train)

# Evaluate the Decision Tree model
dt_accuracy = dt_model.score(X_test, y_test)
print("Decision Tree Accuracy: %.4f" % dt_accuracy)


Decision Tree Accuracy: 0.8578


In this code, we're using a Decision Tree classifier, which is a popular and interpretable machine learning algorithm. Decision trees recursively split the data into subsets based on the most significant attribute, creating a tree-like structure. We then evaluate the accuracy of the model on the test data.

Decision trees are capable of learning complex decision boundaries and can handle both numerical and categorical data, making them suitable for various classification tasks, including the MNIST dataset.

In [8]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler

# Load the MNIST dataset
mnist = datasets.load_digits()

# Convert data into Pandas DataFrames
X = pd.DataFrame(np.array(mnist.data))
y = pd.DataFrame(mnist.target)

# Normalize the features
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.25, random_state=42)

# Create a Gradient Boosting classifier model
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the Gradient Boosting model
gb_model.fit(X_train, y_train.values.ravel())

# Evaluate the Gradient Boosting model
gb_accuracy = gb_model.score(X_test, y_test)
print("Gradient Boosting Accuracy: %.4f" % gb_accuracy)


Gradient Boosting Accuracy: 0.9711


In this code, we're using a Gradient Boosting classifier, which is an ensemble learning technique that builds a strong model by combining multiple weak models, typically decision trees. Gradient boosting iteratively trains new models to correct errors made by previous models, gradually improving accuracy. After splitting the data into training and testing sets, we create the Gradient Boosting classifier, train it with the training data, and then evaluate its accuracy on the test data.

Gradient Boosting classifiers are known for their high predictive accuracy and robustness against overfitting, making them suitable for various classification tasks, including the MNIST dataset.

In [9]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

# Load the MNIST dataset
mnist = datasets.load_digits()

# Convert data into Pandas DataFrames
X = pd.DataFrame(np.array(mnist.data))
y = pd.DataFrame(mnist.target)

# Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, test_size=0.25, random_state=42)

# Create a Gaussian Naive Bayes classifier model
nb_model = GaussianNB()

# Train the Gaussian Naive Bayes model
nb_model.fit(X_train, y_train.values.ravel())

# Evaluate the Gaussian Naive Bayes model
nb_accuracy = nb_model.score(X_test, y_test)
print("Gaussian Naive Bayes Accuracy: %.4f" % nb_accuracy)


Gaussian Naive Bayes Accuracy: 0.7778


In this code, we're using a Gaussian Naive Bayes classifier, which assumes that the features follow a Gaussian distribution. Before training the model, we standardize the features to have a mean of 0 and a standard deviation of 1 using StandardScaler. Then, we split the data into training and testing sets, create the Gaussian Naive Bayes classifier, train it with the training data, and evaluate its accuracy on the test data.

Naive Bayes classifiers are simple yet effective probabilistic classifiers that are particularly useful for datasets with many features like the MNIST dataset.

In [10]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Load the MNIST dataset
mnist = datasets.load_digits()

# Convert data into Pandas DataFrames
X = pd.DataFrame(np.array(mnist.data))
y = pd.DataFrame(mnist.target)

# Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, test_size=0.25, random_state=42)

# Create an SVM classifier model with RBF kernel
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale')

# Train the SVM model
svm_model.fit(X_train, y_train.values.ravel())

# Evaluate the SVM model
svm_accuracy = svm_model.score(X_test, y_test)
print("SVM with RBF Kernel Accuracy: %.4f" % svm_accuracy)


SVM with RBF Kernel Accuracy: 0.9822


In this code, we're using a Support Vector Machine (SVM) classifier with a radial basis function (RBF) kernel. SVM with an RBF kernel is a powerful classifier that can capture complex decision boundaries. Before training the model, we standardize the features to have a mean of 0 and a standard deviation of 1 using StandardScaler. Then, we split the data into training and testing sets, create the SVM classifier with an RBF kernel, train it with the training data, and evaluate its accuracy on the test data.

SVM with an RBF kernel is particularly effective for high-dimensional datasets like the MNIST dataset and can achieve high accuracy in classification tasks.

In [11]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Load the MNIST dataset
mnist = datasets.load_digits()

# Convert data into Pandas DataFrames
X = pd.DataFrame(np.array(mnist.data))
y = pd.DataFrame(mnist.target)

# Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, test_size=0.25, random_state=42)

# Create an MLP classifier model
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

# Train the MLP model
mlp_model.fit(X_train, y_train.values.ravel())

# Evaluate the MLP model
mlp_accuracy = mlp_model.score(X_test, y_test)
print("MLP Classifier Accuracy: %.4f" % mlp_accuracy)


MLP Classifier Accuracy: 0.9844


In this code, we're using a Multilayer Perceptron (MLP) classifier, which is a type of artificial neural network with multiple layers of nodes. MLP classifiers are capable of learning complex relationships in data and are widely used for classification tasks.

Before training the model, we standardize the features to have a mean of 0 and a standard deviation of 1 using StandardScaler. Then, we split the data into training and testing sets, create the MLP classifier with a single hidden layer of 100 neurons, train it with the training data, and evaluate its accuracy on the test data.

MLP classifiers are flexible and can handle various types of data, making them suitable for the MNIST dataset. They can achieve high accuracy but may require tuning of hyperparameters and careful preprocessing of the data.

In [12]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

# Load the MNIST dataset
mnist = datasets.load_digits()

# Convert data into Pandas DataFrames
X = pd.DataFrame(np.array(mnist.data))
y = pd.DataFrame(mnist.target)

# Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, test_size=0.25, random_state=42)

# Initialize all classifiers
classifiers = {
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Support Vector Machine": SVC(kernel='rbf', C=1.0, gamma='scale'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Naive Bayes": GaussianNB(),
    "Multilayer Perceptron": MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Train and evaluate all classifiers
results = {}
for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train.values.ravel())
    accuracy = classifier.score(X_test, y_test)
    results[name] = accuracy

# Print the results
print("Classifier\t\tAccuracy")
print("-" * 30)
for name, accuracy in results.items():
    print(f"{name}\t\t{accuracy:.4f}")


Classifier		Accuracy
------------------------------
K-Nearest Neighbors		0.9778
Support Vector Machine		0.9822
Random Forest		0.9756
Naive Bayes		0.7778
Multilayer Perceptron		0.9844
Decision Tree		0.8533
Gradient Boosting		0.9711


In this code:

We define a dictionary called classifiers, which contains instances of various classifiers, such as K-Nearest Neighbors, Support Vector Machine, Random Forest, Naive Bayes, Multilayer Perceptron, Decision Tree, and Gradient Boosting.
We train each classifier on the training data and evaluate its accuracy on the test data.
Finally, we print out the accuracy of each classifier.
This approach allows you to compare the performance of multiple classifiers on the same dataset. You can easily add or remove classifiers from the classifiers dictionary to include different algorithms or variations of the same algorithm with different parameters.

In [13]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

# Load the MNIST dataset
mnist = datasets.load_digits()

# Convert data into Pandas DataFrames
X = pd.DataFrame(np.array(mnist.data))
y = pd.DataFrame(mnist.target)

# Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, test_size=0.25, random_state=42)

# Initialize all classifiers
classifiers = {
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Support Vector Machine": SVC(kernel='rbf', C=1.0, gamma='scale'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Naive Bayes": GaussianNB(),
    "Multilayer Perceptron": MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Train and evaluate all classifiers
results = {}
for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train.values.ravel())
    accuracy = classifier.score(X_test, y_test)
    results[name] = accuracy

# Print the results
print("Classifier\t\tAccuracy")
print("-" * 30)
for name, accuracy in results.items():
    print(f"{name}\t\t{accuracy:.4f}")


Classifier		Accuracy
------------------------------
K-Nearest Neighbors		0.9778
Support Vector Machine		0.9822
Random Forest		0.9756
Naive Bayes		0.7778
Multilayer Perceptron		0.9844
Decision Tree		0.8533
Gradient Boosting		0.9711


In this code:

We initialize multiple classifiers, including K-Nearest Neighbors, Support Vector Machine, Random Forest, Naive Bayes, Multilayer Perceptron, Decision Tree, and Gradient Boosting.
We train each classifier on the training data and evaluate its accuracy on the test data.
Finally, we print out the accuracy of each classifier.
This approach allows you to compare the performance of multiple classifiers on the MNIST dataset. You can easily add or remove classifiers from the classifiers dictionary to include different algorithms or variations of the same algorithm with different parameters.

In [14]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

# Load the MNIST dataset
mnist = datasets.load_digits()

# Convert data into Pandas DataFrames
X = pd.DataFrame(np.array(mnist.data))
y = pd.DataFrame(mnist.target)

# Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, test_size=0.25, random_state=42)

# Initialize classifiers
classifiers = {
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Support Vector Machine": SVC(kernel='rbf', C=1.0, gamma='scale'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Naive Bayes": GaussianNB(),
    "Multilayer Perceptron": MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Train and evaluate classifiers
results = {}
for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train.values.ravel())
    accuracy = classifier.score(X_test, y_test)
    results[name] = accuracy

# Print the results
print("Classifier\t\tAccuracy")
print("-" * 40)
for name, accuracy in results.items():
    print(f"{name}\t\t{accuracy:.4f}")


Classifier		Accuracy
----------------------------------------
K-Nearest Neighbors		0.9778
Support Vector Machine		0.9822
Random Forest		0.9756
Naive Bayes		0.7778
Multilayer Perceptron		0.9844
Decision Tree		0.8533
Gradient Boosting		0.9711


In this code:

We define a dictionary called classifiers, which contains instances of multiple classifiers, including K-Nearest Neighbors, Support Vector Machine, Random Forest, Naive Bayes, Multilayer Perceptron, Decision Tree, and Gradient Boosting.
We train each classifier on the training data and evaluate its accuracy on the test data.
Finally, we print out the accuracy of each classifier.
This approach allows you to compare the performance of multiple classifiers on the same MNIST dataset. You can easily add or remove classifiers from the classifiers dictionary to include different algorithms or variations of the same algorithm with different parameters.