Train a naive logistic regression on raw MNIST images to distinguish between 0s and 8s. We are calling
this our baseline. What can you tell about the baseline performance?

In [26]:
# Importing necessary libraries:
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [27]:
# Load scikit-learn's bundled handwritten-digits data: each image is 8x8 pixels, flattened to 64 features.
# NOTE: despite the variable name, `load_digits` provides this small digits set rather than full-size MNIST images.
# return_X_y is left at its default (False), so the call returns a dictionary-like object with `data`, `target`, etc.

mnist = load_digits()
mnist

{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  2., ..., 12.,  0.,  0.],
        [ 0.,  0., 10., ..., 12.,  1.,  0.]]),
 'target': array([0, 1, 2, ..., 8, 9, 8]),
 'frame': None,
 'feature_names': ['pixel_0_0',
  'pixel_0_1',
  'pixel_0_2',
  'pixel_0_3',
  'pixel_0_4',
  'pixel_0_5',
  'pixel_0_6',
  'pixel_0_7',
  'pixel_1_0',
  'pixel_1_1',
  'pixel_1_2',
  'pixel_1_3',
  'pixel_1_4',
  'pixel_1_5',
  'pixel_1_6',
  'pixel_1_7',
  'pixel_2_0',
  'pixel_2_1',
  'pixel_2_2',
  'pixel_2_3',
  'pixel_2_4',
  'pixel_2_5',
  'pixel_2_6',
  'pixel_2_7',
  'pixel_3_0',
  'pixel_3_1',
  'pixel_3_2',
  'pixel_3_3',
  'pixel_3_4',
  'pixel_3_5',
  'pixel_3_6',
  'pixel_3_7',
  'pixel_4_0',
  'pixel_4_1',
  'pixel_4_2',
  'pixel_4_3',
  'pixel_4_4',
  'pixel_4_5',
  'pixel_4_6',
  'pixel_4_7',
  'pixel_5_0',
  'pixel_5_1',
 

In [29]:
# Wrap the pixel matrix in a pandas DataFrame named X: one row per image, one column per pixel value.

X = pd.DataFrame(data=mnist["data"])
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,4.0,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0


In [17]:
# Wrap the digit labels in a single-column DataFrame (the column is named 0).
y = pd.DataFrame(data=mnist["target"])
y

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
...,...
1792,9
1793,0
1794,8
1795,9


In [30]:
# Keep only the label rows whose digit is 0 or 8; the surviving rows keep their original index labels.
is_zero_or_eight = y[0].isin([0, 8])
y_sub = y[is_zero_or_eight]
y_sub

Unnamed: 0,0
0,0
8,8
10,0
18,8
20,0
...,...
1789,8
1790,8
1793,0
1794,8


In [32]:
# Create the matching subset of the pixel data: pick the rows of X whose index
# labels appear in y_sub.
# y_sub.index holds index *labels*, so select with label-based .loc rather than
# positional .iloc; the two only coincide while X keeps its default 0..n-1 index.

X_sub = X.loc[y_sub.index]
X_sub

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
8,0.0,0.0,9.0,14.0,8.0,1.0,0.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,11.0,16.0,15.0,11.0,1.0,0.0
10,0.0,0.0,1.0,9.0,15.0,11.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,10.0,13.0,3.0,0.0,0.0
18,0.0,0.0,10.0,7.0,13.0,9.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,14.0,5.0,0.0,0.0,0.0
20,0.0,0.0,3.0,13.0,11.0,7.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,2.0,12.0,13.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1789,0.0,0.0,8.0,16.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,16.0,10.0,1.0,0.0,0.0
1790,0.0,0.0,5.0,12.0,8.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,12.0,12.0,3.0,0.0,0.0
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0


In [33]:
# Split the 0-vs-8 subset into training and test partitions (seeded for reproducibility).
# NOTE(review): test_size=0.99 holds out 99% of the rows, so the model is fit on
# only 3 samples (see the shapes printed in the next cells) — confirm this is intentional.

split = train_test_split(X_sub, y_sub, test_size=0.99, random_state=42)
X_train, X_test, y_train, y_test = split

In [21]:
# Sanity check: shape of the training labels after squeezing the one-column frame down to 1-D.
y_train.squeeze().shape

(3,)

In [22]:
# Sanity check: (number of training rows, number of pixel columns).
X_train.shape

(3, 64)

In [34]:
# Create the baseline classifier: a scikit-learn LogisticRegression built with no arguments, i.e. all default settings.

model = LogisticRegression()
model

LogisticRegression()

In [36]:
# Fit on the training split; squeeze() collapses the one-column label frame into a 1-D label vector.
train_labels = y_train.squeeze()
model.fit(X_train, train_labels)

# Predict a digit (0 or 8) for every held-out test image.
y_pred = model.predict(X_test)
y_pred


array([0, 0, 8, 0, 0, 0, 0, 0, 8, 0, 8, 0, 8, 8, 8, 0, 8, 8, 0, 0, 0, 8,
       8, 0, 8, 8, 0, 8, 0, 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 8, 0, 8, 8,
       8, 0, 8, 0, 0, 8, 8, 8, 8, 0, 8, 0, 0, 8, 8, 8, 8, 0, 0, 8, 8, 0,
       0, 0, 8, 0, 0, 8, 0, 8, 0, 0, 8, 0, 0, 8, 8, 8, 0, 8, 0, 8, 8, 8,
       8, 0, 8, 0, 0, 0, 8, 8, 0, 8, 8, 8, 8, 8, 0, 8, 8, 0, 0, 0, 8, 8,
       8, 8, 8, 8, 8, 0, 0, 0, 0, 8, 8, 0, 0, 8, 8, 0, 8, 0, 0, 0, 0, 0,
       8, 8, 0, 8, 0, 8, 8, 8, 8, 8, 0, 8, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0,
       8, 0, 8, 8, 0, 0, 8, 8, 8, 0, 0, 0, 0, 8, 8, 8, 0, 8, 0, 0, 0, 8,
       0, 8, 0, 8, 8, 8, 0, 8, 0, 8, 8, 0, 0, 8, 8, 0, 8, 0, 0, 0, 8, 8,
       8, 0, 8, 0, 0, 0, 0, 8, 0, 0, 8, 8, 0, 0, 8, 8, 0, 8, 0, 8, 8, 0,
       8, 0, 8, 0, 0, 8, 8, 8, 0, 0, 0, 8, 8, 8, 8, 0, 0, 0, 8, 8, 0, 0,
       0, 0, 8, 8, 0, 8, 0, 0, 0, 0, 0, 0, 8, 0, 0, 8, 8, 0, 8, 8, 8, 8,
       0, 0, 0, 8, 8, 0, 0, 8, 8, 8, 0, 8, 0, 0, 0, 0, 0, 0, 8, 8, 0, 0,
       0, 0, 8, 8, 8, 0, 0, 8, 8, 8, 0, 8, 8, 0, 8,

In [37]:
# Accuracy: fraction of held-out test images whose predicted digit matches the true label.

accuracy_score(y_test, y_pred)

0.994269340974212

# Conclusions


1. The accuracy score is calculated using accuracy_score(y_test, y_pred). It represents the proportion of correctly predicted labels in the test set. A higher accuracy score indicates better performance.

2. Since the task involves distinguishing between only two digits (0 and 8), it's a binary classification problem. The accuracy score reflects the model's ability to correctly classify instances as either 0 or 8.

3. Logistic regression is a simple linear classifier. The baseline performance gives us an idea of how well a basic, linear model can separate the chosen digits in the dataset. If the baseline accuracy is high — here it is about 0.994 on the held-out test set, even though the model was trained on only three samples — it suggests that distinguishing between 0 and 8 in this digits dataset is relatively straightforward, even with a simple model.