In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Loading the DataFrame(mnist_train_small.csv)
df = pd.read_csv('/content/sample_data/mnist_train_small.csv')

print("First 5 rows of df:")
df.head()

First 5 rows of df:


Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's define a function to split the data into training and testing sets.

In [7]:
def split_data(dataframe, target_column, test_size=0.2, random_state=42):

   X = dataframe.drop(columns=[target_column])
   y = dataframe[target_column]

   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

   return X_train, X_test, y_train, y_test

print("Function 'split_data' defined.")

Function 'split_data' defined.


Let's use the function to split the DataFrame. For this `mnist_train_small.csv` dataset, let's assume the last column (index 784) is the target variable (label) and the rest are features. We will set `target_column` to '784'.

In [11]:
# Assuming the last column '784' is the target variable for the MNIST dataset
target_column_name = str(df.columns[-1]) # Getting the name of the last column

X_train, X_test, y_train, y_test = split_data(df, target_column=target_column_name)

print(f"\nShape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

print("\nFirst 5 rows of X_train:")
display(X_train.head())
print("\nFirst 5 rows of y_train:")
y_train.head()


Shape of X_train: (15999, 784)
Shape of X_test: (4000, 784)
Shape of y_train: (15999,)
Shape of y_test: (4000,)

First 5 rows of X_train:


Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.580,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589
5894,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3728,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8958,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7671,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0



First 5 rows of y_train:


Unnamed: 0,0.590
5894,0
3728,0
8958,0
7671,0
5999,0


Define a function to normalize the features using `MinMaxScaler`. There is other alternative method of normalizing which is `StandardScaler`

In [14]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

def normalize_features(X_train_df, X_test_df, feature_range=(0, 1)):

    scaler = MinMaxScaler(feature_range=feature_range)

    # Fit the scaler on the training data and transform both training and test data
    X_train_scaled = scaler.fit_transform(X_train_df)
    X_test_scaled = scaler.transform(X_test_df)

    # Convert scaled arrays back to DataFrames
    X_train_normalized = pd.DataFrame(X_train_scaled, columns=X_train_df.columns, index=X_train_df.index)
    X_test_normalized = pd.DataFrame(X_test_scaled, columns=X_test_df.columns, index=X_test_df.index)

    return X_train_normalized, X_test_normalized

print("Function 'normalize_features' defined.")

Function 'normalize_features' defined.


Let's use the `normalize_features` function to normalize our `X_train` and `X_test` data.

In [13]:
X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)

print("Shapes after normalization:")
print(f"Shape of X_train_normalized: {X_train_normalized.shape}")
print(f"Shape of X_test_normalized: {X_test_normalized.shape}")

print("\nFirst 5 rows of normalized X_train:")
display(X_train_normalized.head())

Shapes after normalization:
Shape of X_train_normalized: (15999, 784)
Shape of X_test_normalized: (4000, 784)

First 5 rows of normalized X_train:


Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.580,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589
5894,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3728,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8958,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7671,0.444444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
