# Multiclass Classification


## Using Logistic Regression
- Imports 

In [1]:
import numpy as np
import pandas as pd
from ml_lib.linear_models.logistic_regression import LogisticRegression
from ml_lib.metrics.math import accuracy
from ml_lib.preprocessing.scaler import StandardScaler
from ml_lib.utils.data import train_test_split
from ml_lib.preprocessing.pipeline import Pipeline
from ml_lib.preprocessing.imputer import SimpleImputer

- Load Data

In [2]:
train_df= pd.read_csv(r"C:/project/datasets/train_multi_class.csv")
test_df= pd.read_csv(r"C:/project/datasets/test_multi_class.csv")

test_df.info()
train_df_clean=train_df.dropna(subset=[train_df.columns[-1]])

print("Original rows:", len(train_df))
print("Rows kept:", len(train_df_clean))
print("Rows removed:", len(train_df) - len(train_df_clean))



X= train_df_clean.drop("target",axis=1).values
y=train_df_clean["target"].values

print(f"Shape of X : {X.shape}\nShape of y : {y.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 40 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   feature_0   25000 non-null  float64
 1   feature_1   25000 non-null  float64
 2   feature_2   25000 non-null  float64
 3   feature_3   25000 non-null  float64
 4   feature_4   25000 non-null  float64
 5   feature_5   25000 non-null  float64
 6   feature_6   25000 non-null  float64
 7   feature_7   25000 non-null  float64
 8   feature_8   25000 non-null  float64
 9   feature_9   25000 non-null  float64
 10  feature_10  25000 non-null  float64
 11  feature_11  25000 non-null  float64
 12  feature_12  25000 non-null  float64
 13  feature_13  25000 non-null  float64
 14  feature_14  25000 non-null  float64
 15  feature_15  25000 non-null  float64
 16  feature_16  25000 non-null  float64
 17  feature_17  25000 non-null  float64
 18  feature_18  25000 non-null  float64
 19  feature_19  25000 non-nul

preprocessing

In [3]:
pipe = Pipeline([
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
    ])

X_trans= pipe.fit(X)
print(X_trans.shape)

X_train,y_train,X_val,y_val= train_test_split(X_trans,y)

(49999, 40)


count unique classes

In [4]:
classes = np.unique(y)
n_classes = len(classes)

print("the classes are",classes ,"and the no of classes are ", n_classes )

the classes are [0. 1. 2. 3. 4.] and the no of classes are  5


### Train Model 

In [5]:
models=[]
for cls in classes:
    y_binary= (y_train==cls).astype(int)

    model = LogisticRegression(
        lr= 0.005,
        epochs=5000
    )
    model.fit(X_train,y_binary)
    models.append(model)

NOte  it takes much time

Predictions function for all classes 

In [6]:
def multiclass_predict(X):
    probs=[]
    for model in models:
        probs.append(model.predict_proba(X))

    probs = np.array(probs)
    return np.argmax(probs,axis=0)

In [7]:
y_pred = multiclass_predict(X_val)
acc=accuracy(y_val,y_pred)

print(f"Test accuracy : {acc}")

Test accuracy : 0.5766576657665766


- as accuracy is very low so we use neural network method for the multiclass classification

## Classification using Neural Network

- IMPORTS


In [8]:
from ml_lib.neural_network.layers import Dense
from ml_lib.neural_network.sequential import Sequential
from ml_lib.neural_network.losses import CategoricalCrossEntropy
from ml_lib.preprocessing.encoding import LabelEncoder, OneHotEncoder

- Preprocessing

In [9]:
y= np.nan_to_num(y, nan=5)
y = y.astype(int)

le= LabelEncoder()
y_train_enc = le.fit_transform(y)

ohe = OneHotEncoder()
y_train_oh = ohe.fit_transform(y_train_enc.reshape(-1,1))



print("Train labels:", np.unique(y_train_enc))
print("One-hot shape:", y_train_oh.shape)



Train labels: [0 1 2 3 4]
One-hot shape: (49999, 5)


In [10]:
model = Sequential([
(Dense(128, activation="relu",init= "he")),
(Dense(64, activation="relu",init="he")),
(Dense(n_classes, activation="softmax",init="xavier"))])

model.compile(optimizer="adam", loss= CategoricalCrossEntropy(),lr=0.005)

history = model.fit(X_trans, y_train_oh, epochs=1000)

Epoch 0 | Loss: 11.0305
Epoch 20 | Loss: 2.0839
Epoch 40 | Loss: 1.1311
Epoch 60 | Loss: 0.9295
Epoch 80 | Loss: 0.8367
Epoch 100 | Loss: 0.7785
Epoch 120 | Loss: 0.7366
Epoch 140 | Loss: 0.7056
Epoch 160 | Loss: 0.6817
Epoch 180 | Loss: 0.6619
Epoch 200 | Loss: 0.6450
Epoch 220 | Loss: 0.6302
Epoch 240 | Loss: 0.6173
Epoch 260 | Loss: 0.6059
Epoch 280 | Loss: 0.5955
Epoch 300 | Loss: 0.5860
Epoch 320 | Loss: 0.5775
Epoch 340 | Loss: 0.5703
Epoch 360 | Loss: 0.5640
Epoch 380 | Loss: 0.5583
Epoch 400 | Loss: 0.5530
Epoch 420 | Loss: 0.5482
Epoch 440 | Loss: 0.5438
Epoch 460 | Loss: 0.5397
Epoch 480 | Loss: 0.5359
Epoch 500 | Loss: 0.5324
Epoch 520 | Loss: 0.5290
Epoch 540 | Loss: 0.5258
Epoch 560 | Loss: 0.5227
Epoch 580 | Loss: 0.5199
Epoch 600 | Loss: 0.5172
Epoch 620 | Loss: 0.5147
Epoch 640 | Loss: 0.5123
Epoch 660 | Loss: 0.5100
Epoch 680 | Loss: 0.5077
Epoch 700 | Loss: 0.5055
Epoch 720 | Loss: 0.5034
Epoch 740 | Loss: 0.5013
Epoch 760 | Loss: 0.4993
Epoch 780 | Loss: 0.4973
Epoch

- check the accuracy percentage

In [11]:
y_val_pred = model.predict(X_val)
y_val_pred = np.argmax(y_val_pred, axis=1)
y_val_labels=le.inverse_transform(y_val_pred)

val_acc= accuracy(y_val,y_val_labels)*100
print(f"Validation Accuracy : ",val_acc,"%")

Validation Accuracy :  85.06850685068507 %


for test data

In [15]:
feature_cols = train_df.columns[:-1] 
X_test = test_df[feature_cols].values
print(X_test.shape,X_test.shape[0])
X_test_trans = pipe.transform(X_test)
y_t_pred = model.predict(X_test_trans)
y_test_pred=np.argmax(y_t_pred,axis=1)
y_test_pred=le.inverse_transform(y_test_pred)
print(y_test_pred.shape)
id = np.arange(1,X_test_trans.shape[0]+1)
df2=pd.DataFrame({
    'id':id,
    'target':y_test_pred
})
df2.to_csv(r"C:/project/datasets/processed/multiclass_classification_NN.csv",index=False)


(25000, 40) 25000
(25000,)
