In [2]:
# softmax regression
import numpy as np
import math
import random

z = np.array([5,4,3,6,6])

def softmax(z):
    
    # z--> linear part.
    
    # subtracting the max of z for numerical stability.
    exp = np.exp(z - np.max(z))
    
    # Calculating softmax for all examples.
    for i in range(len(z)):
        exp[i] /= np.sum(exp[i])
        
    return exp

def softmax_stable(Z):
    e_Z = np.exp(Z - np.max(Z, axis = 0, keepdims = True))
    A = e_Z / e_Z.sum(axis = 0)
    return A

def number_classes(y):
    c = 0
    lst_class = []
    dict = {}
    for i in range(len(y)):
        if y[i] not in lst_class:
            c += 1
            lst_class.append(y[i])
    return lst_class, c

def one_hot(y, c):
    
    # y--> label/ground truth.
    # c--> Number of classes.
    
    # A zero matrix of size (m, c)
    y_hot = np.zeros((len(y), c))
    
    # Putting 1 for column where the label is,
    # Using multidimensional indexing.
    y_hot[np.arange(len(y)), y] = 1
    
    return y_hot

def predict(X, w, b):
    
    # X --> Input.
    # w --> weights.
    # b --> bias.
    
    # Predicting
    z = X@w + b
    y_hot = softmax(z)
    
    # Returning the class with highest probability.
    return np.argmax(y_hot, axis=1)

def accuracy(y, y_hat):
    return str(np.sum(y==y_hat)/len(y)*100) +"%"

print(softmax(z))
print(softmax_stable(z))

[1. 1. 1. 1. 1.]
[0.14409682 0.05301026 0.01950138 0.39169577 0.39169577]


In [3]:
def predict_top(x, w, b, n):
    z = x@w + b
    y_hot = softmax_stable(z)
    
    #sort the y_hot
    sort = np.sort(y_hot)
    sort_convert = sort[::-1]
    
    top_n = sort_convert[:n]
    top_n_index = []
    for j in range(len(top_n)):
        for i in range(len(y_hot)):
            if y_hot[i] == top_n[j]:
                top_n_index.append(i)
    
    top_list = {}
    for i in range(n):
        top_list[(dict_convert[classes[top_n_index[i]]])] = str(sort_convert[i] * 100) + "%"
    return top_n, top_n_index, top_list

In [4]:
import pandas as pd

df = pd.read_csv("dataset.csv")
df_raw = df.iloc[:,1:]

In [5]:
X = df_raw.iloc[:,:-1].to_numpy()
y = df_raw.iloc[:,-1].to_numpy()

In [6]:
y

array(['Fungal infection', 'Fungal infection', 'Fungal infection', ...,
       'Covid', 'Covid', 'Covid'], dtype=object)

In [7]:
#create dictionary of classes

dict = {}
count = 0
for x in y:
    if x not in dict:
        dict[x] = count
        count += 1

In [8]:
#create dictionary of convert classes
dict_convert = {}
for x in dict:
    dict_convert[dict[x]] = x

In [9]:
#change text to integer
for i in range(len(y)):
    y[i] = dict[y[i]]

In [10]:
y = y.astype(np.int32)

In [11]:
classes, c = number_classes(y)
y_hot = one_hot(y, c)

In [12]:
loss = -np.mean(np.log(y_hot[np.arange(len(y)), y]))

In [13]:
y_hot

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [14]:
def fit(X, y, lr, c, epochs):
    
    # X --> Input.
    # y --> true/target value.
    # lr --> Learning rate.
    # c --> Number of classes.
    # epochs --> Number of iterations.
    
        
    # m-> number of training examples
    # n-> number of features 
    m, n = X.shape
    
    # Initializing weights and bias randomly.
    w = np.random.random((n, c))
    b = np.random.random(c)
    # Empty list to store losses.
    losses = []
    
    # Training loop.
    for epoch in range(epochs):
        
        # Calculating hypothesis/prediction.
        z = X@w + b
        y_hat = softmax(z)
        
        # One-hot encoding y.
        y_hot = one_hot(y, c)
        
        # Calculating the gradient of loss w.r.t w and b.
        w_grad = (1/m)*np.dot(X.T, (y_hat - y_hot)) 
        b_grad = (1/m)*np.sum(y_hat - y_hot)
        
        # Updating the parameters.
        w = w - lr*w_grad
        b = b - lr*b_grad
        
        # Calculating loss and appending it in the list.
        loss = -np.mean(np.log(y_hat[np.arange(len(y)), y]))
        losses.append(loss)
        # Printing out the loss at every 100th iteration.
        if epoch%100==0:
            print('Epoch {epoch}==> Loss = {loss}'
                  .format(epoch=epoch, loss=loss))
    return w, b, losses

In [15]:
#train vs sample data
import numpy as np
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [16]:
# Training
w, b, l = fit(X_train, y_train, lr=1, c=len(classes), epochs=2000)

Epoch 0==> Loss = 15.945728091562515


  loss = -np.mean(np.log(y_hat[np.arange(len(y)), y]))


Epoch 100==> Loss = 0.002941958754275336
Epoch 200==> Loss = 0.00239918080103578
Epoch 300==> Loss = 0.0021471607672331012
Epoch 400==> Loss = 0.001990989457306159
Epoch 500==> Loss = 0.00188541627465411
Epoch 600==> Loss = 0.0018079147530272305
Epoch 700==> Loss = 0.0017471384188264992
Epoch 800==> Loss = 0.0016975651940782299
Epoch 900==> Loss = 0.0016561881019201453
Epoch 1000==> Loss = 0.0016211138198102646
Epoch 1100==> Loss = 0.0015910292502261847
Epoch 1200==> Loss = 0.0015649721251688106
Epoch 1300==> Loss = 0.0015422133355567297
Epoch 1400==> Loss = 0.001522187789514879
Epoch 1500==> Loss = 0.0015044500493187564
Epoch 1600==> Loss = 0.001488644207451563
Epoch 1700==> Loss = 0.0014744826314878254
Epoch 1800==> Loss = 0.0014617305481069286
Epoch 1900==> Loss = 0.001450194626552295


In [17]:
# Accuracy for training set.
train_preds = predict(X_train, w, b)
print("train set accuracy: ", accuracy(y_train, train_preds))

# Accuracy for test set.
# Flattening and normalizing.
test_preds = predict(X_test, w, b)
print("test set accuracy: ", accuracy(y_test, test_preds))

train set accuracy:  99.96768982229402%
test set accuracy:  99.9515503875969%


In [18]:
x = X[5115]
predict_top(x, w, b, 5)

(array([1.00000000e+00, 2.43891460e-30, 5.55912089e-32, 3.68280355e-32,
        3.08691828e-32]),
 [41, 37, 35, 38, 10],
 {'Covid': '100.0%',
  'Acne': '2.4389145997569186e-28%',
  'Arthritis': '5.5591208901950144e-30%',
  'Urinary tract infection': '3.682803548309472e-30%',
  'Hypertension ': '3.086918275335293e-30%'})

In [19]:
def create_random_x():
    n = random.randint(10,30)
    random_index = random.sample(range(1, len(X[5] - 1)), n)
    zero_matrix = np.zeros(len(X[5]))
    for i in random_index:
        zero_matrix[i] = 1
    return n, zero_matrix

In [20]:
n, k = create_random_x()
n, k
predict_top(k, w, b, 5)

(array([0.88468886, 0.05060782, 0.01627917, 0.01589509, 0.01255453]),
 [22, 24, 19, 12, 23],
 {'Hepatitis D': '88.46888552221093%',
  'Alcoholic hepatitis': '5.060782175048032%',
  'hepatitis A': '1.627917025887722%',
  'Cervical spondylosis': '1.589509089854039%',
  'Hepatitis E': '1.2554525256490545%'})

In [21]:
#train model with sklearn
import numpy as np 
from mnist import MNIST
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import accuracy_score

# train
logreg = linear_model.LogisticRegression(C=1e5, solver = 'lbfgs', multi_class = 'multinomial')
logreg.fit(X_train, y_train)

# test
y_pred = logreg.predict(X_test)
print ("Accuracy: %.2f %%" %(100*accuracy_score(y_test, y_pred.tolist())))

Accuracy: 100.00 %


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
