# Machine Learning Model: Multi-Class Classification

## Imports

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd 


from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

## 1 Loading CSV

In [2]:
#Load csv
df = pd.read_csv("./dataset/tbpred.csv")

## 2 Data Preprocessing

In [3]:
#Label Encode 'Country of Birth' 
le = preprocessing.LabelEncoder()
df['encoded_country'] = le.fit_transform(df['CountryofBirth'])
X = df.drop(['Name','CountryofBirth'], axis=1)

In [4]:
X1 = X.drop('tb_pred', axis=1)
Y1 = X['tb_pred']

## 3 Split Data into Training and Testing Set

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X1, Y1, test_size = 0.20)

## 4 Generate Classification Model

In [6]:
# Fitting Logistic Regression to the Training set
classifier = LogisticRegression(solver='liblinear', random_state = 0)
classifier.fit(X_train, y_train)

LogisticRegression(random_state=0, solver='liblinear')

## 5 Predict and Evaluate Model

In [7]:
y_pred=classifier.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print("train accuracy: %.2f." %accuracy)

precision = metrics.precision_score(y_test, y_pred)
print("train precision: %.2f." %precision)

recall = metrics.recall_score(y_test, y_pred)
print("train recall: %.2f." %recall)

train accuracy: 0.80.
train precision: 0.57.
train recall: 0.43.


In [8]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88       310
           1       0.57      0.43      0.49        90

    accuracy                           0.80       400
   macro avg       0.71      0.67      0.68       400
weighted avg       0.78      0.80      0.79       400



### Test

In [21]:
feature_names = df.columns.tolist()

In [22]:
a = np.array([[1,24,0,4,1,0,0,1,1,1,1,3,0,0,1,1,0,1,0,0,0,34.8]])
b = classifier.predict(a)
print(b)

[0]


  "X does not have valid feature names, but"


In [30]:
c = classifier.predict_proba(a)
print(c)

[[0.72077085 0.27922915]]


  "X does not have valid feature names, but"


In [32]:
# Assuming 'classifier' is your trained classifier and 'a' is the input data
# Predict classes and probabilities
b = classifier.predict(a)
c = classifier.predict_proba(a)

# Initialize a list to store the modified probabilities
modified_probabilities = []

# Iterate through the predicted classes and choose the corresponding probabilities
for pred, prob in zip(b, c):
    if pred == 0:
        modified_probabilities.append(prob[0])  # Choose the first value from c
    else:
        modified_probabilities.append(prob[1])  # Choose the last value from c

# Convert the modified probabilities list to a numpy array
modified_probabilities = np.array(modified_probabilities)

# Display modified probabilities
print(modified_probabilities)


[0.72077085]


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [39]:
import numpy as np

# Assuming 'classifier' is your trained classifier and 'a' is the input data
# Predict classes and probabilities
b = classifier.predict(a)
c = classifier.predict_proba(a)

b= np.array([1])

# Initialize modified probabilities array
modified_probabilities = np.zeros_like(b, dtype=float)

# Use numpy indexing to assign values based on the predicted classes
modified_probabilities[b == 0] = c[b == 0, 0]  # Assign the first value from c where b is 0
modified_probabilities[b == 1] = c[b == 1, 1]  # Assign the last value from c where b is 1

# Display modified probabilities
print(modified_probabilities)


[0.27922915]


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


### Checking for Percentages

In [44]:
# Calculate the percentage of True and False predictions
percentage_true = (y_pred == 1).sum() / len(y_pred) * 100
percentage_false = (y_pred == 0).sum() / len(y_pred) * 100

# Display the percentages
print(f"Percentage of True Predictions: {percentage_true:.2f}%")
print(f"Percentage of False Predictions: {percentage_false:.2f}%")

Percentage of True Predictions: 17.00%
Percentage of False Predictions: 83.00%


In [45]:
df1 = pd.DataFrame()
df1['Original'] = y_test[:10]
df1['Predicted'] = y_pred[:10]

print(df1)

      Original  Predicted
555          0          0
908          0          0
1553         0          0
578          0          0
12           0          0
1974         1          1
1821         0          0
1209         1          0
812          0          0
1891         0          0


## 6 Saving Model

In [46]:
# To save an encoder
from joblib import dump
dump(le, 'label_encoder.joblib')

dump(classifier, 'classifier.joblib')

['classifier.joblib']