# Import Packages

In [23]:
## Data Analysis Packages
import os
import sys
import numpy as np
import pandas as pd
import joblib

## Machine Learning Packages
import sklearn
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_curve, auc
from category_encoders import OrdinalEncoder

import warnings
warnings.filterwarnings("ignore")

In [24]:

# Report the versions of the core libraries used in this notebook so that
# the run can be reproduced in a matching environment.
for label, module in (
    ("Pandas", pd),
    ("Numpy", np),
    ("scikit-learn", sklearn),
    ("joblib", joblib),
    ("lightgbm", lgb),
):
    print(f"{label} Version: {module.__version__}")

Pandas Version: 1.5.3
Numpy Version: 1.23.5
scikit-learn Version: 1.2.1
joblib Version: 1.1.1
lightgbm Version: 4.1.0


In [25]:
pwd

'e:\\XilyticaTrainingRepo\\API Projects\\ML-FastAP-Streamlit-Docker'

# Load Data

In [31]:
# Files
data_file = "./data/Placement_Data_Full_Class.csv"

# Load train data
try:
    data = pd.read_csv(data_file)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    # Only catch genuine load failures (missing/unreadable/empty/malformed file).
    # Re-raise after the hint: every later cell needs `data`, and continuing
    # would either hit a NameError or silently reuse a stale `data` object
    # left in the kernel by an earlier run.
    print("The dataset could not be loaded. Is the dataset missing?")
    raise
else:
    print("The dataset has {} samples with {} features.".format(*data.shape))
    

The dataset has 215 samples with 14 features.


In [32]:
# Preview the first rows to sanity-check column names and value formats.
data.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed


## Field	Description
* sl_no	Serial Number
* gender	Male= ‘M’ (or) Female= ‘F’
* ssc_p	SSC (10th) Percentage
* ssc_b	SSC Board of Education - Value = Central (or) Others
* hsc_p	HSC (12th) percentage
* hsc_b	HSC Board of Education - Value = Central (or) Others
* hsc_s	Specialization in HSC
* degree_p	Degree Percentage
* degree_t	Under Graduation (Degree type), Field of degree education
* workex	Work Experience - Value = Yes (or) No
* etest_p	Employability test percentage (conducted by the college)
* specialisation	Post Graduation(MBA)- Specialization
* mba_p	MBA percentage
* status	Status of placement - Value = Placed (or) Not Placed


In [33]:
# Summarize each column's dtype and non-null count to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
dtypes: float64(5), int64(1), object(8)
memory usage: 23.6+ KB


# Prepare Data for Model Training

In [34]:
# Features that are not required: the row identifier and the label itself
exclude_feature = ["sl_no","status"]

# Define target column: class 0 = "Placed", class 1 = "Not Placed"
target = data["status"].map({"Placed":0, "Not Placed":1})
# .map() turns any label missing from the mapping into NaN; fail loudly
# instead of training on a partially undefined target.
if target.isna().any():
    raise ValueError("Unexpected value(s) in 'status'; expected 'Placed' or 'Not Placed'.")

# Define numeric & categorical features
numeric_columns = data.select_dtypes(include=["int64","float64"]).columns.tolist()
categorical_columns = data.select_dtypes(include=["object"]).columns.tolist()
numeric_features = [col for col in numeric_columns if col not in exclude_feature]
categorical_features = [col for col in categorical_columns if col not in exclude_feature]

# Define final feature list for training & validation
features = numeric_features + categorical_features

# Final data for training and validation.
# Kept in its own variable so the raw `data` frame is never overwritten and
# this cell can be re-run without losing the "status" column.
model_data = data[features].fillna(0)

# Split data in train & validation
X_train, X_valid, y_train, y_valid = train_test_split(model_data, target, test_size=0.15, random_state=101)
# Work on explicit copies so the column assignments below modify independent
# frames rather than slices of `model_data`.
X_train, X_valid = X_train.copy(), X_valid.copy()

# Persist the validation rows BEFORE encoding, i.e. with the original
# categorical string values.
X_valid.to_json(path_or_buf="./data/valid.json",orient = "records", lines = True)

# Perform ordinal (label) encoding for categorical variables.
# The encoder is fitted on the training split only and then applied to both.
le = OrdinalEncoder(cols = categorical_features)
le.fit(X_train[categorical_features])
X_train[categorical_features] = le.transform(X_train[categorical_features])
X_valid[categorical_features] = le.transform(X_valid[categorical_features])

# Train and Evaluate Model

In [36]:
# Fit a LightGBM classifier on the encoded training split
clf = LGBMClassifier(random_state=101).fit(X_train, y_train)

# Score the validation split: column 1 of predict_proba is the probability
# of class 1, which is what the ROC curve is built from
valid_prediction = clf.predict_proba(X_valid)[:, 1]
fpr, tpr, thresholds = roc_curve(y_valid, valid_prediction)
roc_auc = auc(fpr, tpr)  # area under the ROC curve

# Print the AUC framed by separator lines
separator = "================================="
print(separator, f"Vlidation AUC:{roc_auc}", separator, sep="\n")

[LightGBM] [Info] Number of positive: 55, number of negative: 127
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000100 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 254
[LightGBM] [Info] Number of data points in the train set: 182, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302198 -> initscore=-0.836854
[LightGBM] [Info] Start training from score -0.836854
Vlidation AUC:0.8809523809523809


In [38]:
# Class probabilities for every validation row: column 0 is class 0 ("Placed"),
# column 1 is class 1 ("Not Placed"), following the target encoding used for training.
clf.predict_proba(X_valid)

array([[9.24210923e-01, 7.57890765e-02],
       [9.99613885e-01, 3.86115271e-04],
       [9.98621379e-01, 1.37862096e-03],
       [9.97476866e-01, 2.52313356e-03],
       [9.21598799e-01, 7.84012006e-02],
       [2.00586568e-01, 7.99413432e-01],
       [5.53818126e-02, 9.44618187e-01],
       [9.27148202e-01, 7.28517981e-02],
       [6.49369799e-01, 3.50630201e-01],
       [1.38836747e-02, 9.86116325e-01],
       [2.71790829e-01, 7.28209171e-01],
       [6.42989888e-01, 3.57010112e-01],
       [2.64346459e-01, 7.35653541e-01],
       [5.26997119e-01, 4.73002881e-01],
       [9.85984174e-01, 1.40158262e-02],
       [9.96138180e-01, 3.86182020e-03],
       [1.16840832e-01, 8.83159168e-01],
       [9.84670599e-01, 1.53294010e-02],
       [3.34459898e-03, 9.96655401e-01],
       [4.48022826e-01, 5.51977174e-01],
       [9.90272324e-01, 9.72767576e-03],
       [1.03270440e-02, 9.89672956e-01],
       [9.77377860e-01, 2.26221405e-02],
       [9.58278177e-01, 4.17218233e-02],
       [9.664997

In [39]:
# Keep only column 1: the predicted probability of class 1 ("Not Placed"),
# the same score that is passed to roc_curve during evaluation.
clf.predict_proba(X_valid)[:,1]

array([7.57890765e-02, 3.86115271e-04, 1.37862096e-03, 2.52313356e-03,
       7.84012006e-02, 7.99413432e-01, 9.44618187e-01, 7.28517981e-02,
       3.50630201e-01, 9.86116325e-01, 7.28209171e-01, 3.57010112e-01,
       7.35653541e-01, 4.73002881e-01, 1.40158262e-02, 3.86182020e-03,
       8.83159168e-01, 1.53294010e-02, 9.96655401e-01, 5.51977174e-01,
       9.72767576e-03, 9.89672956e-01, 2.26221405e-02, 4.17218233e-02,
       3.35002613e-02, 4.36980244e-01, 3.07323257e-03, 9.77262470e-01,
       4.39726806e-01, 5.84611317e-04, 9.84686206e-03, 8.07385542e-02,
       3.51474695e-03])

In [40]:
# Report per-class precision, recall and F1 on the validation split,
# using the classifier's hard class predictions
valid_labels = clf.predict(X_valid)
report = classification_report(y_valid, valid_labels)
print(report)

              precision    recall  f1-score   support

           0       0.78      0.86      0.82        21
           1       0.70      0.58      0.64        12

    accuracy                           0.76        33
   macro avg       0.74      0.72      0.73        33
weighted avg       0.75      0.76      0.75        33



# Save Model Artifacts

In [42]:
# Make sure the output directory exists; joblib.dump does not create missing
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs('./model', exist_ok=True)

# Persist everything needed to reproduce predictions outside the notebook:
# the fitted encoder, the trained model, and the feature lists.
joblib.dump(le, './model/label_encoder.joblib')
joblib.dump(clf, './model/lgb_model.joblib')
joblib.dump(features, './model/features.joblib')
joblib.dump(categorical_features, './model/categorical_features.joblib')

['./model/categorical_features.joblib']

# Sending request to Local FastAPI model endpoint

In [86]:
# Calling/ invoking Machine learning endpoint

In [81]:
import requests

# One student record sent as the JSON body of the /predict request
data = {
  "sl_no": 112,
  "ssc_p": 84.0,
  "hsc_p": 90.9,
  "degree_p": 64.5,
  "etest_p": 86.04,
  "mba_p": 59.42,
  "gender": "M",
  "ssc_b": "Others",
  "hsc_b": "Others",
  "hsc_s": "Science",
  "degree_t": "Sci&Tech",
  "workex": "No",
  "specialisation": "Mkt&Fin"
}
# Bound the wait so the cell fails fast instead of hanging forever when the
# local server is not running or is unresponsive.
response = requests.post("http://127.0.0.1:8000/predict", json=data, timeout=10)
print(response.text)

{"prediction":["Placed"]}


In [85]:
# A second student record sent as the JSON body of the /predict request
data = {
  "sl_no": 113,
  "ssc_p": 52.0,
  "hsc_p": 57.0,
  "degree_p": 50.8,
  "etest_p": 67.0,
  "mba_p": 62.79,
  "gender": "M",
  "ssc_b": "Central",
  "hsc_b": "Central",
  "hsc_s": "Commerce",
  "degree_t": "Comm&Mgmt",
  "workex": "No",
  "specialisation": "Mkt&HR"
}
# Bound the wait so the cell fails fast instead of hanging forever when the
# local server is not running or is unresponsive.
response = requests.post("http://127.0.0.1:8000/predict", json=data, timeout=10)
print(response.text)

{"prediction":["Not Placed"]}


In [84]:
# Student record for this request
val = {
  "sl_no": 112,
  "ssc_p": 84.0,
  "hsc_p": 90.9,
  "degree_p": 64.5,
  "etest_p": 86.04,
  "mba_p": 59.42,
  "gender": "M",
  "ssc_b": "Others",
  "hsc_b": "Others",
  "hsc_s": "Science",
  "degree_t": "Sci&Tech",
  "workex": "No",
  "specialisation": "Mkt&Fin"
}
# Send the payload defined in THIS cell (`val`). The request previously sent
# `data`, i.e. whatever an earlier cell happened to leave in the kernel, so
# the response depended on execution order rather than on the record above.
# The timeout keeps the cell from hanging if the local server is unresponsive.
response = requests.post("http://127.0.0.1:8000/predict", json=val, timeout=10)
print(response.text)

{"prediction":["Placed"]}


In [79]:
import requests

# Student record with a high ssc_p but very low remaining percentages,
# sent as the JSON body of the /predict request
data = {
  "sl_no": 12,
  "ssc_p": 90.0,
  "hsc_p": 10.9,
  "degree_p": 4.5,
  "etest_p": 6.04,
  "mba_p": 19.42,
  "gender": "M",
  "ssc_b": "Others",
  "hsc_b": "Others",
  "hsc_s": "Science",
  "degree_t": "Sci&Tech",
  "workex": "No",
  "specialisation": "Mkt&Fin"
}
# Bound the wait so the cell fails fast instead of hanging forever when the
# local server is not running or is unresponsive.
response = requests.post("http://127.0.0.1:8000/predict", json=data, timeout=10)
print(response.text)

{"prediction":["Placed"]}
