We are going to have a full fledged ML Project implemented to understand its pipeline

__Project Name: Student Grant Recommendation__

# Step 1: Data Retrieval

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'

In [2]:
df = pd.read_csv("student_records.csv")
df.head()

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Henry,A,Y,90,85,Yes
1,John,C,N,85,51,Yes
2,David,F,N,10,17,No
3,Holmes,B,Y,75,71,No
4,Marvin,E,N,20,30,No


# Step 2: Data Preparation

Since it is a clean dataset, Data Wrangling is not required. Hence here we will be mainly focussing on Feature Engineering and Scaling.

# Step 3: Feature Extraction and Engineering:

We will extract the existing features and outcomes in separate variables.

In [3]:
feature_names = ['OverallGrade', 'Obedient','ResearchScore','ProjectScore']
training_features = df[feature_names]

# training_features = df[['OverallGrade', 'Obedient','ResearchScore','ProjectScore']]

In [4]:
training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,90,85
1,C,N,85,51
2,F,N,10,17
3,B,Y,75,71
4,E,N,20,30
5,A,Y,92,79
6,B,Y,60,59
7,C,Y,75,33


In [5]:
type(training_features)

pandas.core.frame.DataFrame

In [6]:
outcome_name = ['Recommend']
outcome_labels = df[outcome_name]

In [7]:
outcome_labels

Unnamed: 0,Recommend
0,Yes
1,Yes
2,No
3,No
4,No
5,Yes
6,No
7,No


In [8]:
type(outcome_labels)

pandas.core.frame.DataFrame

In [9]:
# Lets separate the categorical and numerical features now
numeric_feature_names = ['ResearchScore','ProjectScore']
categorical_feature_names = ['OverallGrade','Obedient']

In [10]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

ss.fit(training_features[numeric_feature_names]) # ss, go and study

# Once ur study is done, go and give exam
training_features[numeric_feature_names] = ss.transform(training_features[numeric_feature_names])

training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,0.899583,1.37665
1,C,N,0.730648,-0.091777
2,F,N,-1.80339,-1.560203
3,B,Y,0.392776,0.772004
4,E,N,-1.465519,-0.998746
5,A,Y,0.967158,1.117516
6,B,Y,-0.114032,0.253735
7,C,Y,0.392776,-0.869179


In [11]:
import warnings;
warnings.simplefilter('ignore')

In [12]:
# Treat the categorical columns using One Hot Encoding Method
training_features = pd.get_dummies(training_features, columns=categorical_feature_names)

training_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_B,OverallGrade_C,OverallGrade_E,OverallGrade_F,Obedient_N,Obedient_Y
0,0.899583,1.37665,1,0,0,0,0,0,1
1,0.730648,-0.091777,0,0,1,0,0,1,0
2,-1.80339,-1.560203,0,0,0,0,1,1,0
3,0.392776,0.772004,0,1,0,0,0,0,1
4,-1.465519,-0.998746,0,0,0,1,0,1,0
5,0.967158,1.117516,1,0,0,0,0,0,1
6,-0.114032,0.253735,0,1,0,0,0,0,1
7,0.392776,-0.869179,0,0,1,0,0,0,1


In [13]:
set(training_features.columns)

{'Obedient_N',
 'Obedient_Y',
 'OverallGrade_A',
 'OverallGrade_B',
 'OverallGrade_C',
 'OverallGrade_E',
 'OverallGrade_F',
 'ProjectScore',
 'ResearchScore'}

In [14]:
set(numeric_feature_names)

{'ProjectScore', 'ResearchScore'}

In [15]:
categorical_engineered_features = list(set(training_features.columns) - set(numeric_feature_names))

In [16]:
categorical_engineered_features

['OverallGrade_B',
 'Obedient_N',
 'OverallGrade_A',
 'OverallGrade_C',
 'OverallGrade_F',
 'Obedient_Y',
 'OverallGrade_E']

# Modeling: Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
import numpy as np

lr = LogisticRegression()

model = lr.fit(training_features, np.array(outcome_labels['Recommend']))

print(model)

LogisticRegression()


# Model Evaluation:

In [18]:
pred_labels = model.predict(training_features)
actual_labels = np.array(outcome_labels['Recommend'])

In [19]:
actual_labels

array(['Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No'], dtype=object)

In [20]:
pred_labels

array(['Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No'], dtype=object)

In [21]:
df.groupby('Recommend').size()

Recommend
No     5
Yes    3
dtype: int64

In [22]:
# Evaluate the performance of a model
from sklearn.metrics import accuracy_score

accuracy_score(actual_labels,pred_labels)

1.0

In [23]:
# Evaluate the performance of a model
from sklearn.metrics import classification_report

print(classification_report(actual_labels,pred_labels))

              precision    recall  f1-score   support

          No       1.00      1.00      1.00         5
         Yes       1.00      1.00      1.00         3

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



In [24]:
# Evaluate the performance of a model
from sklearn.metrics import confusion_matrix

print(confusion_matrix(actual_labels,pred_labels))

[[5 0]
 [0 3]]


# Model Deployment

In [26]:
!pip install --upgrade joblib

Collecting joblib
  Downloading joblib-0.17.0-py3-none-any.whl (301 kB)
Installing collected packages: joblib
  Attempting uninstall: joblib
    Found existing installation: joblib 0.16.0
    Uninstalling joblib-0.16.0:
      Successfully uninstalled joblib-0.16.0
Successfully installed joblib-0.17.0


In [32]:
import joblib
import os
# save the model
if not os.path.exists('Model'):
    os.mkdir('Model')
if not os.path.exists('Scaler'):
    os.mkdir('Scaler')
    
joblib.dump(model,r'Model/client_model.pickle')
joblib.dump(ss,r'Scaler/client_scaler.pickle')        

['Scaler/client_scaler.pickle']

# Prediction in Action

In [33]:
model1 = joblib.load(r'Model/client_model.pickle')
ss1 = joblib.load(r'Scaler/client_scaler.pickle')

In [34]:
new_data = pd.DataFrame([
    {'Name' : 'Ninad' , 'OverallGrade' : 'F' , 'Obedient' : 'N', 'ResearchScore' : 30 , 'ProjectScore' : 20} ,
    {'Name' : 'Darshan' , 'OverallGrade' : 'A' , 'Obedient' : 'Y', 'ResearchScore' : 78 , 'ProjectScore' : 80} ,
])
new_data # Unseen

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore
0,Ninad,F,N,30,20
1,Darshan,A,Y,78,80


In [36]:
prediction_features = new_data[feature_names]
prediction_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,F,N,30,20
1,A,Y,78,80


In [38]:
prediction_features[numeric_feature_names] = ss1.transform(prediction_features[numeric_feature_names])
prediction_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,F,N,-1.127647,-1.430636
1,A,Y,0.494137,1.160705


In [39]:
prediction_features = pd.get_dummies(prediction_features, columns=categorical_feature_names)
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y
0,-1.127647,-1.430636,0,1,1,0
1,0.494137,1.160705,1,0,0,1


In [40]:
set(prediction_features.columns)

{'Obedient_N',
 'Obedient_Y',
 'OverallGrade_A',
 'OverallGrade_F',
 'ProjectScore',
 'ResearchScore'}

In [42]:
set(numeric_feature_names)

{'ProjectScore', 'ResearchScore'}

In [43]:
current_categorical_engineered_features = set(prediction_features.columns) - set(numeric_feature_names)
current_categorical_engineered_features

{'Obedient_N', 'Obedient_Y', 'OverallGrade_A', 'OverallGrade_F'}

In [44]:
missing_features = set(categorical_engineered_features) - current_categorical_engineered_features
missing_features

{'OverallGrade_B', 'OverallGrade_C', 'OverallGrade_E'}

In [45]:
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y
0,-1.127647,-1.430636,0,1,1,0
1,0.494137,1.160705,1,0,0,1


In [47]:
len(prediction_features)

2

In [48]:
for feature in missing_features:
    prediction_features[feature] = 0 * len(prediction_features)

prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y,OverallGrade_C,OverallGrade_B,OverallGrade_E
0,-1.127647,-1.430636,0,1,1,0,0,0,0
1,0.494137,1.160705,1,0,0,1,0,0,0


In [50]:
predictions = model1.predict(prediction_features)
new_data['Recommend'] = predictions
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Ninad,F,N,30,20,No
1,Darshan,A,Y,78,80,Yes
