In [157]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import plotly.express as px
from sklearn.metrics import classification_report

from warnings import filterwarnings
filterwarnings('ignore')

Loading Cleaned Data

In [158]:
data = pd.read_csv('IAS_Cleaned.csv', keep_default_na=False)
data.head()

Unnamed: 0,ID,Name,Gender,DOB,Age,Domicile,Date of Appointment,Allotment Year,Service Tenure,Source of Recruitment,...,Spec 2,Current Post,Post Type,Department,Department Type,Location,Job Status,With Effect From Date,Current Tenure,Pay Level
0,7600,Shri Pradip Kumar Tripathi,Male,1964-06-18,59,Uttar Pradesh,1987-08-24,1987,36,RR,...,Structural Engg,Secretary (Coordination),Secretary,Cabinet Secretariat,Secretariat Department,New Delhi,Active,2022-02-05 00:00:00,2.0,Level 17
1,18400,Shri Naresh Kumar,Male,1963-11-18,60,Delhi,1987-08-24,1987,36,RR,...,Public Admn,Chief Secretary,Secretary,N.A.,Secretariat Department,Delhi,Active,2022-04-21 00:00:00,2.0,Level 17
2,18800,Shri Chetan Bhushan Sanghi,Male,1965-06-26,58,Andhra Pradesh,1988-08-25,1988,35,RR,...,Hons,Financial Commissioner,Commissioner,Govt. of National capital Territory of Delhi (...,Other Department,New Delhi,Active,2021-01-04 00:00:00,3.0,Level 17
3,19002,Dr.(Ms.) Renu Sharma,Female,1964-10-19,59,Delhi,1988-08-25,1988,35,RR,...,Political Sc.,Chief Secretary to Government of Mizoram,Secretary,N.A.,Secretariat Department,Aizawl (Mizoram),Active,2021-02-11 00:00:00,3.0,Level 17
4,8200,Shri Atal Dulloo,Male,1966-10-24,57,Jammu & Kashmir,1989-08-21,1989,34,RR,...,,Chief Secretary,Secretary,N.A.,Secretariat Department,Jammu & Kashmir,Active,2023-01-12 00:00:00,1.0,Level 17


In [159]:
data.columns

Index(['ID', 'Name', 'Gender', 'DOB', 'Age', 'Domicile', 'Date of Appointment',
       'Allotment Year', 'Service Tenure', 'Source of Recruitment', 'Cadre',
       'Qual 1', 'Qual 2', 'Qual Type', 'Spec 1', 'Spec 2', 'Current Post',
       'Post Type', 'Department', 'Department Type', 'Location', 'Job Status',
       'With Effect From Date', 'Current Tenure', 'Pay Level'],
      dtype='object')

Removing single instance of target variable having only one occurence of that class

In [161]:
ind = data[data['Pay Level'] == 'Level 18'].index
data = data.drop(index = ind)

Classifying Infrequent location values to 'other'

In [162]:

loc = (data['Location'].value_counts())
c = loc > 100
loc = loc[c].index

data['Location'] = data['Location'].apply(lambda x: 'Other' if x not in loc else x)


Removing Irrelevant Features like ID, Name, DOB, etc.

In [163]:
data = data.drop(columns = ['ID', 'Name', 'DOB', 'Qual 2', 'Date of Appointment', 'With Effect From Date', 'Qual 2', 'Qual 1', 'Spec 1', 'Spec 2', 'Current Post', 'Department'])
data

Unnamed: 0,Gender,Age,Domicile,Allotment Year,Service Tenure,Source of Recruitment,Cadre,Qual Type,Post Type,Department Type,Location,Job Status,Current Tenure,Pay Level
0,Male,59,Uttar Pradesh,1987,36,RR,UT,Post Graduate,Secretary,Secretariat Department,New Delhi,Active,2.0,Level 17
1,Male,60,Delhi,1987,36,RR,UT,Post Graduate,Secretary,Secretariat Department,Other,Active,2.0,Level 17
2,Male,58,Andhra Pradesh,1988,35,RR,UT,Post Graduate,Commissioner,Other Department,New Delhi,Active,3.0,Level 17
3,Female,59,Delhi,1988,35,RR,UT,Doctorate,Secretary,Secretariat Department,Other,Active,3.0,Level 17
4,Male,57,Jammu & Kashmir,1989,34,RR,UT,Graduate,Secretary,Secretariat Department,Other,Active,1.0,Level 17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5536,Male,32,Rajasthan,2023,1,RR,WB,Graduate,Trainee,Other Department,Mussorrie,Inactive,0.0,Level 10
5537,Female,29,Rajasthan,2023,1,RR,WB,Post Graduate,Trainee,Other Department,Mussorrie,Inactive,0.0,Level 10
5538,Female,25,Punjab,2023,1,RR,WB,Graduate,Trainee,Other Department,Mussorrie,Inactive,0.0,Level 10
5539,Male,30,Telangana,2023,1,RR,WB,Post Graduate,Trainee,Other Department,Mussorrie,Inactive,0.0,Level 10


Encoding Ordinal Variables with LabelEncoder



In [164]:

le = LabelEncoder()
data['Gender'] = le.fit_transform(data['Gender'])
data['Allotment Year'] = le.fit_transform(data['Allotment Year'])
data['Source of Recruitment'] = le.fit_transform(data['Source of Recruitment'])
data['Qual Type'] = le.fit_transform(data['Qual Type'])
data['Job Status'] = le.fit_transform(data['Job Status'])
data['Post Type'] = le.fit_transform(data['Post Type'])


Encoding Target Variable with LabelEncoder

In [165]:
le2 =  LabelEncoder()
data['Pay Level'] = le2.fit_transform(data['Pay Level'])

pay_levels = le2.inverse_transform(np.arange(len(le2.classes_)))
pay_levels

array(['Level 10', 'Level 11', 'Level 12', 'Level 13', 'Level 14',
       'Level 15', 'Level 17'], dtype=object)

Applying One Hot Encoding of Nominal Variables

In [167]:
one_hot_encoded_data = pd.get_dummies(data, columns = ['Domicile', 'Location', 'Cadre', 'Department Type'])
data = one_hot_encoded_data.copy()

Scaling Numerical Variables with MinMaxScaler

In [168]:
scaler = MinMaxScaler()
scaler.fit(data[['Age', 'Service Tenure', 'Current Tenure']])
scaled_data = scaler.transform(data[['Age', 'Service Tenure', 'Current Tenure']])
data[scaler.get_feature_names_out()] = scaled_data

Saving Engineered Data to a csv file

In [169]:
data.to_csv('Feature_Engineered_Data.csv', index = False)

Separating Independent and Dependent Variables

In [170]:
X = data.drop(columns = 'Pay Level')
y = data['Pay Level']

Applying PCA for Feature Extraction

In [171]:
from sklearn.decomposition import PCA

pca = PCA()
pca.fit(X)

Visualizing Explained Variance Ratios of Principal Components

In [172]:
fig = px.line(y = pca.explained_variance_ratio_, x = np.arange(len(pca.explained_variance_ratio_)), markers=True,
              labels = {'x': 'Number of Principal Components', 'y': 'Explained Variance Ratio' },
              title = 'Scree Plot of PCA')

fig.update_layout(title_x=0.5)

fig.show()

In [173]:
pca = PCA(n_components = 0.99)
pca_data = pca.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(pca_data, y, test_size=0.25, random_state=42)


Showing Confusion Matrix of each Classifier

In [174]:
def show_conf_mat(conf_mat, model_name):

    conf_mat = pd.DataFrame(conf_mat, columns = pay_levels, index = pay_levels)

    fig = px.imshow(conf_mat, text_auto=True, height = 500,
                   title = f'Confusion Matrix of {model_name}',
                   labels=dict(x="Pay Level (Predicted)", y="Pay Level (Actual)"),
                   color_continuous_scale='Viridis')

    fig.update_layout(title_x=0.5)
    fig.show()


Applying GridSearchCV for hyperparameter tuning

In [193]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


def get_logistic(X_train, y_train):

    log_reg = LogisticRegression(random_state = 42)
    params = [{'solver': ('lbfgs', 'newton-cg'), 'multi_class': ('multinomial', 'auto')}]
    log_reg_clf = GridSearchCV(log_reg, params, cv = 5, scoring='accuracy')

    log_reg_clf.fit(X_train, y_train)
    params = log_reg_clf.best_params_

    return log_reg_clf, params


def get_svc(X_train, y_train):

    svc = LinearSVC()
    params = [{'multi_class': ['crammer_singer', 'ovr'], 'tol': [1e-4, 1e-5, 1e-6]}]
    svc_clf = GridSearchCV(svc, params, cv = 5, scoring='accuracy')

    svc_clf.fit(X_train, y_train)
    params = svc_clf.best_params_
    return svc_clf, params


def get_rf(X_train, y_train):

    rf = RandomForestClassifier()
    params = [{'n_estimators': [100, 200], 'max_features': ['sqrt', 'log2'], 'max_depth': np.arange(7,14)}]
    rf_clf = GridSearchCV(rf, params, cv = 5, scoring='accuracy')

    rf_clf.fit(X_train, y_train)
    params = rf_clf.best_params_

    return rf_clf, params


def get_knn(X_train, y_train):

    knn = KNeighborsClassifier()
    params = [{'n_neighbors': range(3, 10, 2), 'metric': ['minkowski', 'euclidean']}]
    knn_clf = GridSearchCV(knn, params, cv = 5, scoring='accuracy')

    knn_clf.fit(X_train, y_train)
    params = knn_clf.best_params_
    return knn_clf, params



* Model Training with best params of each Classifier  

* Model Evaluation by Classification Report and COnfusion Matrix

1. Logistic Regression

In [190]:
from sklearn.linear_model import LogisticRegression

log_reg, log_reg_params = get_logistic(X_train, y_train)

print(f'Best Parameters for Logistic Regression: {log_reg_params}')

log_reg.fit(X_train, y_train)

y_pred_log_reg = log_reg.predict(X_test)

print('\nClassification Report: \n\n', classification_report(y_test, y_pred_log_reg))


Best Parameters for Logistic Regression: {'multi_class': 'multinomial', 'solver': 'newton-cg'}

Classification Report: 

               precision    recall  f1-score   support

           0       0.93      0.87      0.90       165
           1       0.81      0.78      0.80       297
           2       0.63      0.77      0.69       311
           3       0.61      0.48      0.53       222
           4       0.83      0.83      0.83       211
           5       0.85      0.90      0.87        87
           6       0.92      0.91      0.92        92

    accuracy                           0.76      1385
   macro avg       0.80      0.79      0.79      1385
weighted avg       0.77      0.76      0.76      1385



In [191]:
from sklearn.metrics import confusion_matrix

conf_mat_log_reg = confusion_matrix(y_test, y_pred_log_reg)
show_conf_mat(conf_mat_log_reg, 'Logistic Regression')

2. Random Forest Classifier

In [194]:
from sklearn.ensemble import RandomForestClassifier

rf_clf, rf_clf_params = get_rf(X_train, y_train)

print(f'Best Parameters for Logistic Regression: {rf_clf_params}')

rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

print('\nClassification Report: \n\n', classification_report(y_test, y_pred_rf))


Best Parameters for Logistic Regression: {'max_depth': 13, 'max_features': 'sqrt', 'n_estimators': 100}

Classification Report: 

               precision    recall  f1-score   support

           0       0.94      0.86      0.90       165
           1       0.86      0.87      0.86       297
           2       0.77      0.84      0.80       311
           3       0.72      0.65      0.69       222
           4       0.85      0.84      0.85       211
           5       0.86      0.93      0.90        87
           6       0.93      0.89      0.91        92

    accuracy                           0.83      1385
   macro avg       0.85      0.84      0.84      1385
weighted avg       0.83      0.83      0.83      1385



In [195]:
conf_mat_rf = confusion_matrix(y_test, y_pred_rf)
show_conf_mat(conf_mat_rf, 'Random Forest Classifier')

3. K Nearest Neighbors Classifier

In [196]:
from sklearn.neighbors import KNeighborsClassifier

knn, knn_params = get_knn(X_train, y_train)
print(f'Best Parameters for Logistic Regression: {knn_params}')

knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)
print('\nClassification Report: \n\n', classification_report(y_test, y_pred_knn))


Best Parameters for Logistic Regression: {'metric': 'minkowski', 'n_neighbors': 5}

Classification Report: 

               precision    recall  f1-score   support

           0       0.87      0.94      0.90       165
           1       0.82      0.82      0.82       297
           2       0.77      0.79      0.78       311
           3       0.72      0.69      0.70       222
           4       0.89      0.81      0.85       211
           5       0.84      0.93      0.89        87
           6       0.94      0.88      0.91        92

    accuracy                           0.82      1385
   macro avg       0.84      0.84      0.84      1385
weighted avg       0.82      0.82      0.82      1385



In [197]:
conf_mat_knn = confusion_matrix(y_test, y_pred_knn)
show_conf_mat(conf_mat_knn, 'K Nearest Neighbors Classifier')

4. Support Vector Classifier

In [198]:
from sklearn.svm import LinearSVC

svc, svc_params = get_svc(X_train, y_train)
print(f'Best Parameters for Logistic Regression: {svc_params}')

svc.fit(X_train, y_train)

y_pred_svc = svc.predict(X_test)
print('\nClassification Report: \n\n', classification_report(y_test, y_pred_svc))


Best Parameters for Logistic Regression: {'multi_class': 'crammer_singer', 'tol': 1e-05}

Classification Report: 

               precision    recall  f1-score   support

           0       0.89      0.86      0.88       165
           1       0.76      0.81      0.79       297
           2       0.64      0.78      0.71       311
           3       0.62      0.30      0.40       222
           4       0.77      0.87      0.82       211
           5       0.83      0.91      0.87        87
           6       0.92      0.91      0.92        92

    accuracy                           0.75      1385
   macro avg       0.78      0.78      0.77      1385
weighted avg       0.75      0.75      0.74      1385



In [199]:
conf_mat_svc = confusion_matrix(y_test, y_pred_svc)
show_conf_mat(conf_mat_svc, 'Support Vector Classifier')