In [1]:
!pip install -q mord
!pip install -q catboost
!pip install -q xgboost
!pip install -q lightgbm

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for mord (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import plotly.express as px
from sklearn.metrics import classification_report

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.base import BaseEstimator, TransformerMixin

from warnings import filterwarnings
filterwarnings('ignore')

Loading Cleaned Data

In [43]:
data = pd.read_csv('IAS_Cleaned.csv', keep_default_na=False)
data.head()

Unnamed: 0,ID,Name,Gender,DOB,Age,Domicile,Date of Appointment,Allotment Year,Service Tenure,Source of Recruitment,...,Spec 2,Current Post,Post Type,Department,Department Type,Location,Job Status,With Effect From Date,Current Tenure,Pay Level
0,7600,Shri Pradip Kumar Tripathi,Male,1964-06-18,59,Uttar Pradesh,1987-08-24,1987,36,RR,...,Structural Engg,Secretary (Coordination),Secretary,Cabinet Secretariat,Secretariat Department,New Delhi,Active,2022-02-05 00:00:00,2.0,Level 17
1,18400,Shri Naresh Kumar,Male,1963-11-18,60,Delhi,1987-08-24,1987,36,RR,...,Public Admn,Chief Secretary,Secretary,N.A.,Secretariat Department,Delhi,Active,2022-04-21 00:00:00,2.0,Level 17
2,18800,Shri Chetan Bhushan Sanghi,Male,1965-06-26,58,Andhra Pradesh,1988-08-25,1988,35,RR,...,Hons,Financial Commissioner,Commissioner,Govt. of National capital Territory of Delhi (...,Other Department,New Delhi,Active,2021-01-04 00:00:00,3.0,Level 17
3,19002,Dr.(Ms.) Renu Sharma,Female,1964-10-19,59,Delhi,1988-08-25,1988,35,RR,...,Political Sc.,Chief Secretary to Government of Mizoram,Secretary,N.A.,Secretariat Department,Aizawl (Mizoram),Active,2021-02-11 00:00:00,3.0,Level 17
4,8200,Shri Atal Dulloo,Male,1966-10-24,57,Jammu & Kashmir,1989-08-21,1989,34,RR,...,,Chief Secretary,Secretary,N.A.,Secretariat Department,Jammu & Kashmir,Active,2023-01-12 00:00:00,1.0,Level 17


In [44]:
data.columns

Index(['ID', 'Name', 'Gender', 'DOB', 'Age', 'Domicile', 'Date of Appointment',
       'Allotment Year', 'Service Tenure', 'Source of Recruitment', 'Cadre',
       'Qual 1', 'Qual 2', 'Qual Type', 'Spec 1', 'Spec 2', 'Current Post',
       'Post Type', 'Department', 'Department Type', 'Location', 'Job Status',
       'With Effect From Date', 'Current Tenure', 'Pay Level'],
      dtype='object')

Removing single instance of target variable having only one occurence of that class

In [45]:
ind = data[data['Pay Level'] == 'Level 18'].index
data = data.drop(index = ind)

Classifying Infrequent location values to 'other'

In [46]:

loc = (data['Location'].value_counts())
c = loc > 100
loc = loc[c].index

data['Location'] = data['Location'].apply(lambda x: 'Other' if x not in loc else x)


Removing Irrelevant Features like ID, Name, DOB, etc.

In [49]:
# Define pipeline from earlier (without classifier)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif

scale_cols = ['Age', 'Service Tenure', 'Current Tenure']
ordinal_cols = ['Source of Recruitment', 'Qual Type', 'Job Status', 'Post Type']
onehot_cols = ['Domicile', 'Location', 'Cadre', 'Department Type', 'Gender', 'Allotment Year']

preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), scale_cols),
    ('ordinal', OrdinalEncoder(), ordinal_cols),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), onehot_cols)
])

transform_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('feature_selection', SelectKBest(score_func=mutual_info_classif, k=30))
])


In [50]:
X = data.drop(columns = 'Pay Level')
y = data['Pay Level']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_processed = transform_pipeline.fit_transform(X_train, y_train)
X_test_processed = transform_pipeline.transform(X_test)


In [51]:
from joblib import dump
dump(transform_pipeline, 'preprocessor_pipeline.joblib')

['preprocessor_pipeline.joblib']

Encoding Target Variable with LabelEncoder

In [52]:
from sklearn.utils.class_weight import compute_class_weight

le2 =  LabelEncoder()
y_train = le2.fit_transform(y_train)
y_test = le2.transform(y_test)

classes = np.unique(y_train)

weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

pay_levels = le2.inverse_transform(np.arange(len(le2.classes_)))
pay_levels

array(['Level 10', 'Level 11', 'Level 12', 'Level 13', 'Level 14',
       'Level 15', 'Level 17'], dtype=object)

In [53]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, mean_absolute_error, cohen_kappa_score, accuracy_score, precision_score, recall_score
from mord import LogisticAT
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Custom scorers for weighted metrics
scoring = {
    'weighted_accuracy': make_scorer(accuracy_score),
    'weighted_precision': make_scorer(precision_score, average='weighted', zero_division=0),
    'weighted_recall': make_scorer(recall_score, average='weighted', zero_division=0),
    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
    'QWK': make_scorer(cohen_kappa_score, weights='quadratic')
}


In [54]:
# Define models and their hyperparameter grids
models = {
    'LogisticAT': {
        'model': LogisticAT(),
        'params': {'alpha': [0.5, 1.0, 1.5]}
    },
    'LGBM': {
        'model': LGBMClassifier(class_weight=class_weights, verbose=-1),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [5, 10],
            'learning_rate': [0.01, 0.1]
        }
    },
    'XGBoost': {
        'model': XGBClassifier(scale_pos_weight=class_weights, eval_metric='mlogloss'),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [3, 5],
            'learning_rate': [0.01, 0.1]
        }
    },
    'CatBoost': {
        'model': CatBoostClassifier(class_weights=class_weights, verbose=0),
        'params': {
            'iterations': [100, 200],
            'depth': [4, 6],
            'learning_rate': [0.01, 0.1]
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(class_weight=class_weights),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [5, 10]
        }
    },
    'LinearSVC': {
        'model': LinearSVC(class_weight=class_weights),
        'params': {
            'C': [0.1, 1.0, 10.0],
            'max_iter': [1000, 2000]
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [5, 9, 11],
            'weights': ['uniform', 'distance']
        }
    }
}

# 5-Fold CV with StratifiedKFold (preserves class distribution)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [59]:
# Store best models

from time import perf_counter
t1 = perf_counter()
best_models = {}

for name, config in models.items():
    grid_search = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=cv,
        scoring=scoring,
        refit='weighted_accuracy',  # Primary metric for selection
        n_jobs=-1,
        verbose=0,
    )

    grid_search.fit(X_train_processed, y_train)
    best_models[name] = {'model': grid_search.best_estimator_, 'params': grid_search.best_params_}

t2 = perf_counter()
print(f"Time taken: {(t2-t1) / 60} minutes")


Time taken:  2.261548 minutes


In [68]:
# Evaluate all models on test set
results = []

for name, model in best_models.items():
    y_pred = model['model'].predict(X_test_processed)

    # Compute metrics
    metrics = {
        'Model': name,
        'Weighted Accuracy': accuracy_score(y_test, y_pred),
        'Weighted Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'Weighted Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'MAE': mean_absolute_error(y_test, y_pred),
        'QWK': cohen_kappa_score(y_test, y_pred, weights='quadratic')
    }
    results.append(metrics)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=['QWK', 'Weighted Accuracy', 'MAE'], ascending=[False, False, True])
results_df = results_df.reset_index(drop=True)
results_df

Unnamed: 0,Model,Weighted Accuracy,Weighted Precision,Weighted Recall,MAE,QWK
0,XGBoost,0.84296,0.846054,0.84296,0.169675,0.966049
1,LGBM,0.838448,0.842632,0.838448,0.173285,0.966048
2,CatBoost,0.836643,0.841045,0.836643,0.17509,0.965224
3,RandomForest,0.8213,0.833491,0.8213,0.192238,0.96139
4,LogisticAT,0.785199,0.791828,0.785199,0.223827,0.957258
5,KNN,0.772563,0.776312,0.772563,0.25722,0.943796
6,LinearSVC,0.717509,0.728625,0.717509,0.34296,0.920918


In [69]:
# Select best model based on weighted metrics
best_model_name = results_df.iloc[0]['Model']
best_model = best_models[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"QWK: {results_df.iloc[0]['QWK']:.4f}")
print(f"Weighted Accuracy: {results_df.iloc[0]['Weighted Accuracy']:.4f}")
print(f"MAE: {results_df.iloc[0]['MAE']:.4f}")

Best Model: XGBoost
QWK: 0.9660
Weighted Accuracy: 0.8430
MAE: 0.1697


In [157]:
# Map models to numeric IDs
model_to_id = {model: i for i, model in enumerate(results_df['Model'].unique())}
results_df['Model Name'] = results_df['Model'].map(model_to_id)

# Create parallel coordinates plot
fig = px.parallel_coordinates(
    results_df,
    dimensions=['Weighted Accuracy', 'Weighted Precision', 'Weighted Recall', 'MAE', 'QWK'],
    color='Model Name',
    color_continuous_scale=px.colors.qualitative.Safe,  # Colorblind-friendly palette
    title = 'Parellel Coordinates Plot for Model Comparison'
)

# Invert MAE axis
fig.update_traces(
    dimensions=[dict(
        label=dim.label,
        values=dim.values,
        range=[max(dim.values), min(dim.values)] if dim.label == 'MAE' else None
    ) for dim in fig.data[0].dimensions]
)

# Update color bar with model names
fig.update_coloraxes(
    colorbar=dict(
        tickvals=list(model_to_id.values()),
        ticktext=list(model_to_id.keys()),
    )
)

fig.update_layout(title_x=0.5)

fig.data[0].line.colorbar.thickness = 50

fig.show()

In [158]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

y_test_pred = best_model['model'].predict(X_test_processed)
classes = le2.inverse_transform(np.unique(y_test))

# Compute confusion matrix

cm = confusion_matrix(y_test, y_test_pred)

fig = px.imshow(cm,
                labels=dict(x="Predicted values", y="Actual values", color="Value Counts"),
                x=[i + '' for i in classes],
                y=[i + '  ' for i in classes],
                text_auto=True,
                color_continuous_scale='Blues',
                title = 'Confusion Matrix')

fig.update_traces(text=cm, texttemplate="%{text}")
fig.update_layout(title_x=0.5, height=600, width=800)

fig.show()