In [27]:
import pandas as pd
from xgboost import XGBRegressor
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [28]:
# Load data/a2017-2020.csv; index_col=0 makes the first CSV column the row index.
tnea_df = pd.read_csv('data/a2017-2020.csv', index_col=0)
# Preview the first 20 rows.
tnea_df.head(20)

Unnamed: 0_level_0,College_Code,College_Name,Branch_Code,Branch_Name,OC,BC,BCM,MBC,SC,SCA,ST
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",AI,Agricultural and Irrigation Engg.(SS),196.25,195.25,193.25,194.25,188.75,185.75,175.25
2017,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",BY,Bio Medical Engg(SS),198.25,197.5,197.25,196.5,193.0,191.75,188.0
2017,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",CE,Civil Engineering,198.5,198.0,197.5,197.5,196.0,193.5,196.0
2017,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",CM,Computer Science and Engg.(SS),199.0,198.75,199.0,197.75,193.5,190.5,193.33
2017,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",CS,Computer Science and Engg.,199.75,199.5,199.5,199.0,197.5,196.25,193.75
2017,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",EC,Electronics and Comm Engg.,200.0,199.75,199.75,199.25,198.25,197.0,190.25
2017,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",EE,Electrical and Elec. Engg.,199.25,199.0,198.5,198.75,196.5,197.0,193.25
2017,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",EM,Electronics and Comm Engg(SS),199.25,199.0,198.0,198.5,195.5,194.5,189.0
2017,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",GI,Geo-Informatics,196.75,196.0,194.0,194.75,188.75,186.25,
2017,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",IE,Industrial Engineering,196.5,196.0,195.5,194.25,186.75,184.0,


In [29]:
# Dimensions of the loaded frame as (rows, columns).
tnea_df.shape


(240, 11)

In [30]:
# Count missing values per column.
tnea_df.isna().sum()


College_Code      0
College_Name      0
Branch_Code       0
Branch_Name       0
OC                0
BC               13
BCM              22
MBC              12
SC               12
SCA              37
ST              122
dtype: int64

In [31]:
# Inspect the dtype pandas inferred for each column.
tnea_df.dtypes

College_Code      int64
College_Name     object
Branch_Code      object
Branch_Name      object
OC              float64
BC              float64
BCM             float64
MBC             float64
SC              float64
SCA             float64
ST              float64
dtype: object

In [32]:

# Frequency of each distinct College_Name string.
# NOTE(review): the output lists what appears to be the same campus under several
# spellings (upper-case vs. mixed-case addresses); each spelling is counted separately.
tnea_df.College_Name.value_counts()


UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHENNAI - CEG CAMPUS,SARDAR PATEL ROAD, GUINDY,CHENNAI DIST, PIN - 600025.             36
University Departments of Anna University, Chennai - CEG Campus, Sardar Patel Road, Guindy, Chennai 600 025                       34
UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHENNAI - MIT CAMPUS,CHROMPET,,KANCHEEPURAM DIST, PIN - 600044.                        26
UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHENNAI - ACT CAMPUS,SARDAR PATEL ROAD, GUINDY,CHENNAI DIST, PIN - 600025.             22
University Departments of Anna University, Chennai - ACT Campus, Sardar Patel Road, Guindy, Chennai 600 025                       22
University Departments of Anna University, Chennai - MIT Campus, Chrompet, Tambaram Taluk, Kancheepuram District 600 044          12
Annamalai University Faculty of Engineering and Technology,Annamalai nagar,Cuddalore 608002                                       12
University Departments of Anna University, Chennai - MIT Campus, Chro

In [33]:
# Frequency of each Branch_Code value.
tnea_df.Branch_Code.value_counts()


CS    22
EC    22
ME    12
CE    10
EE    10
XM    10
IT     8
CM     8
EM     8
IM     8
EI     6
XC     6
CH     6
MI     4
MN     4
RP     4
PR     4
MF     4
GI     4
AU     4
AS     4
AE     4
TX     4
PP     4
PM     4
LE     4
IS     4
IB     4
FS     4
BY     4
CL     4
AP     4
IE     4
MA     4
PT     4
CR     4
BD     2
CZ     2
AI     2
AM     2
EX     2
MM     2
Name: Branch_Code, dtype: int64


# Encode the ordinal categorical variable 'Branch_Code'.
# 'MM' and 'EX' are assumed to share rank 0 — TODO confirm the intended ranks.
Branch_Code_mapping = {'MM': 0, 'EX': 0, 'CR': 1, 'CH': 2, 'IM': 3, 'XM': 4, 'ME': 5, 'CS': 6}

# Series.map turns every value that has no entry in the mapping into NaN, so the
# encoding is applied only when the mapping covers every code present in the column.
unmapped_codes = sorted(set(tnea_df.Branch_Code.dropna().unique()) - set(Branch_Code_mapping))
if unmapped_codes:
    print("Branch_Code left unchanged; no ordinal rank defined for:", unmapped_codes)
else:
    tnea_df.Branch_Code = tnea_df.Branch_Code.map(Branch_Code_mapping)

In [34]:
# Frequency of each distinct Branch_Name string.
tnea_df.Branch_Name.value_counts()

Computer Science and Engineering             10
Electronics and Communication Engineering    10
Mechanical Engineering                        9
Civil Engineering                             8
Mechanical Engineering (Tamil Medium)         7
                                             ..
INDUSTRIAL BIO-TECHNOLOGY(SS)                 1
LEATHER TECHNOLOGY                            1
PHARMACEUTICAL TECHNOLOGY(SS)                 1
PETROLEUM ENGINEERING AND TECH(SS)            1
Mechanical (Manufacturing)                    1
Name: Branch_Name, Length: 99, dtype: int64

In [35]:
# Convert every string-typed column into an ordered pandas categorical.
# NOTE(review): CSV files do not store the categorical dtype, so this conversion is
# presumably lost when the frame is written and re-read further down — confirm it is needed.
string_columns = [
    name for name, series in tnea_df.items()
    if pd.api.types.is_string_dtype(series)
]
for name in string_columns:
    as_category = tnea_df[name].astype("category")
    tnea_df[name] = as_category.cat.as_ordered()

In [36]:
# Re-check column dtypes after the categorical conversion.
tnea_df.dtypes

College_Code       int64
College_Name    category
Branch_Code     category
Branch_Name     category
OC               float64
BC               float64
BCM              float64
MBC              float64
SC               float64
SCA              float64
ST               float64
dtype: object

In [37]:
# List the distinct categories of College_Name.
tnea_df.College_Name.cat.categories

Index(['Annamalai University Faculty of Engineering and Technology,Annamalai nagar,Cuddalore',
       'Annamalai University Faculty of Engineering and Technology,Annamalai nagar,Cuddalore 608002',
       'UNIVERSITY COLLEGE OF ENGINEERING, ARNI,ARNI TO DEVIKAPURAM ROAD, THATCHUR, ARNI,THIRUVANNAMALAI DIST, PIN - 632326.',
       'UNIVERSITY COLLEGE OF ENGINEERING, KANCHIPURAM,NH-4, CHENNAI BANGALORE HIGHWAY, PONNERIKKARAI,KANCHEEPURAM DIST, PIN - 631552.',
       'UNIVERSITY COLLEGE OF ENGINEERING, TINDIVANAM,IYYANTHOPE (POST) , MELPAKKAM (VILLAGE),VILLUPURAM DIST, PIN - 604001.',
       'UNIVERSITY COLLEGE OF ENGINEERING, VILLUPURAM,VILLUPURAM (POST) , KAKUPPAM (VILLAGE),VILLUPURAM DIST, PIN - 605103.',
       'UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHENNAI - ACT CAMPUS,SARDAR PATEL ROAD, GUINDY,CHENNAI DIST, PIN - 600025.',
       'UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHENNAI - CEG CAMPUS,SARDAR PATEL ROAD, GUINDY,CHENNAI DIST, PIN - 600025.',
       'UNIVERSITY DEPARTMENTS

In [39]:
# List the distinct categories of Branch_Code.
tnea_df.Branch_Code.cat.categories

Index(['AE', 'AI', 'AM', 'AP', 'AS', 'AU', 'BD', 'BY', 'CE', 'CH', 'CL', 'CM',
       'CR', 'CS', 'CZ', 'EC', 'EE', 'EI', 'EM', 'EX', 'FS', 'GI', 'IB', 'IE',
       'IM', 'IS', 'IT', 'LE', 'MA', 'ME', 'MF', 'MI', 'MM', 'MN', 'PM', 'PP',
       'PR', 'PT', 'RP', 'TX', 'XC', 'XM'],
      dtype='object')

In [40]:
# List the distinct categories of Branch_Name.
tnea_df.Branch_Name.cat.categories

Index(['AERONAUTICAL ENGINEERING', 'AGRICULTURAL AND IRRIGATION ENGG.(SS)',
       'APPAREL TECHNOLOGY(SS)', 'AUTOMOBILE ENGINEERING',
       'AUTOMOBILE ENGINEERING (SS)', 'Aeronautical Engineering',
       'Agricultural and Irrigation Engg.(SS)', 'Apparel Technology (SS)',
       'Apparel Technology(SS)', 'Automobile Engineering',
       'Automobile Engineering (SS)', 'BIO MEDICAL ENGG(SS)',
       'Bio Medical Engg(SS)', 'Bio- Medical Engineering (SS)',
       'CERAMIC TECHNOLOGY(SS)', 'CHEMICAL ENGINEERING',
       'CHEMICAL ENGINEERING(SS)', 'CIVIL ENGINEERING',
       'CIVIL ENGINEERING - TAMIL MEDIUM', 'COMPUTER SCIENCE AND ENGG.',
       'COMPUTER SCIENCE AND ENGG.(SS)', 'Ceramic Technology (SS)',
       'Ceramic Technology(SS)', 'Chemical Engineering',
       'Chemical Engineering (SS)', 'Civil Engineering',
       'Civil Engineering (Tamil Medium)', 'Civil and Structural Engineering',
       'Computer Science and Engg.', 'Computer Science and Engg.(SS)',
       'Computer Scie

In [42]:
# Write the frame to a temporary CSV; index=False leaves the row index out of the file.
# NOTE(review): the index appears to hold the Year (see the head() output above), so
# that information is dropped here — confirm this is intended.
tnea_df.to_csv("data/atrain_tmp.csv",index=False)

In [43]:
# Reload the temporary CSV; no index column is specified, so rows get a default integer index.
tnea_df = pd.read_csv("data/atrain_tmp.csv",low_memory=False)
# Transposed preview of the first five rows (one column per row of data).
tnea_df.head().T

Unnamed: 0,0,1,2,3,4
College_Code,1,1,1,1,1
College_Name,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...","UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...","UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...","UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...","UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE..."
Branch_Code,AI,BY,CE,CM,CS
Branch_Name,Agricultural and Irrigation Engg.(SS),Bio Medical Engg(SS),Civil Engineering,Computer Science and Engg.(SS),Computer Science and Engg.
OC,196.25,198.25,198.5,199.0,199.75
BC,195.25,197.5,198.0,198.75,199.5
BCM,193.25,197.25,197.5,199.0,199.5
MBC,194.25,196.5,197.5,197.75,199.0
SC,188.75,193.0,196.0,193.5,197.5
SCA,185.75,191.75,193.5,190.5,196.25


In [44]:
# Missing-value counts per column after reloading from the temporary CSV.
tnea_df.isna().sum()

College_Code      0
College_Name      0
Branch_Code       0
Branch_Name       0
OC                0
BC               13
BCM              22
MBC              12
SC               12
SCA              37
ST              122
dtype: int64

In [45]:
# For each numeric column that has gaps: record where the gaps were in a boolean
# "<column>_is_missing" column, then fill the gaps with that column's median.
numeric_with_gaps = [
    name for name in tnea_df.columns
    if pd.api.types.is_numeric_dtype(tnea_df[name]) and tnea_df[name].isnull().sum()
]
for name in numeric_with_gaps:
    values = tnea_df[name]
    tnea_df[name + "_is_missing"] = values.isnull()
    tnea_df[name] = values.fillna(values.median())

In [46]:
# Display the frame after imputation (the output below is truncated by pandas).
tnea_df

Unnamed: 0,College_Code,College_Name,Branch_Code,Branch_Name,OC,BC,BCM,MBC,SC,SCA,ST,BC_is_missing,BCM_is_missing,MBC_is_missing,SC_is_missing,SCA_is_missing,ST_is_missing
0,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",AI,Agricultural and Irrigation Engg.(SS),196.25,195.25,193.250,194.250,188.75,185.75,175.25,False,False,False,False,False,False
1,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",BY,Bio Medical Engg(SS),198.25,197.50,197.250,196.500,193.00,191.75,188.00,False,False,False,False,False,False
2,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",CE,Civil Engineering,198.50,198.00,197.500,197.500,196.00,193.50,196.00,False,False,False,False,False,False
3,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",CM,Computer Science and Engg.(SS),199.00,198.75,199.000,197.750,193.50,190.50,193.33,False,False,False,False,False,False
4,1,"UNIVERSITY DEPARTMENTS OF ANNA UNIVERSITY, CHE...",CS,Computer Science and Engg.,199.75,199.50,199.500,199.000,197.50,196.25,193.75,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,1013,"University College of Engineering, Villupuram,...",ME,Mechanical Engineering,107.50,89.50,186.375,83.000,80.00,85.50,167.00,False,True,False,False,False,True
236,1013,"University College of Engineering, Villupuram,...",XM,Mechanical Engineering (Tamil Medium),113.50,188.50,186.375,184.625,172.50,161.75,167.00,True,True,True,True,True,True
237,1014,"University College of Engineering, Tindivanam,...",CE,Civil Engineering,80.00,188.50,186.375,184.625,172.50,161.75,167.00,True,True,True,True,True,True
238,1014,"University College of Engineering, Tindivanam,...",CS,Computer Science and Engineering,143.50,112.00,82.000,114.500,103.00,161.75,167.00,False,False,False,False,True,True


In [47]:
# Dimensions after the "_is_missing" indicator columns were added.
tnea_df.shape

(240, 17)

In [48]:
# Verify that no missing values remain in any column.
tnea_df.isna().sum()

College_Code      0
College_Name      0
Branch_Code       0
Branch_Name       0
OC                0
BC                0
BCM               0
MBC               0
SC                0
SCA               0
ST                0
BC_is_missing     0
BCM_is_missing    0
MBC_is_missing    0
SC_is_missing     0
SCA_is_missing    0
ST_is_missing     0
dtype: int64

In [50]:
# Print the name of every column that pandas does not consider numeric.
for name in tnea_df.columns:
    if pd.api.types.is_numeric_dtype(tnea_df[name]):
        continue
    print(name)

College_Name
Branch_Code
Branch_Name


In [52]:
# Replace each non-numeric column with integer category codes and add a boolean
# "<column>_is_missing" column. pandas uses code -1 for missing values, so the +1
# shifts missing to 0 and real categories to 1..n.
text_columns = [
    name for name in tnea_df.columns
    if not pd.api.types.is_numeric_dtype(tnea_df[name])
]
for name in text_columns:
    values = tnea_df[name]
    tnea_df[name + "_is_missing"] = values.isnull()
    tnea_df[name] = pd.Categorical(values).codes + 1

In [53]:
# Show 1-based category codes for College_Name. The column was already replaced by
# integer codes in the previous cell, so this encodes those integers, not the original strings.
pd.Categorical(tnea_df["College_Name"]).codes+1

array([ 8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  6,  6,  6,  6,  6,  5,  5,  5,  5,
        5,  3,  3,  3,  3,  3,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,
        7,  7,  7,  7,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  3,  3,  3,  3,  3,  4,  4,
        4, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
       13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 15, 15, 15, 15, 15,
       15, 15, 15, 15, 15, 15, 15,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2, 11, 11, 11, 11, 11, 10, 10, 10, 13, 13, 13, 13, 13, 13, 13,
       13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12,
       12, 12, 12, 12, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  1,
        1,  1,  1,  1,  1

In [54]:
# Summarise column dtypes and non-null counts after encoding.
tnea_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   College_Code             240 non-null    int64  
 1   College_Name             240 non-null    int8   
 2   Branch_Code              240 non-null    int8   
 3   Branch_Name              240 non-null    int8   
 4   OC                       240 non-null    float64
 5   BC                       240 non-null    float64
 6   BCM                      240 non-null    float64
 7   MBC                      240 non-null    float64
 8   SC                       240 non-null    float64
 9   SCA                      240 non-null    float64
 10  ST                       240 non-null    float64
 11  BC_is_missing            240 non-null    bool   
 12  BCM_is_missing           240 non-null    bool   
 13  MBC_is_missing           240 non-null    bool   
 14  SC_is_missing            2

In [55]:
# Display the encoded frame (the output below is truncated by pandas).
tnea_df

Unnamed: 0,College_Code,College_Name,Branch_Code,Branch_Name,OC,BC,BCM,MBC,SC,SCA,ST,BC_is_missing,BCM_is_missing,MBC_is_missing,SC_is_missing,SCA_is_missing,ST_is_missing,College_Name_is_missing,Branch_Code_is_missing,Branch_Name_is_missing
0,1,8,2,7,196.25,195.25,193.250,194.250,188.75,185.75,175.25,False,False,False,False,False,False,False,False,False
1,1,8,8,13,198.25,197.50,197.250,196.500,193.00,191.75,188.00,False,False,False,False,False,False,False,False,False
2,1,8,9,26,198.50,198.00,197.500,197.500,196.00,193.50,196.00,False,False,False,False,False,False,False,False,False
3,1,8,12,30,199.00,198.75,199.000,197.750,193.50,190.50,193.33,False,False,False,False,False,False,False,False,False
4,1,8,14,29,199.75,199.50,199.500,199.000,197.50,196.25,193.75,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,1013,11,30,79,107.50,89.50,186.375,83.000,80.00,85.50,167.00,False,True,False,False,False,True,False,False,False
236,1013,11,42,80,113.50,188.50,186.375,184.625,172.50,161.75,167.00,True,True,True,True,True,True,False,False,False
237,1014,10,9,26,80.00,188.50,186.375,184.625,172.50,161.75,167.00,True,True,True,True,True,True,False,False,False
238,1014,10,14,31,143.50,112.00,82.000,114.500,103.00,161.75,167.00,False,False,False,False,True,True,False,False,False


In [56]:
# Save the encoded frame without its index, overwriting the temporary CSV written earlier.
tnea_df.to_csv("data/atrain_tmp.csv",index=False)

In [57]:
# Reload the encoded data from the temporary CSV.
tnea_df = pd.read_csv("data/atrain_tmp.csv",low_memory=False)
# Transposed preview of the first five rows.
tnea_df.head().T

Unnamed: 0,0,1,2,3,4
College_Code,1,1,1,1,1
College_Name,8,8,8,8,8
Branch_Code,2,8,9,12,14
Branch_Name,7,13,26,30,29
OC,196.25,198.25,198.5,199.0,199.75
BC,195.25,197.5,198.0,198.75,199.5
BCM,193.25,197.25,197.5,199.0,199.5
MBC,194.25,196.5,197.5,197.75,199.0
SC,188.75,193.0,196.0,193.5,197.5
SCA,185.75,191.75,193.5,190.5,196.25


In [58]:
# Same transposed preview as the previous cell.
tnea_df.head().T

Unnamed: 0,0,1,2,3,4
College_Code,1,1,1,1,1
College_Name,8,8,8,8,8
Branch_Code,2,8,9,12,14
Branch_Name,7,13,26,30,29
OC,196.25,198.25,198.5,199.0,199.75
BC,195.25,197.5,198.0,198.75,199.5
BCM,193.25,197.25,197.5,199.0,199.5
MBC,194.25,196.5,197.5,197.75,199.0
SC,188.75,193.0,196.0,193.5,197.5
SCA,185.75,191.75,193.5,190.5,196.25


In [60]:
# Final check for missing values before modelling.
tnea_df.isna().sum()

College_Code               0
College_Name               0
Branch_Code                0
Branch_Name                0
OC                         0
BC                         0
BCM                        0
MBC                        0
SC                         0
SCA                        0
ST                         0
BC_is_missing              0
BCM_is_missing             0
MBC_is_missing             0
SC_is_missing              0
SCA_is_missing             0
ST_is_missing              0
College_Name_is_missing    0
Branch_Code_is_missing     0
Branch_Name_is_missing     0
dtype: int64

In [70]:
# Work on a copy; the College_Name code is the target and every other column is a feature.
# NOTE(review): X still contains College_Code and College_Name_is_missing — confirm
# College_Code is meant to be a feature when the target is College_Name.
model_df = tnea_df.copy()
y = model_df['College_Name']
X = model_df.drop(columns=['College_Name'])

# Hold out 20% as the test set, then take 25% of the remaining 80% for validation,
# i.e. a 60/20/20 train/validation/test split.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=0
)


In [71]:
# Exhaustive grid search over XGBoost hyper-parameters using 3-fold cross-validation
# on the training split only.
xgb1 = XGBRegressor()
parameters = {
    'objective': ['reg:squarederror'],
    'learning_rate': [.0001, 0.001, .01],
    'max_depth': [3, 5, 7],
    'min_child_weight': [3, 5, 7],
    'subsample': [0.1, 0.5, 1.0],
    'colsample_bytree': [0.1, 0.5, 1.0],
    'n_estimators': [500],
}
xgb_grid = GridSearchCV(
    estimator=xgb1,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=0,
)
xgb_grid.fit(X_train, y_train)

# Refit the best configuration on the training split, tracking MAE on both the
# training and validation sets and stopping after 50 rounds without improvement.
# NOTE(review): newer xgboost releases expect eval_metric / early_stopping_rounds on
# the estimator rather than in fit() — verify against the installed version.
xgb_cv = xgb_grid.best_estimator_
eval_set = [(X_train, y_train), (X_val, y_val)]
fit_model = xgb_cv.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric='mae',
    early_stopping_rounds=50,
    verbose=False,
)

# Report validation metrics from a single prediction pass.
# NOTE(review): X_val is also in eval_set during fitting, and X_test / y_test are not
# evaluated in this cell — confirm a held-out test evaluation happens elsewhere.
val_predictions = fit_model.predict(X_val)
for metric_label, metric_fn in (
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("R2:", r2_score),
):
    print(metric_label, metric_fn(y_val, val_predictions))



MAE: 1.1411488230029743
MSE: 2.765751246482361
R2: 0.8043328869132754


In [73]:
# Save the fitted model to 'axgb_model.json' (path is relative to the working directory).
fit_model.save_model('axgb_model.json')