In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler,OrdinalEncoder,FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler,RobustScaler, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_regression

import seaborn as sns


In [3]:
# load the data
df = pd.read_csv("Student_dataset_3.csv")

# rename the columns
df.columns=['RollNo','Gender','Age','Location','Famsize','Pstatus','Medu','Fedu','Mjob','Fjob','reason','traveltime','studytime','Failures','Famsup','Paid','Activities','HigherEdu','Internet','Famrel','Freetime','GoOut','Health','10th%','12thordiploma%','EAMCETorECETrank','Internal','Prev cgpa','sgpa']

# remove duplicates 
df = df.drop_duplicates()




In [4]:
df['Eamcet Rank'] = 0
df['Ecet Rank'] = 0

df.loc[~df['RollNo'].str.contains('L'), 'Eamcet Rank'] = df.loc[~df['RollNo'].str.contains('L'), 'EAMCETorECETrank']
df.loc[df['RollNo'].str.contains('L'), 'Ecet Rank'] = df.loc[df['RollNo'].str.contains('L'), 'EAMCETorECETrank']
df.drop('EAMCETorECETrank', axis=1, inplace=True)


In [5]:
df['Prev cgpa'] = df['Prev cgpa'].replace(0, 5.5)
df['sgpa'] = df['sgpa'].replace(0, 5.5)
df['Total_eamcet/ecet_grade'] = 0


In [6]:
df['Fedu'] = df['Fedu'].replace('primary education (4th grade)', 'primary education( 4th grade )')

In [7]:
df['traveltime'] = df['traveltime'].replace('<1hr', '>1hr')

In [8]:
df.head(1)

Unnamed: 0,RollNo,Gender,Age,Location,Famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,traveltime,studytime,Failures,Famsup,Paid,Activities,HigherEdu,Internet,Famrel,Freetime,GoOut,Health,10th%,12thordiploma%,Internal,Prev cgpa,sgpa,Eamcet Rank,Ecet Rank,Total_eamcet/ecet_grade
0,319126510001,FEMALE,21,Urban,5,T,higher education,higher education,Homemaker,civil services: administrative or police,Other,30min - 1hr,1- <2 hr,0,Yes,No,Yes,No,Yes,excellent,high,3 - medium,5 - very good,93.1,96.3,90.23,9.4,8.93,18238.0,0,0


In [9]:
df.to_csv("cleaned_dataset.csv")

In [10]:

# assume the DataFrame is stored in a variable called "df"
unique_values = {}
for column in df.columns:
    values = df[column].unique()
    if len(values) < 10:
        unique_values[column] = values

# print the unique values for each column
for column, values in unique_values.items():
    print(f"Column '{column}': {values}")
    print()


Column 'Gender': ['FEMALE' 'MALE']

Column 'Age': [21 22 20]

Column 'Location': ['Urban' 'Rural']

Column 'Famsize': [5 4 3 2 6 8 9]

Column 'Pstatus': ['T' 'A']

Column 'Medu': ['higher education' '5th to 9th grade' 'secondary education' 'None'
 'primary education( 4th grade )']

Column 'Fedu': ['higher education' 'secondary education' '5th to 9th grade'
 'primary education( 4th grade )' 'None']

Column 'Mjob': ['Homemaker' "nominal: 'teacher', 'health' care related"
 'civil services: administrative or police']

Column 'Fjob': ['civil services: administrative or police'
 "nominal: 'teacher', 'health' care related" 'Homemaker']

Column 'reason': ['Other' 'Course preference' "Institution's reputation" 'close to home']

Column 'traveltime': ['30min - 1hr' '>1hr' '15 - 30 min' '<15 min']

Column 'studytime': ['1- <2 hr' '2 to 5 hr' '>10 hr' '5 to 10 hr']

Column 'Failures': [0 2 3 9 5 4 1]

Column 'Famsup': ['Yes' 'No']

Column 'Paid': ['No' 'Yes']

Column 'Activities': ['Yes' 'No']

Col

In [11]:
X = df.drop('sgpa', axis=1)
y = df['sgpa']

X_train,X_test,y_train,y_test=train_test_split(df.drop(columns=['sgpa']),df['sgpa'],test_size=0.2,random_state=42)


In [12]:
X.columns

Index(['RollNo', 'Gender', 'Age', 'Location', 'Famsize', 'Pstatus', 'Medu',
       'Fedu', 'Mjob', 'Fjob', 'reason', 'traveltime', 'studytime', 'Failures',
       'Famsup', 'Paid', 'Activities', 'HigherEdu', 'Internet', 'Famrel',
       'Freetime', 'GoOut', 'Health', '10th%', '12thordiploma%', 'Internal',
       'Prev cgpa', 'Eamcet Rank', 'Ecet Rank', 'Total_eamcet/ecet_grade'],
      dtype='object')

In [13]:
X.iloc[:1]

Unnamed: 0,RollNo,Gender,Age,Location,Famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,traveltime,studytime,Failures,Famsup,Paid,Activities,HigherEdu,Internet,Famrel,Freetime,GoOut,Health,10th%,12thordiploma%,Internal,Prev cgpa,Eamcet Rank,Ecet Rank,Total_eamcet/ecet_grade
0,319126510001,FEMALE,21,Urban,5,T,higher education,higher education,Homemaker,civil services: administrative or police,Other,30min - 1hr,1- <2 hr,0,Yes,No,Yes,No,Yes,excellent,high,3 - medium,5 - very good,93.1,96.3,90.23,9.4,18238.0,0,0


In [14]:
def get_eamcet_grade(rank):
    conditions = [
        rank == 0,
        rank < 7000,
        rank < 8500,
        rank < 10000,
        rank < 25000
    ]
    choices = [0, 4, 3, 2, 1]
    return np.select(conditions, choices, default=0)

def get_ecet_grade(rank):
    conditions = [
        rank == 0,
        rank < 250,
        rank < 350,
        rank < 580,
        rank < 1000
    ]
    choices = [0, 4, 3, 2, 1]
    return np.select(conditions, choices, default=0)

ctf1 = ColumnTransformer([
        ('imputer1', SimpleImputer(strategy='mean'), [28]),  #Eamcet Rank
         ('Eamcet_transformer', FunctionTransformer(get_eamcet_grade), [28]),
        ('imputer2', SimpleImputer(strategy='mean'), [29]),  #Ecet Rank
        ('Ecet_transformer', FunctionTransformer(get_ecet_grade), [29]),
    ],
    remainder='passthrough'
)


# df_transformed = ctf1.fit_transform(df)
# print(df_transformed[0])


# column_names = [ 'Eamcet Rank','Eamcet Grade', 'Ecet Rank','Ecet Grade','Roll No','Gender', 'Age', 
#                 'Location', 'Famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'traveltime',
#                 'studytime', 'Failures', 'Famsup', 'Paid', 'Activities', 'HigherEdu', 'Internet', 'Famrel',
#                 'Freetime', 'GoOut', 'Health', '10th%', '12thordiploma%', 'Internal', 'Prev cgpa','sgpa','Total_eamcet/ecet_grade']

# df = pd.DataFrame(df_transformed, columns=column_names)

# df.head(1)



In [15]:
# index = df.columns.get_loc('Total_eamcet/ecet_grade')
# print(index)


In [16]:
# print(df.columns)


In [17]:
def second_transformer(X):
    # Add values at indices 1 and 3
    X[:, 1] = X[:, 1] + X[:, 3]

    # Replace the value at the last index with the sum of values at indices 1 and 3
    X[:, -1] = X[:, 1] + X[:, 3]

    return X

ctf2 = FunctionTransformer(second_transformer)

# df_trans2 = ctf2.fit_transform(df_transformed)
# df1 = pd.DataFrame(df_trans2, columns=['Eamcet Rank', 'Eamcet Grade', 'Ecet Rank', 'Ecet Grade', 'Roll No',
#        'Gender', 'Age', 'Location', 'Famsize', 'Pstatus', 'Medu', 'Fedu',
#        'Mjob', 'Fjob', 'reason', 'traveltime', 'studytime', 'Failures',
#        'Famsup', 'Paid', 'Activities', 'HigherEdu', 'Internet', 'Famrel',
#        'Freetime', 'GoOut', 'Health', '10th%', '12thordiploma%', 'Internal',
#        'Prev cgpa', 'sgpa', 'Total_eamcet/ecet_grade'])

# df1.head(1)


In [18]:
# df1

In [19]:
# X_test

In [20]:
categories_medu = ['None', 'primary education( 4th grade )', '5th to 9th grade', 'secondary education', 'higher education']
categories_fedu = ['None', 'primary education( 4th grade )', '5th to 9th grade', 'secondary education', 'higher education']
categories_travel=[ '<15 min','15 - 30 min','30min - 1hr','>1hr']
categories_study=[ '1- <2 hr', '2 to 5 hr','5 to 10 hr', '>10 hr']
categories_famrel=[ 'Bad','Good','Very good','excellent']
categories_freetime=['very low', 'low', 'high','very high']
categories_goout=[ '1 - very low', '2 - low', '3 - medium','4 - high','5 - very high']
categories_health=[ '1 - very bad', '2- bad', '3 - average','4 - good','5 - very good']

# Define the column transformer with ordinal encoding for Medu and Fedu
ctf3 = ColumnTransformer(transformers=[('ordinal', OrdinalEncoder(categories=[categories_medu,
                            categories_fedu,categories_travel,categories_study,categories_famrel,
                            categories_freetime,categories_goout,categories_health],dtype=np.int32),
                                        [10,11,15,16,23,24,25,26])], remainder='passthrough')



# df1_transformed = ctf3.fit_transform(df1)

# df1_transformed[0]
# df2 = pd.DataFrame(df1_transformed, columns=['Medu', 'Fedu', 'traveltime', 'studytime','Famrel','Freetime',
#                                              'GoOut','Health','Eamcet Rank','Eamcet Grade','Ecet Rank','Ecet Grade','RollNo', 'Gender', 'Age', 'Location', 'Famsize',
#                                              'Pstatus', 'Mjob', 'Fjob', 'reason', 'Failures', 'Famsup', 'Paid', 
#                                              'Activities', 'HigherEdu', 'Internet','10th%', '12thordiploma%', 
#                                              'Internal','Prev cgpa', 'sgpa','Total_eamcet/ecet_grade'])



# df2

In [21]:
# define column transformer
ctf4 = ColumnTransformer(
    transformers=[
        ('one_hot', OneHotEncoder(drop='first', sparse=False, dtype=np.int32, handle_unknown='ignore'), [13,15,17,18, 19,20, 22, 23, 24, 25,26])
    ],
    remainder='passthrough')


# # apply column transformer
# df2_transformed = ctf4.fit_transform(df2)

# df2_transformed.shape

# # create dataframe with transformed data
# df3 = pd.DataFrame(df2_transformed, columns=['Gender','Location_urban','Pstatus','Mjob_civil','Mjob_Teacher/health','Fjob_civil','Fjob_Teacher/health',
#                                              'reason_reputation','reason_other','reason_closetohome','Famsup','Paid', 'Activities',
#                                              'HigherEdu', 'Internet', 'Medu', 'Fedu', 'traveltime', 'studytime','Famrel','Freetime',
#                                              'GoOut','Health','Eamcet Rank','Eamcet Grade','Ecet Rank','Ecet Grade','RollNo', 'Age',  "Famsize",
#                                             'Failures','10th%', '12thordiploma%',
#                                              'Internal','Prev cgpa', 'sgpa','Total_eamcet/ecet_grade'])

# df3


In [22]:
# define the column transformer to apply scaling and capping
ctf5 = ColumnTransformer(transformers=[
    ('scaling', RobustScaler(), [31,32,33,34]),
    ('capping', QuantileTransformer(output_distribution='uniform', n_quantiles=60, random_state=42),[31,32,33,34])
], remainder='passthrough')

# # fit and transform the data using the preprocessor
# df3_transformed = ctf5.fit_transform(df3)
# df3_transformed.shape


# # print the shape of the transformed dataframe
# df4 =pd.DataFrame(df3_transformed,columns=['scaled_10th%', 'scaled_12thordiploma%',
#                                              'scaled_Internal','scaled_Prev cgpa','10th%', '12thordiploma%',
#                                              'Internal','Prev cgpa','Gender','Location_urban','Pstatus','Mjob_civil','Mjob_Teacher/health','Fjob_civil','Fjob_Teacher/health',
#                                              'reason_reputation','reason_other','reason_closetohome','Famsup','Paid', 'Activities',
#                                              'HigherEdu', 'Internet', 'Medu', 'Fedu', 'traveltime', 'studytime','Famrel','Freetime',
#                                              'GoOut','Health','Eamcet Rank','Eamcet Grade','Ecet Rank','Ecet Grade','RollNo', 'Age',  "Famsize",
#                                             'Failures', 'sgpa','Total_eamcet/ecet_grade'])



# df4.head()

In [23]:
# c=0
# for i in df4.columns:
#     print(i,c)
#     c+=1

In [24]:
cols_to_drop=[0,1,2,3,31,32,33,34,35]

# define the transformer to drop the specified columns
ctf6 = ColumnTransformer(transformers=[
    ('drop', 'drop', cols_to_drop)
], remainder='passthrough')

# # fit and transform the data using the transformer
# df4_transformed = ctf6.fit_transform(df4)

# df5 =pd.DataFrame(df4_transformed,columns=['10th%', '12thordiploma%',
#                                              'Internal','Prev cgpa','Gender','Location_urban','Pstatus','Mjob_civil','Mjob_Teacher/health','Fjob_civil','Fjob_Teacher/health',
#                                              'reason_reputation','reason_other','reason_closetohome','Famsup','Paid', 'Activities',
#                                              'HigherEdu', 'Internet', 'Medu', 'Fedu', 'traveltime', 'studytime','Famrel','Freetime',
#                                              'GoOut','Health','Age',  "Famsize",
#                                             'Failures', 'sgpa','Total_eamcet/ecet_grade'])

# df5.head(5)

In [25]:
# # define X and y
# X = df5.drop('sgpa', axis=1)
# y = df5['sgpa']

In [26]:
# # define the SelectKBest transformer
# kbest = SelectKBest(score_func=mutual_info_regression, k=5)

# # select the best 5 columns using SelectKBest
# X_best = kbest.fit_transform(df5, y)

# # get the indices of the best 5 columns
# best_indices = kbest.get_support(indices=True)

# # get the column names of the best 5 columns
# all_columns = df5.columns.tolist()
# best_columns = [all_columns[i] for i in best_indices]

# # subset the data using the best 5 columns
# df5 = df5[best_columns]
# df5

In [27]:
# obj_cols = df5.select_dtypes(include=['object']).columns
# df5[obj_cols] = df4[obj_cols].applymap(lambda x: x.strip() if isinstance(x, str) else x).astype(float)


In [28]:
# df5.corr()['sgpa'].sort_values() 

In [29]:
# define the SelectKBest transformer
ctf7 = SelectKBest(score_func=mutual_info_regression, k=5)

# select the best 5 columns using SelectKBest
# df5 = ctf7.fit_transform(df5, y)

# df5 =pd.DataFrame(df5,columns=[ '12thordiploma%', 'Internal','Prev cgpa','sgpa','Total_eamcet/ecet_grade'])

# df5

In [30]:
ctf8=RandomForestRegressor()

## Create PipeLine

In [31]:
pipe=Pipeline(
   [
       ('ctf1',ctf1),
       ('ctf2',ctf2),
       ('ctf3',ctf3),
       ('ctf4',ctf4),
       ('ctf5',ctf5),
       ('ctf6',ctf6),
       ('ctf7',ctf7),
       ('ctf8',ctf8)
   ])

In [32]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('ctf1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('imputer1', SimpleImputer(),
                                                  [28]),
                                                 ('Eamcet_transformer',
                                                  FunctionTransformer(func=<function get_eamcet_grade at 0x000002752F221820>),
                                                  [28]),
                                                 ('imputer2', SimpleImputer(),
                                                  [29]),
                                                 ('Ecet_transformer',
                                                  FunctionTransformer(func=<function get_ecet_grade at 0x0000027534BFA700>),
                                                  [29])])),
                (...
                                   transformers=[('scaling', RobustScaler(),
                                  

## Explore the PipeLine

In [33]:
from sklearn import set_config
set_config(display='diagram')

In [34]:
pipe.fit(X_train,y_train)

In [35]:
pipe.named_steps['ctf1']

In [36]:
y_pred=pipe.predict(X_test) 



In [37]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print("R-squared: {:.4f}".format(r2))
print("MSE score: {:.4f}".format(mse))
print("MAE score: {:.4f}".format(mae))
print("RMSE score: {:.4f}".format(rmse))


R-squared: 0.8916
MSE score: 0.1166
MAE score: 0.2432
RMSE score: 0.3414


In [38]:
# y_pred

## Cross Validation Score

In [39]:
cross_val_score(pipe,X_train,y_train,cv=10,scoring='r2').mean()

0.7268770138264495

## Pickling The Model File For Deployment

In [40]:
import pickle
pickle.dump(pipe,open('sapp_final_algo.pkl','wb'))

In [41]:
pipe.predict(pd.DataFrame([['319126510001','FEMALE',  21  ,  'Urban'   ,     4    ,    'T'   ,"higher education",'higher education','Homemaker','civil services: administrative or police', 'Other' , '30min - 1hr', '1- <2 hr' ,     0     , 'Yes'  ,  'No' ,    'Yes'    ,     'No'   ,    'Yes'  ,'excellent',   'high'  ,'3 - medium','5 - very good',  93.1   ,       96.3      ,   90.23   ,    9.4    ,    18238.0    ,      0     ,                        0]],
                    columns=['RollNo'      ,'Gender', 'Age', 'Location', 'Famsize', 'Pstatus',      'Medu'      ,       'Fedu'     ,   'Mjob'  ,                  'Fjob'                  , 'reason',  'traveltime', 'studytime', 'Failures','Famsup', 'Paid', 'Activities', 'HigherEdu', 'Internet', 'Famrel'  , 'Freetime',   'GoOut'  ,   'Health'    , '10th%' , '12thordiploma%', 'Internal','Prev cgpa',  'Eamcet Rank', 'Ecet Rank','Total_eamcet/ecet_grade']))

array([9.1522])