In [190]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.pipeline import make_pipeline

In [191]:
df = pd.read_csv('datasets/Admission_Predict_Ver1.1.csv')

display(df.head())

display(df.info())



Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         500 non-null    int64  
 1   GRE Score          500 non-null    int64  
 2   TOEFL Score        500 non-null    int64  
 3   University Rating  500 non-null    int64  
 4   SOP                500 non-null    float64
 5   LOR                500 non-null    float64
 6   CGPA               500 non-null    float64
 7   Research           500 non-null    int64  
 8   Chance of Admit    500 non-null    float64
dtypes: float64(4), int64(5)
memory usage: 35.3 KB


None

In [186]:
# display correlation of feature pair
display(df.corr())

# We see that all feature except Serial No. have less correlation compare to Chance of Admit (label)
# So we drop it

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,1.0,-0.103839,-0.141696,-0.067641,-0.137352,-0.003694,-0.074289,-0.005332,0.008505
GRE Score,-0.103839,1.0,0.8272,0.635376,0.613498,0.524679,0.825878,0.563398,0.810351
TOEFL Score,-0.141696,0.8272,1.0,0.649799,0.64441,0.541563,0.810574,0.467012,0.792228
University Rating,-0.067641,0.635376,0.649799,1.0,0.728024,0.608651,0.705254,0.427047,0.690132
SOP,-0.137352,0.613498,0.64441,0.728024,1.0,0.663707,0.712154,0.408116,0.684137
LOR,-0.003694,0.524679,0.541563,0.608651,0.663707,1.0,0.637469,0.372526,0.645365
CGPA,-0.074289,0.825878,0.810574,0.705254,0.712154,0.637469,1.0,0.501311,0.882413
Research,-0.005332,0.563398,0.467012,0.427047,0.408116,0.372526,0.501311,1.0,0.545871
Chance of Admit,0.008505,0.810351,0.792228,0.690132,0.684137,0.645365,0.882413,0.545871,1.0


In [187]:
# Feature selection
# # drop 'Serial No.' and 'Chance of Admit ' and change to numpy array
X = df.drop(columns = ['Serial No.','Chance of Admit ']).values

# scale feature
X = StandardScaler().fit_transform(X)


# Label selection
# # select 'Chance of Admit ' columns and change to numpy array
y = df['Chance of Admit '].values

In [196]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

Linereg = LinearRegression()
Ridreg = Ridge()
Lassreg = Lasso()

array = np.arange(start=0.001, stop=1, step=0.01)

param_grid = {'alpha': array}

Ridreg_cv = GridSearchCV(Ridreg, param_grid, cv = 5)
Lassreg_cv = GridSearchCV(Lassreg, param_grid, cv = 5)

Ridreg_cv.fit(X_train, y_train)
Lassreg_cv.fit(X_train, y_train)
Linereg.fit(X_train, y_train)

y_pred_rid = Ridreg_cv.predict(X_test)
y_pred_lass = Lassreg_cv.predict(X_test)
y_pred_line = Linereg.predict(X_test)

print("Train score: {}, Test score: {}".format(Ridreg_cv.score(X_train,y_train),Ridreg_cv.score(X_test, y_test)))
print("Train score: {}, Test score: {}".format(Lassreg_cv.score(X_train, y_train),Lassreg_cv.score(X_test, y_test)))
print("Train score: {}, Test score: {}".format(Linereg.score(X_train, y_train),Linereg.score(X_test, y_test)))

Train score: 0.8466264507278138, Test score: 0.732041992729159
Train score: 0.8465515173877076, Test score: 0.7308270721403656
Train score: 0.8466320480384335, Test score: 0.7318173894140029


In [200]:
# We select Ridge regression for best model
print(Ridreg_cv.best_params_)
print("Mean Squared Error",mean_squared_error(y_test, y_pred_rid))
print("Mean Absolute Error",mean_absolute_error(y_test, y_pred_rid))

{'alpha': 0.9909999999999999}
Mean Squared Error 0.005067697290480992
Mean Absolute Error 0.05016773283931476
