# Exploring different Model Options and their respective accuracy scores

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [3]:
df = pd.read_csv("D:\Machine Learning\\notebook\data_cleaned.csv")

In [4]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,6,330,115,5,4.5,3.0,9.34,1,0.9


In [5]:
df.shape

(391, 10)

#### Preparing our X and Y for the regression problem

In [6]:
X = df.drop('Chance of Admit ', axis=1)
Y = df['Chance of Admit ']

In [7]:
X.head(5)

Unnamed: 0.1,Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,0,1,337,118,4,4.5,4.5,9.65,1
1,1,2,324,107,4,4.0,4.5,8.87,1
2,2,3,316,104,3,3.0,3.5,8.0,1
3,3,4,322,110,3,3.5,2.5,8.67,1
4,5,6,330,115,5,4.5,3.0,9.34,1


In [8]:
X = X.drop('Unnamed: 0', axis=1)

In [9]:
X.head(5)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,1,337,118,4,4.5,4.5,9.65,1
1,2,324,107,4,4.0,4.5,8.87,1
2,3,316,104,3,3.0,3.5,8.0,1
3,4,322,110,3,3.5,2.5,8.67,1
4,6,330,115,5,4.5,3.0,9.34,1


In [10]:
X = X.drop('Serial No.', axis=1)

In [11]:
X.head(5)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337,118,4,4.5,4.5,9.65,1
1,324,107,4,4.0,4.5,8.87,1
2,316,104,3,3.0,3.5,8.0,1
3,322,110,3,3.5,2.5,8.67,1
4,330,115,5,4.5,3.0,9.34,1


In [12]:
X = X.values

In [13]:
X

array([[337.  , 118.  ,   4.  , ...,   4.5 ,   9.65,   1.  ],
       [324.  , 107.  ,   4.  , ...,   4.5 ,   8.87,   1.  ],
       [316.  , 104.  ,   3.  , ...,   3.5 ,   8.  ,   1.  ],
       ...,
       [318.  , 106.  ,   3.  , ...,   3.  ,   8.65,   0.  ],
       [317.  , 104.  ,   2.  , ...,   3.  ,   8.76,   0.  ],
       [312.  , 103.  ,   3.  , ...,   4.  ,   8.78,   0.  ]])

In [14]:
Y.head(5)

0    0.92
1    0.76
2    0.72
3    0.80
4    0.90
Name: Chance of Admit , dtype: float64

In [15]:
Y = Y.values

In [16]:
from sklearn.preprocessing import StandardScaler
standard = StandardScaler()
X = standard.fit_transform(X)

In [17]:
X

array([[ 1.75171732,  1.74071858,  0.77909493, ...,  1.14774969,
         1.76651081,  0.91887955],
       [ 0.60577406, -0.09563919,  0.77909493, ...,  1.14774969,
         0.42959467,  0.91887955],
       [-0.09942179, -0.59646403, -0.09626748, ...,  0.03001587,
        -1.06158103,  0.91887955],
       ...,
       [ 0.07687717, -0.2625808 , -0.09626748, ..., -0.52885104,
         0.05251576, -1.08828192],
       [-0.01127231, -0.59646403, -0.97162989, ..., -0.52885104,
         0.24105521, -1.08828192],
       [-0.45201972, -0.76340565, -0.09626748, ...,  0.58888278,
         0.27533511, -1.08828192]])

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((312, 7), (79, 7))

In [19]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [20]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, Y_train)
    Y_train_predict = model.predict(X_train)
    Y_test_predict = model.predict(X_test)
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(Y_train, Y_train_predict)
    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(Y_test, Y_test_predict)
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')


Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.0602
- Mean Absolute Error: 0.0428
- R2 Score: 0.8020
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0584
- Mean Absolute Error: 0.0422
- R2 Score: 0.8239


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.1352
- Mean Absolute Error: 0.1085
- R2 Score: 0.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1399
- Mean Absolute Error: 0.1156
- R2 Score: -0.0110


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.0602
- Mean Absolute Error: 0.0428
- R2 Score: 0.8020
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0583
- Mean Absolute Error: 0.0421
- R2 Score: 0.8246


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 0.0541
- Mean Absolute Error: 0.0386
- R2 Score: 0.8401
----------------------

In [21]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)


Unnamed: 0,Model Name,R2_Score
2,Ridge,0.824619
0,Linear Regression,0.823905
7,CatBoosting Regressor,0.79704
3,K-Neighbors Regressor,0.794207
8,AdaBoost Regressor,0.788546
5,Random Forest Regressor,0.768101
6,XGBRegressor,0.713878
4,Decision Tree,0.667414
1,Lasso,-0.01097


In [22]:
df = pd.read_csv('D:\Machine Learning\\artifacts\data.csv')
df.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

In [24]:
df_1 = pd.read_csv('D:\Machine Learning\\artifacts\\test.csv')
df_1.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')