In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file found under the Kaggle input directory,
# printing each one as a full path.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Understanding data:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the admissions CSV into a DataFrame and preview its first rows.
DATA_PATH = "/kaggle/input/graduate-admissions/Admission_Predict_Ver1.1.csv"
ap = pd.read_csv(DATA_PATH)
ap.head()

In [None]:
# Dimensions of the loaded table as a (rows, columns) tuple.
ap.shape

In [None]:
# Summarise each column's dtype and non-null count to spot missing values.
ap.info()

In [None]:
# Plot one histogram per numeric column to eyeball the distributions.
# The trailing semicolon hides the array-of-Axes repr, so only the figure is shown.
ap.hist(figsize=(14, 10));

# Select features:

In [None]:
# Predictor columns, written as their whitespace-stripped header names.
num_cols = ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research']

# Column the models will learn to predict.
target_col = 'Chance of Admit'

# Strip any leading/trailing whitespace from the headers so the names above match exactly.
ap.columns = ap.columns.str.strip()

# Keep only the predictors followed by the target, dropping every other column.
cols = [*num_cols, target_col]
ap = ap.loc[:, cols]

# Further Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
ap.describe()

In [None]:
# Pairwise correlations between all selected columns, colour-coded and
# rounded to two decimals.
# `Styler.set_precision` was deprecated in pandas 1.3 and removed in 2.0;
# `Styler.format(precision=...)` is its replacement.
# A Styler is only rendered when it is the cell's last expression, so the
# former trailing `None` (which suppressed the table entirely) is dropped.
ap.corr().style.background_gradient(cmap='coolwarm').format(precision=2)

# Data Preparations:

In [None]:
# Separate the target column from the predictors.
y = ap.loc[:, 'Chance of Admit']
X = ap.drop(columns='Chance of Admit')

We split the data 70:15:15: the validation set is used to find the best hyperparameters, and the test set is held back to evaluate the final model.

In [None]:
from sklearn.model_selection import train_test_split

# 70:15:15 split done in two steps, both seeded for reproducibility:
#   1. hold out 30% of the rows,
#   2. cut that hold-out in half to get the validation and test sets.
X_train, X_valtest, y_train, y_valtest = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_valtest, y_valtest, test_size=0.5, random_state=42
)

When preparing your data for machine learning, it is important to split your data into training and test sets *before doing any scaling* or transformation. You should apply any scaling or transformation on the training set only, and then apply the same transformation to the test set using the parameters learned from the training set. This approach ensures that your model is evaluated on unseen data and avoids any data leakage issues.

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, so nothing
# from the validation/test rows leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply that same fitted transformation to all three splits.
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Ordinary least-squares baseline fitted on the scaled training features.
lr = LinearRegression().fit(X_train_scaled, y_train)

# Predict both held-out splits up front.
y_pred = lr.predict(X_val_scaled)
y_pred_test = lr.predict(X_test_scaled)

# Validation metrics.
mse, r2 = mean_squared_error(y_val, y_pred), r2_score(y_val, y_pred)

# Test metrics.
mse_test, r2_test = mean_squared_error(y_test, y_pred_test), r2_score(y_test, y_pred_test)

print("Linear Regression")
print("MSE val: ", mse)
print("R2 val score: ", r2)

print()
print("MSE test : ", mse_test)
print("R2 test score: ", r2_test)

If the points are *closely* clustered around the diagonal line, it indicates good performance of the model.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Actual vs. predicted target on the test split for the linear model,
# drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_test)
ax.plot([0, 1], [0, 1], '--k')  # y = x reference: perfect predictions fall on this line
ax.set(xlabel='Actual', ylabel='Predicted', title='Linear Regression')
plt.show()

# Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Try polynomial degrees 1-4 and keep the one with the highest validation R^2.
degrees = np.arange(1, 5)
r2_values = []

for deg in degrees:
    # Expand the scaled features; the bias column is left out because
    # LinearRegression fits its own intercept.
    poly_features = PolynomialFeatures(degree=deg, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train_scaled)

    lr_poly = LinearRegression()
    lr_poly.fit(X_train_poly, y_train)

    # Score on the validation split (not the test split) so the test set
    # stays untouched during model selection.
    X_val_poly = poly_features.transform(X_val_scaled)
    y_pred_poly = lr_poly.predict(X_val_poly)

    r2_poly = r2_score(y_val, y_pred_poly)
    r2_values.append(r2_poly)

# R^2 is negative whenever a model does worse than predicting the mean, so
# the running maximum must not start from 0: that would leave best_deg at
# the invalid value 0 if every degree scored <= 0. argmax always picks a
# real degree and, like the strict `>` comparison it replaces, keeps the
# lowest degree on ties.
best_idx = int(np.argmax(r2_values))
max_r2 = r2_values[best_idx]
best_deg = int(degrees[best_idx])

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(degrees, r2_values)
# A plain log axis cannot display zero or negative R^2 values; 'symlog' is
# linear inside [-linthresh, linthresh] and logarithmic outside, so strongly
# negative scores remain visible.
ax.set_yscale('symlog', linthresh=1.0)
ax.set_xlabel('Degree')
ax.set_ylabel('R2')

In [None]:
# Refit the polynomial model using the degree selected on the validation split.
degree = best_deg  # degree of the polynomial
# include_bias=False matches the configuration used in the degree search:
# LinearRegression already fits an intercept, so an extra constant column
# would only duplicate it.
poly = PolynomialFeatures(degree, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)

lr_poly = LinearRegression()
lr_poly.fit(X_train_poly, y_train)

# Evaluate on validation set
X_val_poly = poly.transform(X_val_scaled)
y_pred_poly = lr_poly.predict(X_val_poly)
mse_poly = mean_squared_error(y_val, y_pred_poly)
r2_poly = r2_score(y_val, y_pred_poly)

print("Polynomial Regression (degree = {})".format(degree))
print("MSE: ", mse_poly)
print("R2 score: ", r2_poly)

# Evaluate on test set. The test metrics get their own names (as in the
# linear-regression cell) so they no longer overwrite the validation metrics.
X_test_poly = poly.transform(X_test_scaled)
y_pred_test_poly = lr_poly.predict(X_test_poly)
mse_test_poly = mean_squared_error(y_test, y_pred_test_poly)
r2_test_poly = r2_score(y_test, y_pred_test_poly)

print()
print("MSE test : ", mse_test_poly)
print("R2 test score: ", r2_test_poly)

Note: these results differ from the ones I got in Colab. Here the Linear and Polynomial Regression results are identical because the best degree found is 1, which makes the two models the same.

In [None]:
# Actual vs. predicted target on the test split for the polynomial model,
# drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_test_poly)
ax.plot([0, 1], [0, 1], '--k')  # y = x reference: perfect predictions fall on this line
ax.set(
    xlabel='Actual',
    ylabel='Predicted',
    title='Polynomial Regression (degree = {})'.format(degree),
)
plt.show()

# Multicollinearity Check:

In [None]:
# Pairwise Pearson correlations between the predictors
# (Pearson is the default method of DataFrame.corr, spelled out here).
corr_matrix = X.corr(method='pearson')

# Plain-text dump of the matrix.
print(corr_matrix)

In [None]:
import seaborn as sns

# Colour-coded view of the predictor correlation matrix.
# The trailing semicolon hides the Axes repr, so only the figure is shown.
sns.heatmap(corr_matrix, cmap='coolwarm');

As we can see, there is *strong collinearity* between the TOEFL score, GRE score and CGPA. So let's try Lasso, Ridge and ElasticNet regression.

# Lasso Regression:

Lasso is particularly useful when dealing with high-dimensional datasets with many irrelevant features.

In [None]:
from sklearn.linear_model import LassoCV

# Candidate L1 regularisation strengths.
alphas = [0.001, 0.01, 0.1, 1]

# Pick alpha by 5-fold cross-validation on the training split; fit() returns
# the fitted estimator, so construction and fitting are chained.
lasso_cv = LassoCV(alphas=alphas, cv=5).fit(X_train_scaled, y_train)

# Score the held-out splits before reporting anything.
val_score = lasso_cv.score(X_val_scaled, y_val)
test_score = lasso_cv.score(X_test_scaled, y_test)

# Report the chosen alpha, then R^2 on the training, validation and test splits.
print("Best alpha:", lasso_cv.alpha_)
print("R^2 score with best alpha:", lasso_cv.score(X_train_scaled, y_train))
print("Validation R^2 score:", val_score)
print("Test R^2 score:", test_score)

# Ridge Regression:

Ridge is particularly useful when there are many predictors with small or moderate effect sizes. Probably, not our case. 

In [None]:
from sklearn.linear_model import RidgeCV

# Candidate L2 regularisation strengths.
alphas = [1, 10, 20, 30]

# Pick alpha by 5-fold cross-validation on the training split; fit() returns
# the fitted estimator, so construction and fitting are chained.
ridge_cv = RidgeCV(alphas=alphas, cv=5).fit(X_train_scaled, y_train)

# Score the held-out splits before reporting anything.
val_score = ridge_cv.score(X_val_scaled, y_val)
test_score = ridge_cv.score(X_test_scaled, y_test)

# Report the chosen alpha, then R^2 on the training, validation and test splits.
print("Best alpha:", ridge_cv.alpha_)
print("R^2 score with best alpha:", ridge_cv.score(X_train_scaled, y_train))
print("Validation R^2 score:", val_score)
print("Test R^2 score:", test_score)

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

elastic_net = ElasticNet()

# Grid of regularisation strengths (alpha) and L1/L2 mixing ratios (l1_ratio) to try.
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10],
              'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}

# 5-fold cross-validated grid search on the training split.
grid_search = GridSearchCV(elastic_net, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

print("Best alpha value:", grid_search.best_params_['alpha'])
print("Best l1_ratio value:", grid_search.best_params_['l1_ratio'])

# GridSearchCV (refit=True by default) has already refit the winning
# configuration on the whole training split, so reuse that estimator rather
# than constructing and fitting an identical ElasticNet a second time.
elastic_net_best = grid_search.best_estimator_
print("R^2 score with best hyperparameters:", elastic_net_best.score(X_train_scaled, y_train))

val_score = elastic_net_best.score(X_val_scaled, y_val)
print("Validation R^2 score:", val_score)

test_score = elastic_net_best.score(X_test_scaled, y_test)
print("Test R^2 score:", test_score)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {'max_depth': [1, 2, 3, 4, 5],
              'min_samples_split': [2, 3, 4, 5, 6]}

# Create a DecisionTreeRegressor object.
# A fixed random_state makes the fitted tree, and therefore the selected
# hyperparameters and the scores below, reproducible from run to run.
dt = DecisionTreeRegressor(random_state=42)

# Create a GridSearchCV object with the parameter grid to search and 5-fold cross-validation
grid_search = GridSearchCV(dt, param_grid, cv=5)

# Fit the grid search object to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and R^2 scores.
# best_score_ is the mean cross-validated score of the best configuration,
# not a score on the training data, so it is labelled as such.
print("Best hyperparameters:", grid_search.best_params_)
print("Mean cross-validated R^2 score:", grid_search.best_score_)
print("Validation R^2 score:", grid_search.score(X_val_scaled, y_val))
print("Test R^2 score:", grid_search.score(X_test_scaled, y_test))

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {'n_estimators': [30, 50, 100],
              'max_depth': [2, 3, 4, 5],
              'min_samples_split': [3, 4, 5, 6]}

# Create a RandomForestRegressor object.
# A fixed random_state pins the forest's randomness, so the selected
# hyperparameters and the scores below are reproducible from run to run.
rf = RandomForestRegressor(random_state=42)

# Create a GridSearchCV object with the parameter grid to search and 5-fold cross-validation
grid_search = GridSearchCV(rf, param_grid, cv=5)

# Fit the grid search object to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and R^2 scores.
# best_score_ is the mean cross-validated score of the best configuration,
# not a score on the training data, so it is labelled as such.
print("Best hyperparameters:", grid_search.best_params_)
print("Mean cross-validated R^2 score:", grid_search.best_score_)
print("Validation R^2 score:", grid_search.score(X_val_scaled, y_val))
print("Test R^2 score:", grid_search.score(X_test_scaled, y_test))

# Gradient Boosting:

It is very slow and the scores are worse, so I wouldn't recommend using Gradient Boosting here.

In [None]:
# The XGBoost experiment below is deliberately disabled: it is wrapped in a
# string literal, so none of it executes. It grid-searches an XGBRegressor
# with 5-fold CV and then reports validation/test R^2. To run it, remove the
# surrounding triple quotes.
"""
from xgboost import XGBRegressor

# Create an instance of the XGBRegressor class
xgb = XGBRegressor()

# Define the hyperparameter grid
param_grid = {'n_estimators': [100, 500, 1000],
              'max_depth': [3, 5, 7],
              'learning_rate': [0.01, 0.1, 1.0]}

# Perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(xgb, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Train the XGBoost model with the best hyperparameters
xgb_best = XGBRegressor(**grid_search.best_params_)
xgb_best.fit(X_train_scaled, y_train)

# Evaluate the model on the validation set
val_pred = xgb_best.predict(X_val_scaled)
val_score = r2_score(y_val, val_pred)
print("Validation R^2 score:", val_score)

# Evaluate the model on the test set
test_pred = xgb_best.predict(X_test_scaled)
test_score = r2_score(y_test, test_pred)
print("Test R^2 score:", test_score)
"""

# Conclusion:

The best result was achieved by the Decision Tree. On some runs, Random Forest comes out ahead instead.

For datasets like this one it may be better to use a simpler model, such as a decision tree or a random forest, that can handle a small amount of data without overfitting. They do not require the removal of highly correlated features, and they can be a good choice of algorithm for datasets with multicollinearity issues.

In summary, decision trees and random forest are a good choice for small datasets (500x9) with highly correlated features. 