In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split,GridSearchCV, cross_val_score

In [None]:
# Load the raw dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
# Separate the response column from the remaining (predictor) columns.
target = df['Grades']
train = df.drop(columns='Grades')

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Grid search over the KNN hyper-parameters that actually change the fitted model.
#   * `metric` already selects the distance: 'euclidean' is Minkowski with p=2 and
#     'manhattan' is Minkowski with p=1, while `p` is only read when the metric
#     is 'minkowski'. Crossing metric x p refits the same two models several
#     times, so `p` and 'minkowski' are not searched.
#   * `algorithm` only selects the (exact) neighbour-lookup structure, i.e. speed,
#     not which model is fitted, so it is left at its default.
# This cuts the search from 960 candidates to the 80 distinct ones.
grid = {
    'n_neighbors': np.arange(1, 21),
    'metric': ['euclidean', 'manhattan'],
    'weights': ['uniform', 'distance'],
}
knn = KNeighborsRegressor()
cv = GridSearchCV(knn, grid, cv=5)
cv.fit(X_train, y_train)
print("Best Parameters:",cv.best_params_)
# NOTE: best_score_ is the mean 5-fold cross-validated score (R^2 for a
# regressor) of the best candidate, not a score measured on the training set.
print("Training_score:",cv.best_score_)
# Score of the refitted best estimator on the held-out rows.
print("Test_score:",cv.score(X_test,y_test))

Best Parameters: {'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 8, 'p': 1, 'weights': 'distance'}
Training_score: 0.7083023961128294
Test_score: 0.7187644914447258


In [None]:
# Per-column summary statistics of the raw data (count, mean, std, quartiles).
# Kept as the cell's last bare expression so the notebook renders the table.
df.describe()

Unnamed: 0,Socioeconomic Score,Study Hours,Sleep Hours,Attendance (%),Grades
count,1388.0,1388.0,1388.0,1388.0,1388.0
mean,0.552274,4.560807,8.047262,58.536023,40.691643
std,0.261272,1.897581,1.3707,11.675287,9.467358
min,0.10128,0.8,4.8,40.0,32.0
25%,0.322118,3.475,7.0,49.0,34.0
50%,0.545945,3.9,8.4,57.0,35.0
75%,0.78961,5.9,9.1,66.0,47.0
max,0.99982,10.0,10.0,100.0,91.0



Reasons to Scale:
1. Range of Values:
   * The ranges of the features differ significantly.
   * Socioeconomic Score ranges from 0.1 to ~1.
   * Study Hours ranges from 0.8 to 10.
   * Attendance (%) ranges from 40 to 100.
   * These differences in scale can lead to biased results, where features with larger ranges dominate the distance calculations.
2. Standard Deviation Differences:
   * The standard deviations of the features vary considerably, indicating different levels of spread.
   * Socioeconomic Score: 0.26
   * Study Hours: 1.89
   * Attendance: 11.67

In [None]:
# Scaling the data
# Fit the scaler on the training rows only. Fitting on every row lets the
# mean/std of the held-out rows leak into the features the model is trained and
# tuned on, which biases the reported test score.
# This split uses the same test_size / random_state as the notebook's
# train/test splits, so it selects exactly the rows that end up in X_train;
# keep these values in sync if the split is ever changed.
train_rows = train_test_split(train, test_size=0.33, random_state=42)[0]
scl = StandardScaler().fit(train_rows)
# Apply the training-set statistics to all rows: `train_s` is still the full,
# row-aligned feature matrix (a NumPy array), ready to be split as before.
train_s = scl.transform(train)

In [None]:
# Summarise the scaled features. The scaler returns a bare array, so the
# original column names are re-attached to keep this table directly comparable
# with the unscaled df.describe() output (instead of columns labelled 0..3).
pd.DataFrame(train_s, columns=train.columns).describe()

Unnamed: 0,0,1,2,3
count,1388.0,1388.0,1388.0,1388.0
mean,-9.982409e-17,7.166858000000001e-17,-2.674774e-16,1.337387e-16
std,1.00036,1.00036,1.00036,1.00036
min,-1.72677,-1.98261,-2.369909,-1.588201
25%,-0.8812255,-0.5724121,-0.7643102,-0.8170643
50%,-0.02423339,-0.348362,0.2574342,-0.1316093
75%,0.9087127,0.7059914,0.7683064,0.6395275
max,1.713566,2.867416,1.425142,3.552711


In [None]:
# Same 33% hold-out and seed as the earlier split, now on the scaled feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    train_s,
    target,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Grid search on the scaled features, over the KNN hyper-parameters that
# actually change the fitted model.
#   * `metric` already selects the distance: 'euclidean' is Minkowski with p=2 and
#     'manhattan' is Minkowski with p=1, while `p` is only read when the metric
#     is 'minkowski'. Crossing metric x p refits the same two models several
#     times, so `p` and 'minkowski' are not searched.
#   * `algorithm` only selects the (exact) neighbour-lookup structure, i.e. speed,
#     not which model is fitted, so it is left at its default.
# This cuts the search from 960 candidates to the 80 distinct ones.
grid = {
    'n_neighbors': np.arange(1, 21),
    'metric': ['euclidean', 'manhattan'],
    'weights': ['uniform', 'distance'],
}
knn = KNeighborsRegressor()
cv = GridSearchCV(knn, grid, cv=5)
cv.fit(X_train, y_train)
print("Best Parameters:",cv.best_params_)
# NOTE: best_score_ is the mean 5-fold cross-validated score (R^2 for a
# regressor) of the best candidate, not a score measured on the training set.
print("Training_score:",cv.best_score_)
# Score of the refitted best estimator on the held-out rows.
print("Test_score:",cv.score(X_test,y_test))

Best Parameters: {'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 6, 'p': 1, 'weights': 'distance'}
Training_score: 0.9183124893567897
Test_score: 0.9384666564548828


In [None]:
# Re-create the split from the unscaled `train` frame (same 33% hold-out and
# seed); this overwrites the scaled X_train / X_test from the previous split.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Grid search for SVR. The grid is a list of per-kernel sub-grids so that each
# hyper-parameter is searched only for the kernels that read it: `gamma` is the
# kernel coefficient for 'rbf'/'poly' (unused by 'linear') and `degree` is only
# used by 'poly'. One flat grid would cross them with every kernel and refit
# identical models (81 candidates for 39 distinct ones).
grid = [
    {'kernel': ['linear'], 'C': [0.1, 10, 100]},
    {'kernel': ['rbf'], 'C': [0.1, 10, 100], 'gamma': [0.1, 0.01, 0.001]},
    {
        'kernel': ['poly'],
        'C': [0.1, 10, 100],
        'gamma': [0.1, 0.01, 0.001],
        'degree': [2, 3, 4],
    },
]

svr = SVR()
svr_m= GridSearchCV(svr, grid, cv=5)
svr_m.fit(X_train, y_train)
print("Best Parameters:",svr_m.best_params_)
# NOTE: best_score_ is the mean 5-fold cross-validated score (R^2 for a
# regressor) of the best candidate, not a score measured on the training set.
print("Training_score:",svr_m.best_score_)
# Score of the refitted best estimator on the held-out rows.
print("Test_score:",svr_m.score(X_test,y_test))