In [2]:
#Necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR

In [3]:
# Suppress all warnings globally for this session.
# Note: with no category given, this also hides deprecation notices.
from warnings import filterwarnings

filterwarnings(action="ignore")

In [4]:
# Load the Hitters data, encode the categorical columns, and split train/test
categorical_cols = ["League", "Division", "NewLeague"]

# Rows containing any missing value are discarded
df = pd.read_csv("Hitters.csv").dropna()

# One indicator column is produced per level of each categorical column
dms = pd.get_dummies(df[categorical_cols])

# Regression target
y = df["Salary"]

# Numeric predictors: everything except the target and the raw categorical columns
X_ = df.drop(columns=["Salary", *categorical_cols]).astype("float64")

# Keep one indicator per categorical column
# (presumably each column has two levels, so the other is implied - verify against the data)
kept_dummies = ["League_N", "Division_W", "NewLeague_N"]
X = pd.concat([X_, dms[kept_dummies]], axis=1)

# Hold out 25% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

Basic Prediction

In [6]:
# Baseline: k-nearest-neighbours regressor built with no arguments (library defaults)
# NOTE(review): the training features are passed in unscaled; distance-based
# models are usually sensitive to feature scale - consider standardising.
baseline = KNeighborsRegressor()
knn_model = baseline.fit(X_train, y_train)

In [10]:
# Exploratory: list every attribute and method name available on the fitted estimator
dir(knn_model)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_algorithm_metric',
 '_check_feature_names',
 '_check_n_features',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_fit',
 '_fit_X',
 '_fit_method',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_kneighbors_reduce_func',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_tree',
 '_validate_data',
 '_validate

In [8]:
# Inspect how many neighbours the fitted model uses
knn_model.n_neighbors

5

In [9]:
# Inspect which distance metric the fitted model uses
knn_model.metric

'minkowski'

In [12]:
# Predict the target for the held-out test rows with the baseline model
y_pred = knn_model.predict(X_test)

In [14]:
# Root-mean-squared error of the baseline predictions against the test targets
baseline_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(baseline_mse)
rmse

426.6570764525201

Model Tuning

In [20]:
# Sweep k = 1..50 and record the test-set RMSE obtained for each value as (k, rmse).
# The per-iteration model, predictions and score use loop-local names so that the
# baseline `knn_model`, `y_pred` and `rmse` defined in earlier cells are not silently
# rebound (otherwise `knn_model` would end up as the k=50 model after this cell).
# NOTE(review): every k is scored on X_test/y_test, so choosing k from these numbers
# tunes on the test set; treat the resulting minimum RMSE as optimistic.
RMSE = []
for k in range(1, 51):
    candidate = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    candidate_pred = candidate.predict(X_test)
    candidate_rmse = np.sqrt(mean_squared_error(y_test, candidate_pred))
    RMSE.append((k, candidate_rmse))

In [27]:
# Select the (k, rmse) pair whose RMSE component is smallest.
# min() returns the first minimal element, so ties go to the earliest entry in RMSE.
min_rmse = min(
    RMSE,
    key=lambda pair: pair[1],
)

print(f"Minimum RMSE is {min_rmse[1]} for k value {min_rmse[0]}")

Minimum RMSE is 413.7094731463598 for k value 8


In [29]:
# Cross-validated search over n_neighbors = 1..50, using 10 folds of the training data
knn_params = {"n_neighbors": np.arange(1, 51)}
knn = KNeighborsRegressor()
search = GridSearchCV(estimator=knn, param_grid=knn_params, cv=10)
knn_cv = search.fit(X_train, y_train)

In [30]:
# Parameter setting that achieved the best cross-validation score in the grid search
knn_cv.best_params_

{'n_neighbors': 8}

In [31]:
# Fit the final model with the k selected by the grid search instead of a value
# copied by hand from its output, so this cell stays correct if the data or the
# search grid changes.
knn_final = KNeighborsRegressor(**knn_cv.best_params_).fit(X_train, y_train)

# Predictions for the held-out test rows
y_predicted = knn_final.predict(X_test)

# mean_squared_error is documented as (y_true, y_pred); swapping the arguments does
# not change the MSE value, but the conventional order is used for clarity.
rmse = np.sqrt(mean_squared_error(y_test, y_predicted))

In [32]:
# Display the test-set RMSE computed for the final model
rmse

413.7094731463598