## K-Neighbours Regression

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

## Reading the dataset

In [2]:
import pandas as pd
df=pd.read_csv("50_Startups (1).csv")
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     object 
 4   PROFIT  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [4]:
df.isna().sum()

RND       0
ADMIN     0
MKT       0
STATE     0
PROFIT    0
dtype: int64

## Separate X and Y

In [5]:
X=df.drop(columns=['PROFIT'])
Y=df[['PROFIT']]

In [6]:
X.head()

Unnamed: 0,RND,ADMIN,MKT,STATE
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [7]:
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [8]:
Y.value_counts()

PROFIT   
14681.40     1
141585.52    1
110352.25    1
111313.02    1
118474.03    1
122776.86    1
124266.90    1
125370.37    1
126992.93    1
129917.04    1
132602.65    1
134307.35    1
144259.40    1
35673.41     1
146121.95    1
149759.96    1
152211.77    1
155752.60    1
156122.51    1
156991.12    1
166187.94    1
182901.99    1
191050.39    1
191792.06    1
108733.99    1
108552.04    1
107404.34    1
105733.54    1
42559.73     1
49490.75     1
64926.08     1
65200.33     1
69758.98     1
71498.49     1
77798.83     1
78239.91     1
81005.76     1
81229.06     1
89949.14     1
90708.19     1
96479.51     1
96712.80     1
96778.92     1
97427.84     1
97483.56     1
99937.59     1
101004.64    1
103282.38    1
105008.31    1
192261.83    1
Name: count, dtype: int64

## ## Apply Label encoder on X For 'STATE'

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['STATE'] = le.fit_transform(X['STATE'])
X.head()

Unnamed: 0,RND,ADMIN,MKT,STATE
0,165349.2,136897.8,471784.1,2
1,162597.7,151377.59,443898.53,0
2,153441.51,101145.55,407934.54,1
3,144372.41,118671.85,383199.62,2
4,142107.34,91391.77,366168.42,1


## Preprocess X feature

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [11]:
num_pipe=Pipeline(steps=[('imputer',SimpleImputer(strategy='median')),
                            ('scalar',StandardScaler())])

In [12]:
X_pre=num_pipe.fit_transform(X)
X_pre[0:5]

array([[ 2.01641149,  0.56075291,  2.15394309,  1.21267813],
       [ 1.95586034,  1.08280658,  1.9236004 , -1.21267813],
       [ 1.75436374, -0.72825703,  1.62652767,  0.        ],
       [ 1.55478369, -0.09636463,  1.42221024,  1.21267813],
       [ 1.5049372 , -1.07991935,  1.28152771,  0.        ]])

In [13]:
cols=num_pipe.get_feature_names_out()
cols

array(['RND', 'ADMIN', 'MKT', 'STATE'], dtype=object)

## Train Test Split

In [14]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(X_pre,Y,test_size=0.33,random_state=21)

In [15]:
xtrain.shape

(33, 4)

In [16]:
xtest.shape

(17, 4)

## Create KNN Model

In [17]:
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(n_neighbors=9)
model.fit(xtrain,ytrain)

In [18]:
# Model accuracy in training
model.score(xtrain, ytrain)

0.7161587340497921

In [19]:
# Model accuracy in training
model.score(xtest, ytest)

0.7587683186523781

### Tune the model with GridSearchCV

In [20]:
params = {'n_neighbors':[2,3,4,5,6,7,8,9]}

In [21]:
from sklearn.model_selection import GridSearchCV
knn = KNeighborsRegressor()
gscv = GridSearchCV(knn, param_grid=params, cv=5, scoring='f1_macro')
gscv.fit(xtrain, ytrain)

In [22]:
gscv.best_params_

{'n_neighbors': 2}

In [23]:
gscv.best_score_

nan

In [24]:
best_knn = gscv.best_estimator_
best_knn

In [25]:
ypred_tr = best_knn.predict(xtrain)
ypred_ts = best_knn.predict(xtest)

In [26]:
ypred_ts[0:5]

array([[146898.645],
       [ 56054.62 ],
       [ 82965.36 ],
       [ 97070.32 ],
       [146898.645]])

In [27]:
ytest.head()

Unnamed: 0,PROFIT
7,155752.6
44,65200.33
43,69758.98
25,107404.34
14,132602.65


In [28]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cf = confusion_matrix(ytest, ypred_ts)
cfd = ConfusionMatrixDisplay(cf, display_labels=best_knn.classes_)
cfd.plot()

ValueError: continuous is not supported