In [3]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [4]:
# Load the insurance dataset from the notebook-relative Data/ folder,
# then show the first three rows as the cell output.
INSURANCE_CSV = "Data/insurance.csv"
data = pd.read_csv(INSURANCE_CSV)
data.head(n = 3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [5]:
# Target: the `charges` column as a plain NumPy array.
y = data["charges"].to_numpy()

# Features: every column except the target.
# NOTE(review): the preview of `data` shows an "Unnamed: 0" column, which looks
# like a row index that was saved into the CSV -- confirm against the file.
# If it is present it would be treated as a numeric feature and take part in
# the KNN distance computation, so it is excluded here. errors = "ignore"
# makes this second drop a no-op when the column does not exist.
X = data.drop(columns = "charges").drop(columns = "Unnamed: 0", errors = "ignore")

# Column groups routed to the two preprocessing branches below.
categorical = X.select_dtypes(exclude = "number").columns
numeric = X.select_dtypes(include = "number").columns

# Split first: the transformers below are fitted on the training rows only.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

preprocessing = ColumnTransformer([
    # Numeric columns: fill gaps with the median, then standardize.
    ("num", Pipeline([
        ("imp", SimpleImputer(strategy = "median")),
        ("scaler", StandardScaler())
    ]), numeric),

    # Categorical columns: fill gaps with the most frequent value, then one-hot
    # encode into a dense array; categories unseen during fit are ignored.
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy = "most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown = "ignore", sparse_output = False))
    ]), categorical)
])

# Fit on the training split, then apply the already-fitted transform to the test split.
X_train = preprocessing.fit_transform(X_train_raw)
X_test = preprocessing.transform(X_test_raw)

In [6]:
class KNN:
    """Brute-force k-nearest-neighbours regressor using Euclidean distance.

    ``train`` stores the training set; ``predict`` returns the mean target of
    the ``k`` stored points closest to a single query point.
    """

    def __init__(self):
        # Both attributes stay None until train() is called.
        self.x = None
        self.y = None

    def distance(self, x1, x2):
        """Return the Euclidean distance between two points of equal length."""
        diff = np.asarray(x1, dtype = float) - np.asarray(x2, dtype = float)
        # np.sum reduces in C instead of iterating element by element in Python.
        return np.sqrt(np.sum(diff ** 2))

    def train(self, x, y):
        """Store the training data used by ``predict``.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            Numeric feature matrix.
        y : array-like whose first dimension is n_samples
            Targets, one per row of ``x``.

        Raises
        ------
        ValueError
            If ``x`` is not 2-D or ``x`` and ``y`` have different row counts.
        """
        features = np.asarray(x, dtype = float)
        targets = np.asarray(y)
        if features.ndim != 2:
            raise ValueError(
                f"x must be 2-D (n_samples, n_features), got shape {features.shape}"
            )
        # Pairing rows with zip() would silently truncate on a length mismatch.
        if targets.shape[:1] != features.shape[:1]:
            raise ValueError(
                f"x and y must have the same number of rows, got "
                f"{features.shape[0]} and {targets.shape[:1]}"
            )
        self.x = features
        self.y = targets

    def predict(self, x, k):
        """Predict the target for one query point.

        Parameters
        ----------
        x : array-like of shape (n_features,)
            A single query point.
        k : int
            Number of nearest neighbours to average; must satisfy
            ``1 <= k <= n_samples``.

        Returns
        -------
        The mean of the targets of the ``k`` nearest training points.

        Raises
        ------
        RuntimeError
            If called before ``train``.
        ValueError
            If ``k`` is out of range or ``x`` does not have one value per
            training feature.
        """
        if self.x is None or self.y is None:
            raise RuntimeError("train() must be called before predict()")

        train_x = np.asarray(self.x, dtype = float)
        train_y = np.asarray(self.y)
        n_samples = train_x.shape[0]

        # Dividing by k when fewer than k neighbours exist would silently
        # shrink the mean, and k <= 0 has no meaningful result.
        if not 1 <= k <= n_samples:
            raise ValueError(
                f"k must be between 1 and the number of training samples "
                f"({n_samples}), got {k}"
            )

        query = np.asarray(x, dtype = float)
        if query.shape != train_x.shape[1:]:
            raise ValueError(
                f"x must have shape {train_x.shape[1:]}, got {query.shape}"
            )

        # Distances from the query to every training row in one vectorized pass.
        distances = np.sqrt(np.sum((train_x - query) ** 2, axis = 1))
        # Stable sort: equal distances keep training order, so ties never
        # depend on the label values themselves.
        nearest = np.argsort(distances, kind = "stable")[:k]
        return train_y[nearest].sum(axis = 0) / k
        

In [8]:
# Fit the KNN regressor on the preprocessed training split.
model = KNN()
model.train(X_train, y_train)

# Number of neighbours averaged for each prediction.
k = 5

# predict() takes one query row at a time, so score the test rows one by one.
predictions = [float(model.predict(x, k)) for x in X_test]

y_pred = np.array(predictions)
# r2_score is the coefficient of determination of a regression, not a
# classification accuracy, so the printed label names the metric actually used.
print("R2", round(r2_score(y_test, y_pred), 2))

Accuracy 0.77
