In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

dataset = pd.read_csv('C:/Users/d_tol/Desktop/L.3 - first term/ML/ML Sections/AmesHousing.csv')
columns = [
        'Lot Frontage','Overall Qual','Year Built','Year Remod/Add','Mas Vnr Area','Exter Qual','BsmtFin SF 1',
        'Total Bsmt SF','1st Flr SF','Gr Liv Area','Full Bath','Kitchen Qual','TotRms AbvGrd','Fireplaces',
        'Garage Yr Blt', 'Garage Cars', 'Garage Area','SalePrice'
        ]

num_columns = [
        'Lot Frontage','Year Built','Year Remod/Add','Mas Vnr Area','BsmtFin SF 1',
        'Total Bsmt SF','1st Flr SF','Gr Liv Area','Full Bath','TotRms AbvGrd','Fireplaces',
        'Garage Yr Blt', 'Garage Cars', 'Garage Area','SalePrice'
        ]

cat_columns = ['Overall Qual','Exter Qual', 'Kitchen Qual']
df = dataset[columns]
print(df.shape)
df.head()

(2930, 18)


Unnamed: 0,Lot Frontage,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Exter Qual,BsmtFin SF 1,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,Kitchen Qual,TotRms AbvGrd,Fireplaces,Garage Yr Blt,Garage Cars,Garage Area,SalePrice
0,141.0,6,1960,1960,112.0,TA,639.0,1080.0,1656,1656,1,TA,7,2,1960.0,2.0,528.0,215000
1,80.0,5,1961,1961,0.0,TA,468.0,882.0,896,896,1,TA,5,0,1961.0,1.0,730.0,105000
2,81.0,6,1958,1958,108.0,TA,923.0,1329.0,1329,1329,1,Gd,6,0,1958.0,1.0,312.0,172000
3,93.0,7,1968,1968,0.0,Gd,1065.0,2110.0,2110,2110,2,Ex,8,2,1968.0,2.0,522.0,244000
4,74.0,5,1997,1998,0.0,TA,791.0,928.0,928,1629,2,TA,6,1,1997.0,2.0,482.0,189900


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Lot Frontage    2440 non-null   float64
 1   Overall Qual    2930 non-null   int64  
 2   Year Built      2930 non-null   int64  
 3   Year Remod/Add  2930 non-null   int64  
 4   Mas Vnr Area    2907 non-null   float64
 5   Exter Qual      2930 non-null   object 
 6   BsmtFin SF 1    2929 non-null   float64
 7   Total Bsmt SF   2929 non-null   float64
 8   1st Flr SF      2930 non-null   int64  
 9   Gr Liv Area     2930 non-null   int64  
 10  Full Bath       2930 non-null   int64  
 11  Kitchen Qual    2930 non-null   object 
 12  TotRms AbvGrd   2930 non-null   int64  
 13  Fireplaces      2930 non-null   int64  
 14  Garage Yr Blt   2771 non-null   float64
 15  Garage Cars     2929 non-null   float64
 16  Garage Area     2929 non-null   float64
 17  SalePrice       2930 non-null   i

### Detecting NULL Values

In [3]:
print(df.isnull().sum())

Lot Frontage      490
Overall Qual        0
Year Built          0
Year Remod/Add      0
Mas Vnr Area       23
Exter Qual          0
BsmtFin SF 1        1
Total Bsmt SF       1
1st Flr SF          0
Gr Liv Area         0
Full Bath           0
Kitchen Qual        0
TotRms AbvGrd       0
Fireplaces          0
Garage Yr Blt     159
Garage Cars         1
Garage Area         1
SalePrice           0
dtype: int64


### Handle NULL Values

In [4]:
df[num_columns] = df[num_columns].fillna(df[num_columns].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[num_columns] = df[num_columns].fillna(df[num_columns].mean())


In [5]:
print(df.isnull().sum())

Lot Frontage      0
Overall Qual      0
Year Built        0
Year Remod/Add    0
Mas Vnr Area      0
Exter Qual        0
BsmtFin SF 1      0
Total Bsmt SF     0
1st Flr SF        0
Gr Liv Area       0
Full Bath         0
Kitchen Qual      0
TotRms AbvGrd     0
Fireplaces        0
Garage Yr Blt     0
Garage Cars       0
Garage Area       0
SalePrice         0
dtype: int64


### Handling Outliers

In [6]:
def remove_outliers_iqr(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

num_columns = [
        'Lot Frontage','Year Built','Year Remod/Add','Mas Vnr Area','BsmtFin SF 1',
        'Total Bsmt SF','1st Flr SF','Gr Liv Area','Full Bath','TotRms AbvGrd','Fireplaces',
        'Garage Yr Blt', 'Garage Cars', 'Garage Area','SalePrice'
        ]
df = remove_outliers_iqr(df, num_columns)
print("Shape after outlier removal:", df.shape)


Shape after outlier removal: (2210, 18)


### Scale Numerical Columns and Encode Categorical Columns

In [7]:
scaler = StandardScaler()
df[num_columns] = scaler.fit_transform(df[num_columns])

le = LabelEncoder()
df['Exter Qual'] = le.fit_transform(df['Exter Qual'])
df['Kitchen Qual'] = le.fit_transform(df['Kitchen Qual'])
df.head(20)

Unnamed: 0,Lot Frontage,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Exter Qual,BsmtFin SF 1,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,Kitchen Qual,TotRms AbvGrd,Fireplaces,Garage Yr Blt,Garage Cars,Garage Area,SalePrice
1,0.993321,5,-0.244027,-1.030369,-0.587375,3,0.179968,-0.408516,-0.656627,-1.301527,-0.927643,4,-0.909671,-0.864497,-0.593636,-0.94255,1.611457,-1.173917
2,1.069611,6,-0.342842,-1.171931,0.585191,3,1.367616,1.035865,0.832627,-0.155601,-0.927643,2,-0.140298,-0.864497,-0.714377,-0.94255,-0.674532,0.191613
4,0.535581,5,0.941755,0.71556,-0.587375,3,1.023068,-0.259878,-0.546567,0.638343,0.972348,4,-0.140298,0.793956,0.855248,0.527724,0.255177,0.556434
5,0.840741,6,0.974693,0.71556,-0.370233,3,0.529737,-0.26634,-0.553446,0.572181,0.972348,2,0.629076,0.793956,0.895495,0.527724,0.18955,0.670568
6,-1.981988,8,1.073508,0.857122,-0.587375,2,0.56628,1.064947,0.863582,-0.131783,0.972348,2,-0.140298,-0.864497,1.016235,0.527724,0.802064,1.037427
7,-1.829408,8,0.777063,0.432436,-0.587375,2,-0.355126,0.877532,0.664097,-0.285279,0.972348,2,-0.909671,-0.864497,0.654014,0.527724,0.38643,0.589044
8,-2.134568,8,0.875878,0.621185,-0.587375,2,2.038442,1.895385,1.819732,0.603939,0.972348,2,-0.909671,0.793956,0.774755,0.527724,0.944255,1.506191
9,-0.532479,7,1.007631,0.762747,-0.587375,3,-1.041613,-0.046613,-0.202628,1.101477,0.972348,2,0.629076,0.793956,0.935742,0.527724,0.036422,0.538091
10,0.611871,6,0.810001,0.526811,-0.587375,3,-1.041613,-0.793039,-1.114066,0.707152,0.972348,4,0.629076,0.793956,0.694261,0.527724,0.025484,0.271099
11,0.171265,6,0.777063,1.140245,-0.587375,3,1.398939,0.515629,0.344234,-0.531401,0.972348,4,-0.140298,-0.864497,0.654014,0.527724,-0.083894,0.456567


### Split The Data Into Training and Testing 

In [8]:
X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

print(X.shape)
print(y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

(2210, 17)
(2210,)


### Getting the best number of Neighbors to train the knn model

In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': range(1, 20)}
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best n_neighbors:", grid_search.best_params_['n_neighbors'])


Best n_neighbors: 11


### Train The Model

In [10]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

### Calculate The MSE, RMSE and R2 Score

In [11]:
lr_preds = lr_model.predict(X_test)

lr_mse = mean_squared_error(y_test, lr_preds)
lr_r2 = r2_score(y_test, lr_preds)
lr_rmse = np.sqrt(lr_mse)


print("Linear Regression: MSE =", lr_mse, " , RMSE = ", lr_rmse,", R2 =", lr_r2)
print("Train accuracy =", lr_model.score(X_train, y_train))
print("Test accuracy  =", lr_model.score(X_test, y_test))


Linear Regression: MSE = 0.1583920102719532  , RMSE =  0.3979849372425459 , R2 = 0.850066309507564
Train accuracy = 0.8489695768305543
Test accuracy  = 0.850066309507564


In [12]:
knn_model = KNeighborsRegressor(n_neighbors=11)
knn_model.fit(X_train, y_train)

knn_preds = knn_model.predict(X_test)

knn_mse = mean_squared_error(y_test, knn_preds)
knn_r2 = r2_score(y_test, knn_preds)
knn_rmse = np.sqrt(knn_mse)

print("KNN Regression: MSE =", knn_mse, ", RMSE =", knn_rmse, ", R2 =", knn_r2)
print("Train accuracy =", knn_model.score(X_train, y_train))
print("Test accuracy  =", knn_model.score(X_test, y_test))

KNN Regression: MSE = 0.17511389315816378 , RMSE = 0.4184661194865885 , R2 = 0.8342373948495132
Train accuracy = 0.8460721935267181
Test accuracy  = 0.8342373948495132
