**import libraries**

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score

**pre-processing**

In [13]:
df = pd.read_csv('train.csv')
print(df.head())

print(df.isnull().sum())
df['TBath'] = df['FullBath'] + df['HalfBath']

x = df[['LotArea', 'BedroomAbvGr', 'TBath']]
y = df['SalePrice']

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

poly = PolynomialFeatures(degree=2, include_bias=False)
x_poly = poly.fit_transform(x_scaled)

x_train, x_test, y_train, y_test = train_test_split(x_poly, y, test_size=0.2, random_state=42)

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [14]:
print(x)
print(x_poly)
print(y)

      LotArea  BedroomAbvGr  TBath
0        8450             3      3
1        9600             3      2
2       11250             3      3
3        9550             3      1
4       14260             4      3
...       ...           ...    ...
1455     7917             3      3
1456    13175             3      2
1457     9042             4      2
1458     9717             2      1
1459     9937             3      2

[1460 rows x 3 columns]
[[-0.20714171  0.16377912  1.32385068 ...  0.0268236   0.2168191
   1.75258063]
 [-0.09188637  0.16377912  0.06550303 ...  0.0268236   0.01072803
   0.00429065]
 [ 0.07347998  0.16377912  1.32385068 ...  0.0268236   0.2168191
   1.75258063]
 ...
 [-0.14781027  1.39002276  0.06550303 ...  1.93216328  0.0910507
   0.00429065]
 [-0.08016039 -1.06246453 -1.19284462 ...  1.12883087  1.2673551
   1.4228783 ]
 [-0.05811155  0.16377912  0.06550303 ...  0.0268236   0.01072803
   0.00429065]]
0       208500
1       181500
2       223500
3       140000
4      

**linear regression model**

In [15]:
model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

**evaluation**

In [16]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 4502939620.881385
R-squared: 0.4129402714760536


**cross-validation and prediction**

In [35]:
cvs = cross_val_score(model, x_poly, y, cv=5)
print("Cross-Validation Scores:", cvs)
print("Mean Cross-Validation Score:", cvs.mean())

new_data = pd.DataFrame({'LotArea': [1500], 'BedroomAbvGr': [2], 'TBath':[1]})

new_datas = scaler.transform(new_data)
new_datap = poly.transform(new_datas)

prediction = model.predict(new_datap)

formatted_prediction = "${:,.3f}".format(prediction[0])
print("Predicted Price:", formatted_prediction)

Cross-Validation Scores: [0.45858925 0.20540491 0.43593608 0.41292612 0.32072538]
Mean Cross-Validation Score: 0.3667163491724095
Predicted Price: $105,253.957
