In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [2]:
pd.set_option('display.max_Columns',None)
pd.options.display.float_format = '{:,.2f}'.format

**IMPORT DATSET**

In [4]:
df= pd.read_csv('house_price_regression_dataset.csv')
df

Unnamed: 0,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,House_Price
0,1360,2,1,1981,0.60,0,5,262382.85
1,4272,3,3,2016,4.75,1,6,985260.85
2,3592,1,2,2016,3.63,0,9,777977.39
3,966,1,2,1977,2.73,1,8,229698.92
4,4926,2,1,1993,4.70,0,8,1041740.86
...,...,...,...,...,...,...,...,...
995,3261,4,1,1978,2.17,2,10,701494.00
996,3179,1,2,1999,2.98,1,10,683723.16
997,2606,4,2,1962,4.06,0,2,572024.02
998,4723,5,2,1950,1.93,0,7,964865.30


In [5]:
df.shape

(1000, 8)

In [6]:
#pengecekan tipe data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Square_Footage        1000 non-null   int64  
 1   Num_Bedrooms          1000 non-null   int64  
 2   Num_Bathrooms         1000 non-null   int64  
 3   Year_Built            1000 non-null   int64  
 4   Lot_Size              1000 non-null   float64
 5   Garage_Size           1000 non-null   int64  
 6   Neighborhood_Quality  1000 non-null   int64  
 7   House_Price           1000 non-null   float64
dtypes: float64(2), int64(6)
memory usage: 62.6 KB


In [7]:
#pengecekan nilai kosong
df.isna().sum()

Unnamed: 0,0
Square_Footage,0
Num_Bedrooms,0
Num_Bathrooms,0
Year_Built,0
Lot_Size,0
Garage_Size,0
Neighborhood_Quality,0
House_Price,0


In [9]:
df.drop_duplicates(inplace=True)

In [10]:
df.describe()

Unnamed: 0,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,House_Price
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2815.42,2.99,1.97,1986.55,2.78,1.02,5.62,618861.02
std,1255.51,1.43,0.82,20.63,1.3,0.81,2.89,253568.06
min,503.0,1.0,1.0,1950.0,0.51,0.0,1.0,111626.85
25%,1749.5,2.0,1.0,1969.0,1.67,0.0,3.0,401648.23
50%,2862.5,3.0,2.0,1986.0,2.81,1.0,6.0,628267.29
75%,3849.5,4.0,3.0,2004.25,3.92,2.0,8.0,827141.28
max,4999.0,5.0,3.0,2022.0,4.99,2.0,10.0,1108236.84


In [11]:
df.corr(method='pearson')

Unnamed: 0,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,House_Price
Square_Footage,1.0,-0.04,-0.03,-0.02,0.09,0.03,-0.01,0.99
Num_Bedrooms,-0.04,1.0,0.02,-0.02,-0.01,0.11,-0.05,0.01
Num_Bathrooms,-0.03,0.02,1.0,-0.02,0.03,0.02,0.02,-0.0
Year_Built,-0.02,-0.02,-0.02,1.0,-0.06,-0.03,-0.01,0.05
Lot_Size,0.09,-0.01,0.03,-0.06,1.0,0.0,0.04,0.16
Garage_Size,0.03,0.11,0.02,-0.03,0.0,1.0,-0.01,0.05
Neighborhood_Quality,-0.01,-0.05,0.02,-0.01,0.04,-0.01,1.0,-0.01
House_Price,0.99,0.01,-0.0,0.05,0.16,0.05,-0.01,1.0


**DATA SPLITTING**

In [12]:
X = df.drop(columns=['House_Price'])
y = df.House_Price
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,shuffle=True, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape



((800, 7), (200, 7), (800,), (200,))

**PREDIKSI HARGA RUMAH MENGGUNAKAN LINEAR REGRESSI**

In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
#TRAINING, VALIDATING, TESTING
model = GridSearchCV(LinearRegression(),param_grid={}, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train,y_train)
model.score(X_train, y_train),model.score(X_test, y_test),model.best_score_

Fitting 3 folds for each of 1 candidates, totalling 3 fits


(0.9985375946918145, 0.9984263636823408, np.float64(0.9984954474304325))

**EVALUASI MODEL MENGGUNAKAN R2**

In [15]:
from sklearn.metrics import r2_score

In [16]:
y_pred = model.predict(X_test)
R2 = r2_score(y_test, y_pred)
print(R2)

0.9984263636823408


**PREDIKSI DATA BARU**

In [17]:
prediksi = {
    'Square_Footage': [1800, 2500, 3200, 1500, 4000, 2750, 2200, 3100, 2000, 3600],  # Luas rumah dalam sqft
    'Num_Bedrooms': [3, 4, 5, 2, 6, 4, 3, 5, 3, 5],  # Jumlah kamar tidur
    'Num_Bathrooms': [2, 3, 4, 1, 5, 3, 2, 4, 2, 4],  # Jumlah kamar mandi
    'Year_Built': [2005, 2010, 2018, 1995, 2022, 2015, 2008, 2020, 2012, 2019],  # Tahun dibangun
    'Lot_Size': [4000, 5000, 6000, 3000, 7000, 5500, 4500, 6200, 4800, 6800],  # Luas tanah dalam sqft
    'Garage_Size': [1, 2, 2, 1, 3, 2, 1, 2, 1, 3],  # Kapasitas garasi
    'Neighborhood_Quality': [7, 8, 9, 6, 10, 8, 7, 9, 7, 9]  # Kualitas lingkungan (misalnya skala 1-10)
}

# Ubah ke DataFrame
X_pred = pd.DataFrame(prediksi)

In [18]:
y_pred_ = model.predict(X_pred)

In [19]:
X_pred['Predicted_Price'] = y_pred_

In [20]:
X_pred

Unnamed: 0,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,Predicted_Price
0,1800,3,2,2005,4000,1,7,59935180.22
1,2500,4,3,2010,5000,2,8,74988886.45
2,3200,5,4,2018,6000,2,9,90040427.14
3,1500,2,1,1995,3000,1,6,44961457.75
4,4000,6,5,2022,7000,3,10,105113091.15
5,2750,4,3,2015,5500,2,8,82486424.65
6,2200,3,2,2008,4500,1,7,67460658.34
7,3100,5,4,2020,6200,2,9,92999539.77
8,2000,3,2,2012,4800,1,7,71890345.16
9,3600,5,4,2019,6800,3,9,102034679.65
