In [None]:
import warnings
# Silence only FutureWarning and DeprecationWarning so the notebook output stays
# readable. Other categories (for example pandas' SettingWithCopyWarning or numpy's
# RuntimeWarning) are left visible because they usually point at a real problem.
for _noisy_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings("ignore", category=_noisy_category)

In [None]:
import pandas as pd
# Load the raw housing listings into a DataFrame.
# NOTE(review): '/content/...' is an absolute path (it looks like a Google Colab
# location); adjust it when running the notebook elsewhere.
df = pd.read_csv('/content/VN_housing_dataset.csv')
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0.1,Unnamed: 0,Ngày,Địa chỉ,Quận,Huyện,Loại hình nhà ở,Giấy tờ pháp lý,Số tầng,Số phòng ngủ,Diện tích,Dài,Rộng,Giá/m2
0,0.0,2020-08-05,"Đường Hoàng Quốc Việt, Phường Nghĩa Đô, Quận C...",Quận Cầu Giấy,Phường Nghĩa Đô,"Nhà ngõ, hẻm",Đã có sổ,4.0,5 phòng,46 m²,,,"86,96 triệu/m²"
1,1.0,2020-08-05,"Đường Kim Giang, Phường Kim Giang, Quận Thanh ...",Quận Thanh Xuân,Phường Kim Giang,"Nhà mặt phố, mặt tiền",,,3 phòng,37 m²,,,"116,22 triệu/m²"
2,2.0,2020-08-05,"phố minh khai, Phường Minh Khai, Quận Hai Bà T...",Quận Hai Bà Trưng,Phường Minh Khai,"Nhà ngõ, hẻm",Đã có sổ,4.0,4 phòng,40 m²,10 m,4 m,65 triệu/m²
3,3.0,2020-08-05,"Đường Võng Thị, Phường Thụy Khuê, Quận Tây Hồ,...",Quận Tây Hồ,Phường Thụy Khuê,"Nhà ngõ, hẻm",Đã có sổ,,6 phòng,51 m²,12.75 m,4 m,100 triệu/m²
4,4.0,2020-08-05,"Đường Kim Giang, Phường Kim Giang, Quận Thanh ...",Quận Thanh Xuân,Phường Kim Giang,"Nhà ngõ, hẻm",,,4 phòng,36 m²,9 m,4 m,"86,11 triệu/m²"


# **Data preprocessing**

In [None]:
df.describe()

Unnamed: 0.1,Unnamed: 0
count,82496.0
mean,41247.5
std,23814.688241
min,0.0
25%,20623.75
50%,41247.5
75%,61871.25
max,82495.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82497 entries, 0 to 82496
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       82496 non-null  float64
 1   Ngày             82496 non-null  object 
 2   Địa chỉ          82449 non-null  object 
 3   Quận             82495 non-null  object 
 4   Huyện            82449 non-null  object 
 5   Loại hình nhà ở  82465 non-null  object 
 6   Giấy tờ pháp lý  53610 non-null  object 
 7   Số tầng          36399 non-null  object 
 8   Số phòng ngủ     82458 non-null  object 
 9   Diện tích        82495 non-null  object 
 10  Dài              19827 non-null  object 
 11  Rộng             35445 non-null  object 
 12  Giá/m2           82484 non-null  object 
dtypes: float64(1), object(12)
memory usage: 8.2+ MB


In [None]:
null_counts = df.isnull().sum()
null_counts

Unnamed: 0             1
Ngày                   1
Địa chỉ               48
Quận                   2
Huyện                 48
Loại hình nhà ở       32
Giấy tờ pháp lý    28887
Số tầng            46098
Số phòng ngủ          39
Diện tích              2
Dài                62670
Rộng               47052
Giá/m2                13
dtype: int64

In [None]:
df.shape

(82497, 13)

In [None]:
# Drop unnecessary column
df = df.drop('Unnamed: 0', axis=1)

In [None]:
# Rename columns
df_renamed = df.rename(columns = {"Ngày":"Date", "Địa chỉ":"Address", "Quận":"District",
                                  "Huyện":"Ward", "Loại hình nhà ở":"HouseType",
                                 "Giấy tờ pháp lý":"LegalPaper", "Số tầng":"NumberofFloors",
                                 "Số phòng ngủ":"NumberofBedrooms", "Diện tích":"SquaredMeterArea",
                                 "Dài":"Length", "Rộng":"Width", "Giá/m2":"Price/m2"})

In [None]:
df_renamed.insert(2, 'City', 'Hà Nội')

In [None]:
# Replace the null values in LegalPaper column.
# The result is assigned back instead of calling fillna(inplace=True) on the selected
# column: the in-place form acts on an intermediate object and is not guaranteed to
# update df_renamed (it is a no-op when pandas copy-on-write is active).
df_renamed['LegalPaper'] = df_renamed['LegalPaper'].fillna('Không rõ tình trạng')

In [None]:
# Replace the null values in HouseType column.
# Assign the filled column back rather than using fillna(inplace=True) on a column
# selection, which is not guaranteed to modify df_renamed.
df_renamed['HouseType'] = df_renamed['HouseType'].fillna('Không rõ')

In [None]:
# Replace the null values in Address column
def fill_address_nulls(df_renamed):
    """Fill missing 'Address' values in place from the location columns.

    For every row whose 'Address' is null, the address is rebuilt as the
    comma-separated values of 'Ward', 'District' and 'City' (in that order).
    Components that are themselves missing are skipped, so the text "nan" or
    "None" is never written into an address. If all three components are
    missing the address is left null.

    Parameters
    ----------
    df_renamed : pandas.DataFrame
        Frame with 'Address', 'Ward', 'District' and 'City' columns. It is
        modified in place.

    Returns
    -------
    None
    """
    missing_address = df_renamed['Address'].isna()
    if not missing_address.any():
        # Nothing to fill; also avoids a row-wise apply on an empty selection.
        return

    def _join_known_parts(row):
        # Keep only the location components that are actually present.
        known = [str(part) for part in row if pd.notna(part)]
        return ', '.join(known) if known else None

    location_parts = df_renamed.loc[missing_address, ['Ward', 'District', 'City']]
    # The right-hand side shares the index of the selected rows, so .loc aligns it.
    df_renamed.loc[missing_address, 'Address'] = location_parts.apply(_join_known_parts, axis=1)
fill_address_nulls(df_renamed)

In [None]:
# Replace the null values in Ward column with the row's District value.
# Assign the filled column back rather than using fillna(inplace=True) on a column
# selection, which is not guaranteed to modify df_renamed.
df_renamed['Ward'] = df_renamed['Ward'].fillna(df_renamed['District'])

In [None]:
# Replace the null values in NumberofBedrooms column with the most frequent value.
mode_value = df_renamed['NumberofBedrooms'].mode()[0]
# Assign the filled column back rather than using fillna(inplace=True) on a column
# selection, which is not guaranteed to modify df_renamed.
df_renamed['NumberofBedrooms'] = df_renamed['NumberofBedrooms'].fillna(mode_value)

In [None]:
# Drop the null values from other columns
# dropna() with its default arguments removes every row that still has a missing
# value in any column (the columns not filled above, e.g. NumberofFloors, Length, Width).
df_renamed = df_renamed.dropna()
# Preview the rows that remain.
df_renamed.head()

Unnamed: 0,Date,Address,City,District,Ward,HouseType,LegalPaper,NumberofFloors,NumberofBedrooms,SquaredMeterArea,Length,Width,Price/m2
2,2020-08-05,"phố minh khai, Phường Minh Khai, Quận Hai Bà T...",Hà Nội,Quận Hai Bà Trưng,Phường Minh Khai,"Nhà ngõ, hẻm",Đã có sổ,4,4 phòng,40 m²,10 m,4 m,65 triệu/m²
15,2020-08-05,"Đường Bồ Đề, Phường Bồ Đề, Quận Long Biên, Hà Nội",Hà Nội,Quận Long Biên,Phường Bồ Đề,"Nhà ngõ, hẻm",Đã có sổ,5,4 phòng,52 m²,12 m,4.2 m,"93,27 triệu/m²"
24,2020-08-04,"Đường Tố Hữu, Phường La Khê, Quận Hà Đông, Hà Nội",Hà Nội,Quận Hà Đông,Phường La Khê,"Nhà mặt phố, mặt tiền",Đã có sổ,5,5 phòng,90 m²,18 m,5 m,"108,89 triệu/m²"
29,2020-08-04,"180/61/5, Đường Tây Mỗ, Phường Tây Mỗ, Quận Na...",Hà Nội,Quận Nam Từ Liêm,Phường Tây Mỗ,"Nhà ngõ, hẻm",Đã có sổ,4,3 phòng,32 m²,6.6 m,4.5 m,"60,94 triệu/m²"
34,2020-08-04,"Đường Tả Thanh Oai, Xã Tả Thanh Oai, Huyện Tha...",Hà Nội,Huyện Thanh Trì,Xã Tả Thanh Oai,"Nhà ngõ, hẻm",Đã có sổ,3,2 phòng,42 m²,11 m,4 m,"29,76 triệu/m²"


In [None]:
df_renamed.shape

(11568, 13)

In [None]:
# Confirm that no null values remain in any column.
# The counts get their own name so the raw DataFrame `df` loaded at the top of the
# notebook is not overwritten by a Series.
remaining_nulls = df_renamed.isnull().sum()
remaining_nulls

Date                0
Address             0
City                0
District            0
Ward                0
HouseType           0
LegalPaper          0
NumberofFloors      0
NumberofBedrooms    0
SquaredMeterArea    0
Length              0
Width               0
Price/m2            0
dtype: int64

In [None]:
# Renumber the rows 0..n-1 after the drops. Without drop=True the previous row labels
# are kept as a new 'index' column, which is removed again before modelling.
df_renamed = df_renamed.reset_index()

In [None]:
# Exclude rows whose floor or bedroom count is the open-ended label "more than 10":
# these values are text without a number and could not be converted to float below.
df_renamed = df_renamed[df_renamed['NumberofFloors'] != 'Nhiều hơn 10']
df_renamed = df_renamed[df_renamed['NumberofBedrooms'] != 'nhiều hơn 10 phòng']

In [None]:
# Strip the administrative prefixes from District ('Quận ', 'Huyện ', 'Thị xã ').
df_renamed['District'] = df_renamed['District'].str.replace('Quận ','').str.strip()
df_renamed['District'] = df_renamed['District'].str.replace('Huyện ','').str.strip()
df_renamed['District'] = df_renamed['District'].str.replace('Thị xã ','').str.strip()
# Only the 'Phường ' prefix is stripped from Ward; other prefixes (e.g. 'Xã ') are kept.
df_renamed['Ward'] = df_renamed['Ward'].str.replace('Phường ','').str.strip()
# Convert the numeric-looking text columns to float after removing their unit suffix.
df_renamed['NumberofFloors'] = df_renamed['NumberofFloors'].str.strip().astype(float)
df_renamed['NumberofBedrooms'] = df_renamed['NumberofBedrooms'].str.replace(' phòng','').str.strip().astype(float)
df_renamed['SquaredMeterArea'] = df_renamed['SquaredMeterArea'].str.replace(' m²','').str.strip().astype(float)
# NOTE(review): these conversions assume '.' is the decimal separator in Length/Width
# (as in '12.75 m'); a value written with ',' would fail in astype(float) — confirm.
df_renamed['Length'] = df_renamed['Length'].str.replace(' m','').str.strip().astype(float)
df_renamed['Width'] = df_renamed['Width'].str.replace(' m','').str.strip().astype(float)

In [None]:
# Convert 'Price/m2' from text to a float number of MILLION VND per square metre.
# Raw values look like '86,96 triệu/m²': '.' is a thousands separator and ',' is the
# decimal separator. Each supported unit suffix is paired with the factor that
# converts it to millions.
PRICE_UNIT_TO_MILLIONS = (
    (' tỷ/m²', 1000),        # billions -> millions
    (' triệu/m²', 1),        # already in millions
    (' đ/m²', 0.000001),     # đồng -> millions
)

raw_price = df_renamed['Price/m2']
# Skip the conversion when the column is already numeric, so the cell can be re-run.
if not pd.api.types.is_numeric_dtype(raw_price):
    price_text = raw_price.astype(str)
    # Build the result in a separate float Series instead of writing floats into the
    # text column piece by piece.
    price_in_millions = pd.Series(float('nan'), index=df_renamed.index, dtype='float64')

    for unit_suffix, millions_per_unit in PRICE_UNIT_TO_MILLIONS:
        has_unit = raw_price.notna() & price_text.str.contains(unit_suffix, regex=False)
        if not has_unit.any():
            continue
        # regex=False makes '.' a literal dot on every pandas version (as a regular
        # expression it would match any character).
        price_in_millions.loc[has_unit] = (
            price_text[has_unit]
            .str.replace(unit_suffix, '', regex=False)
            .str.replace('.', '', regex=False)   # Remove thousand separators
            .str.replace(',', '.', regex=False)  # Replace decimal separators
            .astype(float) * millions_per_unit
        )

    # Fail loudly on values with an unknown unit instead of leaving text in the column.
    unparsed = raw_price.notna() & price_in_millions.isna()
    if unparsed.any():
        examples = raw_price[unparsed].unique()[:5].tolist()
        raise ValueError(f"Unrecognised Price/m2 format, e.g. {examples}")

    df_renamed['Price/m2'] = price_in_millions

  .str.replace('.', '')  # Remove thousand separators
  .str.replace('.', '')  # Remove thousand separators
  .str.replace('.', '')  # Remove thousand separators


In [None]:
df_renamed.head()

Unnamed: 0,index,Date,Address,City,District,Ward,HouseType,LegalPaper,NumberofFloors,NumberofBedrooms,SquaredMeterArea,Length,Width,Price/m2
0,2,2020-08-05,"phố minh khai, Phường Minh Khai, Quận Hai Bà T...",Hà Nội,Hai Bà Trưng,Minh Khai,"Nhà ngõ, hẻm",Đã có sổ,4.0,4.0,40.0,10.0,4.0,65.0
1,15,2020-08-05,"Đường Bồ Đề, Phường Bồ Đề, Quận Long Biên, Hà Nội",Hà Nội,Long Biên,Bồ Đề,"Nhà ngõ, hẻm",Đã có sổ,5.0,4.0,52.0,12.0,4.2,93.27
2,24,2020-08-04,"Đường Tố Hữu, Phường La Khê, Quận Hà Đông, Hà Nội",Hà Nội,Hà Đông,La Khê,"Nhà mặt phố, mặt tiền",Đã có sổ,5.0,5.0,90.0,18.0,5.0,108.89
3,29,2020-08-04,"180/61/5, Đường Tây Mỗ, Phường Tây Mỗ, Quận Na...",Hà Nội,Nam Từ Liêm,Tây Mỗ,"Nhà ngõ, hẻm",Đã có sổ,4.0,3.0,32.0,6.6,4.5,60.94
4,34,2020-08-04,"Đường Tả Thanh Oai, Xã Tả Thanh Oai, Huyện Tha...",Hà Nội,Thanh Trì,Xã Tả Thanh Oai,"Nhà ngõ, hẻm",Đã có sổ,3.0,2.0,42.0,11.0,4.0,29.76


In [None]:
!pip install plotly



In [None]:
# Identify outlier using boxplot
import plotly.express as px
fig = px.box(df_renamed, y='Price/m2', title = 'Boxplot of Price/m2')
fig.show()

In [None]:
# Remove outlier from 'Price/m2' column using IQR method
Q1 = df_renamed['Price/m2'].quantile(0.25)
Q3 = df_renamed['Price/m2'].quantile(0.75)
IQR = Q3 - Q1

In [None]:
# Flag rows whose price lies below Q1 - 1.5*IQR or above Q3 + 50*IQR.
# NOTE(review): the upper fence uses 50*IQR while the lower fence uses 1.5*IQR —
# confirm that this asymmetric multiplier is intentional.
outlier_condition = ((df_renamed['Price/m2'] < (Q1 - 1.5 * IQR)) | (df_renamed['Price/m2'] > (Q3 + 50 * IQR)))

In [None]:
# Keep the non-outlier rows as an independent copy, so that later column assignments
# on df_final do not operate on a slice of df_renamed (avoids SettingWithCopyWarning).
df_final = df_renamed[~outlier_condition].copy()

In [None]:
df_final.shape

(11290, 14)

In [None]:
# Check for outlier again
fig = px.box(df_final, y='Price/m2', title = 'Boxplot of Price/m2')
fig.show()

In [None]:
pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.7-py3-none-any.whl (235 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/235.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/235.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.7


In [None]:
import unidecode
def convert_accented_characters(row):
    """Return `row` passed through `unidecode.unidecode` (e.g. 'Hà Đông' -> 'Ha Dong')."""
    transliterated = unidecode.unidecode(row)
    return transliterated

In [None]:
df_final['District'] = df_final['District'].apply(convert_accented_characters)
df_final.head()

Unnamed: 0,index,Date,Address,City,District,Ward,HouseType,LegalPaper,NumberofFloors,NumberofBedrooms,SquaredMeterArea,Length,Width,Price/m2
0,2,2020-08-05,"phố minh khai, Phường Minh Khai, Quận Hai Bà T...",Hà Nội,Hai Ba Trung,Minh Khai,"Nhà ngõ, hẻm",Đã có sổ,4.0,4.0,40.0,10.0,4.0,65.0
1,15,2020-08-05,"Đường Bồ Đề, Phường Bồ Đề, Quận Long Biên, Hà Nội",Hà Nội,Long Bien,Bồ Đề,"Nhà ngõ, hẻm",Đã có sổ,5.0,4.0,52.0,12.0,4.2,93.27
2,24,2020-08-04,"Đường Tố Hữu, Phường La Khê, Quận Hà Đông, Hà Nội",Hà Nội,Ha Dong,La Khê,"Nhà mặt phố, mặt tiền",Đã có sổ,5.0,5.0,90.0,18.0,5.0,108.89
3,29,2020-08-04,"180/61/5, Đường Tây Mỗ, Phường Tây Mỗ, Quận Na...",Hà Nội,Nam Tu Liem,Tây Mỗ,"Nhà ngõ, hẻm",Đã có sổ,4.0,3.0,32.0,6.6,4.5,60.94
4,34,2020-08-04,"Đường Tả Thanh Oai, Xã Tả Thanh Oai, Huyện Tha...",Hà Nội,Thanh Tri,Xã Tả Thanh Oai,"Nhà ngõ, hẻm",Đã có sổ,3.0,2.0,42.0,11.0,4.0,29.76


# **Data Mining - Housing Price Prediction**

In [None]:
!pip install tensorflow



In [None]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0


In [None]:
# Import necessary libraries
import warnings
import os
import numpy as np
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasRegressor

In [None]:
# One-hot encode every categorical column as 0/1 integer indicator columns
dummy_type_of_housing = pd.get_dummies(df_final['HouseType'], prefix='housing_type').astype(int)
dummy_legal_paper = pd.get_dummies(df_final['LegalPaper'], prefix='legal_paper').astype(int)
dummy_district = pd.get_dummies(df_final['District'], prefix='district').astype(int)
dummy_ward = pd.get_dummies(df_final['Ward'], prefix='ward').astype(int)

# Append the indicator columns, then remove the original text/identifier columns
frames_to_join = [df_final, dummy_type_of_housing, dummy_legal_paper, dummy_district, dummy_ward]
columns_to_discard = ['index', 'Date', 'Address', 'District', 'Ward', 'HouseType', 'LegalPaper', 'City']
df_cleaned = pd.concat(frames_to_join, axis=1).drop(columns=columns_to_discard)
df_cleaned.head()

Unnamed: 0,NumberofFloors,NumberofBedrooms,SquaredMeterArea,Length,Width,Price/m2,housing_type_Nhà biệt thự,"housing_type_Nhà mặt phố, mặt tiền","housing_type_Nhà ngõ, hẻm",housing_type_Nhà phố liền kề,...,ward_Đại Mỗ,ward_Định Công,ward_Đống Mác,ward_Đồng Mai,ward_Đồng Nhân,ward_Đồng Tâm,ward_Đồng Xuân,ward_Đội Cấn,ward_Đức Giang,ward_Đức Thắng
0,4.0,4.0,40.0,10.0,4.0,65.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,5.0,4.0,52.0,12.0,4.2,93.27,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,5.0,5.0,90.0,18.0,5.0,108.89,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.0,3.0,32.0,6.6,4.5,60.94,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,3.0,2.0,42.0,11.0,4.0,29.76,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### Removing outlier using IQR method

We remove outliers from the dataset before training an Artificial Neural Network (ANN) for regression. Regression models are sensitive to outliers, and a significant number of them can adversely affect the accuracy of our results.

In [None]:
def remove_outlier_IQR(df, series):
    """Return the rows of `df` whose `series` column lies within the 1.5*IQR fences.

    A row is removed when its value is below Q1 - 1.5*IQR or above Q3 + 1.5*IQR,
    where Q1 and Q3 are the 25% and 75% quantiles of the column. `df` itself is
    not modified.
    """
    values = df[series]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    lower_fence = lower_quartile - 1.5 * spread
    upper_fence = upper_quartile + 1.5 * spread
    is_outlier = (values < lower_fence) | (values > upper_fence)
    return df[~is_outlier]

# Apply the IQR filter to one numeric column after another; each pass works on the
# rows that survived the previous pass.
columns_to_remove_outliers = [
    'NumberofFloors', 'NumberofBedrooms', 'SquaredMeterArea',
    'Length', 'Width', 'Price/m2',
]
removed_outliers = df_cleaned
for column in columns_to_remove_outliers:
    removed_outliers = remove_outlier_IQR(removed_outliers, column)

print(f"The final length of the dataset is {len(removed_outliers)} rows.")

The final length of the dataset is 7389 rows.


### Training an Artificial Neural Network (ANN) for Regression

In [None]:
housing = removed_outliers
# Separate predictors and response (price) variables
X = housing.loc[:, housing.columns != 'Price/m2']
y = housing[['Price/m2']]
to_be_scaled = ['NumberofFloors', 'NumberofBedrooms', 'SquaredMeterArea', 'Length', 'Width']

# Choose the train/test rows FIRST (by position) so that the scalers below learn their
# mean/std from the training rows only. Fitting them on all rows would leak test-set
# statistics into training.
row_positions = np.arange(len(X))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=2032)

# Initiate scaler
PredictorScaler = StandardScaler()
TargetVarScaler = StandardScaler()

# Storing the fit object for reference and reverse the scaling later
PredictorScalerFit = PredictorScaler.fit(X.iloc[train_rows][to_be_scaled])
TargetVarScalerFit = TargetVarScaler.fit(y.iloc[train_rows])

# Generating the standardized values of X and y.
# X_scaled is a copy, so the unscaled predictors in X stay available.
X_scaled = X.copy()
X_scaled[to_be_scaled] = PredictorScalerFit.transform(X[to_be_scaled])
y_scaled = TargetVarScalerFit.transform(y)

X_array = np.array(X_scaled.values).astype("float32")
y_array = np.array(y_scaled).astype("float32")

X_train, X_test = X_array[train_rows], X_array[test_rows]
y_train, y_test = y_array[train_rows], y_array[test_rows]

# Check to see if all train and test arrays have correct dimensions
assert X_train.shape[0] == y_train.shape[0], "X_train and y_train differ in row count"
assert X_test.shape[0] == y_test.shape[0], "X_test and y_test differ in row count"
assert X_train.shape[1] == X_test.shape[1], "train and test predictors differ in column count"
assert y_train.shape[1] == y_test.shape[1], "train and test targets differ in column count"
print("All train and test sets have correct dimensions.")

All train and test sets have correct dimensions.


### Finding best parameters for the ANN using Grid Search

In [None]:
# Turn off TensorFlow messages and warnings
# NOTE(review): tensorflow.keras is already imported in an earlier cell. If
# TF_CPP_MIN_LOG_LEVEL is only read when TensorFlow is loaded, setting it here has no
# effect — confirm, and move these lines above the TensorFlow import if so.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["KMP_SETTINGS"] = "false"

# Factory for the base network used by the grid search
def create_regression_ANN(optimizer='adam'):
    """Build and compile the regression network.

    The network has one hidden Dense layer (10 units, ReLU) that takes
    `X_train.shape[1]` inputs, followed by a single-unit Dense output layer. It is
    compiled with mean squared error loss and the given `optimizer`.
    """
    hidden_layer = Dense(units=10, input_dim=X_train.shape[1],
                         kernel_initializer='normal', activation='relu')
    output_layer = Dense(1, kernel_initializer='normal')

    network = Sequential()
    for layer in (hidden_layer, output_layer):
        network.add(layer)
    network.compile(loss='mean_squared_error', optimizer=optimizer)
    return network

# Create a dictionary for trial parameters
# 4 batch sizes x 3 epoch counts x 2 optimizers = 24 combinations.
ANN_params = {
    'batch_size': [10, 20, 30, 50],
    'epochs': [10, 20, 50],
    'optimizer': ['adam', 'rmsprop']
}

# Wrap the model factory so scikit-learn's search utilities can drive it.
# NOTE(review): SciKeras documents `model=` as the current name for `build_fn` — confirm
# against the installed version.
ANN_trial = KerasRegressor(build_fn=create_regression_ANN, verbose=0)

# Initiate the grid search and store best parameters for later reference
# With cv=3 each combination is trained three times (72 fits in total); n_jobs=-1
# runs the fits in parallel on all available cores.
# NOTE(review): no TensorFlow random seed is set in this cell, so the selected
# parameters may differ between runs — confirm whether that is acceptable.
ANN_grid_search = GridSearchCV(estimator=ANN_trial, param_grid=ANN_params,
                               cv=3, n_jobs=-1).fit(X_train, y_train, verbose=0)
ANN_best_params = ANN_grid_search.best_params_

# Showing the best parameters
print(ANN_best_params)

{'batch_size': 10, 'epochs': 10, 'optimizer': 'rmsprop'}


In [None]:
# Fitting the ANN to the Training set
# Build the network with create_regression_ANN (the factory used by the grid search)
# instead of repeating the layer definitions, so the final model always has the same
# architecture as the one that was tuned.
ANN = create_regression_ANN(optimizer=ANN_best_params['optimizer'])
ANN.fit(X_train, y_train,
        batch_size=int(ANN_best_params['batch_size']),
        epochs=int(ANN_best_params['epochs']),
        verbose=0)

ANN.summary()

Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_40 (Dense)            (None, 10)                2820      
                                                                 
 dense_41 (Dense)            (None, 1)                 11        
                                                                 
Total params: 2831 (11.06 KB)
Trainable params: 2831 (11.06 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Generating Predictions on testing data
ANN_predictions = ANN.predict(X_test)

# Scaling the predicted Price data back to original price scale
ANN_predictions = TargetVarScalerFit.inverse_transform(ANN_predictions)

# Scaling the y_test Price data back to original price scale
y_test_orig = TargetVarScalerFit.inverse_transform(y_test)

# Scaling the test data back to original scale.
# Only the standardized columns are inverse-transformed. Their count comes from
# `to_be_scaled` instead of a hard-coded 5, and the assert guards the assumption that
# they are the leading columns of X.
n_scaled = len(to_be_scaled)
assert list(X.columns[:n_scaled]) == list(to_be_scaled), "scaled columns must come first in X"
Test_Data = np.concatenate(
    (PredictorScalerFit.inverse_transform(X_test[:, :n_scaled]), X_test[:, n_scaled:]),
    axis=1,
)

# Recreating the dataset, now with predicted price using the ANN model
TestingData = pd.DataFrame(data=Test_Data, columns=X.columns)
# The scaler outputs have shape (n, 1); flatten them before storing them as columns.
TestingData['Price'] = y_test_orig.ravel()
TestingData['ANN_predictions'] = ANN_predictions.ravel()

TestingData[['Price', 'ANN_predictions']].head(10)



Unnamed: 0,Price,ANN_predictions
0,75.0,79.67907
1,86.669998,78.265526
2,118.75,91.616554
3,71.879997,73.192772
4,83.330002,108.442963
5,114.580002,96.999878
6,110.0,78.05687
7,77.5,70.089722
8,65.849998,79.295868
9,90.629997,112.134949


In [None]:
# Define a function to evaluate the predictions
def Accuracy_Score(orig, pred):
    """Return 100 minus the mean absolute percentage error (MAPE) of `pred`.

    Parameters
    ----------
    orig : array-like
        Actual values.
    pred : array-like
        Predicted values, in the same order as `orig`.

    Returns
    -------
    float
        ``100 - MAPE``, where MAPE is expressed in percent.

    Notes
    -----
    Both inputs are converted to flat float arrays and compared by position. This
    prevents pandas from aligning two Series on their index labels, and prevents
    numpy from broadcasting an (n, 1) array against an (n,) array.
    The percentage error is taken relative to ``abs(orig)``, so negative actual
    values do not turn errors into negative percentages.
    A zero in `orig` makes the result non-finite (division by zero).
    """
    actual = np.asarray(orig, dtype=float).ravel()
    predicted = np.asarray(pred, dtype=float).ravel()
    MAPE = np.mean(100 * (np.abs(actual - predicted) / np.abs(actual)))
    return 100 - MAPE

In [None]:
# Showing scores for ANN
print("Accuracy for the ANN model is:", str(Accuracy_Score(TestingData['Price'], TestingData['ANN_predictions'])))

Accuracy for the ANN model is: 83.68039131164551


# **Multiple Linear Regression**

In [None]:
import pandas as pd
import statsmodels.api as sm

In [None]:
housing.dtypes

NumberofFloors      float64
NumberofBedrooms    float64
SquaredMeterArea    float64
Length              float64
Width               float64
                     ...   
ward_Đồng Tâm         int64
ward_Đồng Xuân        int64
ward_Đội Cấn          int64
ward_Đức Giang        int64
ward_Đức Thắng        int64
Length: 282, dtype: object

In [None]:
non_numeric_columns = housing.select_dtypes(include=['object']).columns
if len(non_numeric_columns) > 0:
    print("Non-numeric columns:", non_numeric_columns)
else:
    print("All columns are numeric.")

All columns are numeric.


In [None]:
housing['Price/m2'] = pd.to_numeric(housing['Price/m2'], errors='coerce')

In [None]:
housing = removed_outliers
X = housing.loc[:, housing.columns != 'Price/m2']
y = housing[['Price/m2']]

In [None]:
# Adding a constant to the model (intercept)
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Print out the statistics
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               Price/m2   R-squared:                       0.467
Model:                            OLS   Adj. R-squared:                  0.452
Method:                 Least Squares   F-statistic:                     30.86
Date:                Sat, 23 Dec 2023   Prob (F-statistic):               0.00
Time:                        06:32:41   Log-Likelihood:                -32217.
No. Observations:                7389   AIC:                         6.484e+04
Df Residuals:                    7184   BIC:                         6.626e+04
Df Model:                         204                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
cons

In [None]:
results_df = pd.DataFrame({
    'coef': model.params,
    'p-value': model.pvalues
})

significant_variables = results_df[results_df['p-value'] < 0.05]

significant_variables

Unnamed: 0,coef,p-value
const,28.076483,2.347051e-38
NumberofFloors,6.796552,1.036286e-82
NumberofBedrooms,1.613933,6.244761e-06
SquaredMeterArea,-0.321304,8.421692e-11
Length,0.877533,4.584781e-05
...,...,...
ward_Yết Kiêu,12.106770,1.878036e-02
ward_Ô Chợ Dừa,7.762256,1.762126e-03
ward_Điện Biên,43.915492,4.838308e-10
ward_Đồng Mai,-32.492775,7.574196e-08


In [None]:
significant_variables.to_excel('/content/MultipleLinearRegressionResult.xlsx')