In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:

# Load the raw listings. The file is decoded as GBK; low_memory=False turns
# off pandas' chunked parsing so each column gets a single inferred dtype.
df = pd.read_csv('house_prices.csv', encoding='gbk', low_memory=False)
print('DATA', df.shape)

# Step 1: drop 'DOM' (per the author's note, more than 50% of it is missing).
df = df.drop(columns=['DOM'])

# Step 2: drop rows with any missing value, then rows whose construction year
# is the placeholder '未知' ("unknown").
df = df.dropna()
df = df[df['constructionTime'] != '未知']

# Step 3: drop the columns that are not used as model features.
df = df.drop(columns=['kitchen', 'bathRoom', 'drawingRoom', 'url', 'id', 'Cid',
                      'floor', 'buildingType', 'ladderRatio'])

# Step 4: make 'livingRoom' numeric and limit it to the range 1..4.
# errors='coerce' turns unparseable entries into NaN. Those rows are removed
# here because a later cell casts this column to int, which raises on NaN.
# The .copy() yields an independent frame so the column assignments that
# follow do not write into a slice of the filtered data
# (SettingWithCopyWarning).
living_rooms = pd.to_numeric(df['livingRoom'], errors='coerce')
df = df[living_rooms.notna()].copy()
# Series assignment aligns on the row index, so dropped rows are ignored.
df['livingRoom'] = living_rooms.clip(lower=1, upper=4)
print("DATA", df.shape)

# Create the 'distance' feature: the great-circle distance (haversine formula)
# between each listing and a fixed reference point.
from math import radians, cos, sin, asin, sqrt

def haversine(lat1, lon1, lat2=39.916668, lon2=116.383331):
    """Return the great-circle distance in kilometres between two points.

    Coordinates are given in decimal degrees. Every argument may be a scalar
    or an array-like (list, NumPy array, pandas Series); array-likes are
    handled element-wise, so whole columns can be passed in a single call.

    Args:
        lat1: Latitude of the first point(s).
        lon1: Longitude of the first point(s).
        lat2: Latitude of the second point(s). Defaults to the notebook's
            fixed reference latitude.
        lon2: Longitude of the second point(s). Defaults to the notebook's
            fixed reference longitude.

    Returns:
        The distance in kilometres on a sphere of radius 6371 km: a NumPy
        float for scalar input, an array/Series for array-like input.
    """
    # Convert latitude and longitude from degrees to radians.
    lat1, lon1, lat2, lon2 = (np.radians(value) for value in (lat1, lon1, lat2, lon2))

    # Haversine formula.
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (for
    # example for near-antipodal points), where sqrt/arcsin are undefined.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return c * r

# Distance (km) from every listing to haversine's default reference point,
# evaluated one row at a time.
df['distance'] = df.apply(lambda row: haversine(row['Lat'], row['Lng']), axis=1)

# Building age relative to 2023; the construction year itself is not kept.
df['age'] = 2023 - df['constructionTime'].astype(int)
df = df.drop(columns='constructionTime')

# Floor implausibly small values of 'price' and 'square'.
min_price = 10000
min_square = 20
df['price'] = df['price'].clip(lower=min_price)
df['square'] = df['square'].clip(lower=min_square)

print('DATA', df.shape)

# Keep only the year of the trade date.
df['tradeTime'] = pd.DatetimeIndex(df['tradeTime']).year

# Cast the discrete columns to int in a single pass.
# NOTE(review): casting 'totalPrice' (the later prediction target) to int
# truncates any fractional part - confirm this is intended.
df = df.astype(dict.fromkeys(
    [
        'livingRoom',
        'tradeTime',
        'renovationCondition',
        'buildingStructure',
        'elevator',
        'fiveYearsProperty',
        'subway',
        'followers',
        'totalPrice',
        'age',
    ],
    int,
))

# Renumber the rows 0..n-1, discarding the old index.
df = df.reset_index(drop=True)
print("DATA", df.shape)

# Final column types (displayed as the cell's output).
df.dtypes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The target is 'totalPrice'; every other remaining column is a feature.
# NOTE(review): 'price' and 'square' both stay in X. If 'price' is a
# per-area unit price, the two together determine 'totalPrice' and leak the
# target - confirm against the dataset description.
y = df['totalPrice']
X = df.drop(columns='totalPrice')

# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training rows only
# and the same transform is then applied to both subsets.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:

# Seed NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout masks and batch shuffling are repeatable when the
# notebook is re-run from a fresh kernel.
np.random.seed(42)
tf.random.set_seed(42)

# Fully connected regression network: three ReLU hidden layers (128, 64, 32
# units) with 50% dropout after the first two, and a single linear output.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1)
])


In [None]:

# Train with the Adam optimizer, minimising mean squared error.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
)


In [None]:

# Stop once validation loss has not improved for 10 epochs and roll the model
# back to the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)


In [None]:

# Fit for at most 100 epochs in batches of 32, holding back 20% of the
# training data for validation; early stopping may end training sooner.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[early_stopping],
    verbose=2,
)


In [None]:
from sklearn.metrics import mean_squared_log_error
from numpy import sqrt

def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of the predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        The square root of scikit-learn's mean squared logarithmic error.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return sqrt(msle)

# Predict on the held-out test set.
y_pred = model.predict(X_test)

# RMSLE is undefined for negative values and scikit-learn raises ValueError
# when it encounters them. The network's output layer is linear and can emit
# negatives, so they are clamped to zero for the metric only; y_pred itself
# is left unchanged for the cells that follow.
y_pred_nonneg = np.clip(y_pred, 0, None)
test_rmsle = rmsle(y_test, y_pred_nonneg)
print('Test RMSLE:', test_rmsle)

In [None]:

# Learning curves: training and validation loss for each epoch. Axis labels
# and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Training loss')
ax.plot(history.history['val_loss'], label='Validation loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (mean squared error)')
ax.set_title('Training vs. validation loss')
ax.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Predicted vs. true target values on the test set. The dashed diagonal marks
# perfect predictions across the range of the true values.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'k--', lw=4)
ax.set_xlabel('True Values')
ax.set_ylabel('Predicted Values')
ax.set_title('True vs. Predicted Values')
plt.show()
