In [None]:
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore', category=DeprecationWarning)


# Load the raw listings. The CSV is decoded as GBK; low_memory=False makes
# pandas parse the file in one pass instead of chunk by chunk.
df = pd.read_csv('house_prices.csv', encoding='gbk', low_memory=False)
print ('DATA',df.shape)

# Step 1: drop 'DOM' (original note: more than 50% of its values are missing).
df = df.drop(columns=['DOM'])

# Step 2: drop rows with any missing value, then rows whose construction year
# is the placeholder '未知' ("unknown"). .copy() yields an independent frame so
# the column assignments below never write into a view of the unfiltered data.
df = df.dropna()
df = df[df['constructionTime'] != '未知'].copy()

# Step 3: drop the room-count columns ('kitchen', 'bathRoom', 'drawingRoom')
# together with the other columns that are not used as model features.
df = df.drop(
    columns=['kitchen', 'bathRoom', 'drawingRoom', 'url', 'id', 'Cid',
             'floor', 'buildingType', 'ladderRatio']
)

# Step 4: parse the living-room count and limit it to the range 1..4.
# errors='coerce' turns unparsable entries into NaN *after* the dropna() above,
# so those rows are removed here; otherwise the later integer cast of this
# column would fail on the NaN values.
df['livingRoom'] = pd.to_numeric(df['livingRoom'], errors='coerce').clip(lower=1, upper=4)
df = df.dropna(subset=['livingRoom'])

print ("DATA", df.shape)

# Creating 'distance' feature
# To calculate Distance Between Two Points on Earth 
from math import radians, cos, sin, asin, sqrt

def haversine(lat1, lon1, lat2=39.916668, lon2=116.383331):
    """Return the great-circle distance in kilometres between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float, optional
        Latitude and longitude of the second point, in degrees. The defaults
        are a fixed reference point (presumably central Beijing -- confirm).

    Returns
    -------
    float
        Distance along the surface of a sphere of radius 6371 km.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points, which would make asin() raise a math domain error; clamp it.
    a = min(1.0, a)
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return c * r

# Distance in kilometres from every listing to haversine()'s default
# reference point.
df['distance'] = df.apply(lambda row: haversine(row['Lat'], row['Lng']), axis=1)

# Building age relative to 2023. One integer cast is sufficient: astype(int)
# already raises on any value that is not a valid integer, so a second
# to_numeric(..., errors='coerce') pass could never change the result.
df['age'] = 2023 - df['constructionTime'].astype(int)
# Drop the original 'constructionTime' column
df = df.drop(columns='constructionTime')

# Set minimum values: anything below these floors is raised to the floor.
min_price = 10000
min_square = 20

df['price'] = df['price'].clip(lower=min_price)
df['square'] = df['square'].clip(lower=min_square)

print ('DATA',df.shape)

# Reduce the 'tradeTime' feature to its year.
df['tradeTime'] = pd.DatetimeIndex(df['tradeTime']).year

# Cast the discrete features to int so outliers are easy to inspect. Each
# column is listed exactly once.
int_columns = [
    'livingRoom', 'tradeTime', 'renovationCondition', 'buildingStructure',
    'elevator', 'fiveYearsProperty', 'subway', 'followers', 'totalPrice',
    'age',
]
df = df.astype({column: int for column in int_columns})

# Renumber the rows 0..n-1 without keeping the old index as a column.
df = df.reset_index(drop=True)
# Now the remaining data
print ("DATA", df.shape)

df.dtypes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the regression target ('totalPrice') from the feature columns of df.
# NOTE(review): every other remaining column -- including 'price' and
# 'square' -- is used as a feature. If totalPrice is derived from those two,
# this leaks the target into the inputs; confirm before trusting the scores.
y = df['totalPrice']
X = df.drop(columns='totalPrice')

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features. The scaler is fitted on the training split only
# and its statistics are then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GridSearchCV

class KerasClassifier(RegressorMixin, BaseEstimator):
    """scikit-learn compatible wrapper around a small Keras MLP.

    The class name is kept for compatibility, but the network is a regressor:
    it ends in a single linear unit and is compiled with a mean-squared-error
    loss. It therefore derives from RegressorMixin, so that ``score`` (and
    GridSearchCV's default scoring) is R^2 instead of classification accuracy,
    which is undefined for continuous targets. The mixin is listed before
    BaseEstimator, as scikit-learn expects.

    Parameters
    ----------
    optimizer : str or Keras optimizer, default='adam'
        Passed unchanged to ``model.compile``.
    input_shape : int or None, default=None
        Number of input features. When None it is taken from ``X`` in ``fit``.
    epochs : int, default=10
        Number of training epochs.
    batch_size : int, default=10
        Mini-batch size used for training.
    verbose : int, default=1
        Keras verbosity level used for training.
    """

    def __init__(self, optimizer='adam', input_shape=None, epochs=10,
                 batch_size=10, verbose=1):
        self.optimizer = optimizer
        self.input_shape = input_shape
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose

    def create_model(self, input_shape=None):
        """Build and compile the 128-64-32-1 dense network.

        ``input_shape`` overrides ``self.input_shape`` when given.

        Raises
        ------
        ValueError
            If neither the argument nor ``self.input_shape`` is set.
        """
        n_features = self.input_shape if input_shape is None else input_shape
        if n_features is None:
            raise ValueError("Input shape must be defined")
        model = Sequential([
            Dense(128, activation='relu', input_shape=(n_features,)),
            Dense(64, activation='relu'),
            Dense(32, activation='relu'),
            Dense(1)
        ])
        model.compile(loss='mean_squared_error', optimizer=self.optimizer)
        return model

    def fit(self, X, y):
        """Train a freshly built network on ``X``/``y`` and return ``self``."""
        # Infer the width per call instead of overwriting the constructor
        # parameter, so a refit on data with a different number of columns
        # does not reuse a stale shape.
        n_features = X.shape[1] if self.input_shape is None else self.input_shape
        self.model = self.create_model(n_features)
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size,
                       verbose=self.verbose)
        return self

    def predict(self, X):
        """Return the network's predictions for ``X`` as produced by Keras."""
        return self.model.predict(X)

# Compare optimizers for the wrapped network with a 3-fold grid search.
# NOTE(review): n_jobs=-1 trains the Keras models in parallel worker
# processes; confirm this behaves well with the TensorFlow setup in use.
param_grid = dict(optimizer=['adam', 'sgd'])

input_shape = X_train_scaled.shape[1]
model = KerasClassifier(input_shape=input_shape)

grid = GridSearchCV(model, param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train_scaled, y_train)

# Report the best cross-validated score and the setting that produced it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import GridSearchCV

class KerasRegressor(RegressorMixin, BaseEstimator):
    """scikit-learn compatible wrapper around a small Keras MLP regressor.

    The network is 128-64-32 ReLU units followed by one linear output unit,
    trained with Adam on a mean-squared-error loss. RegressorMixin is listed
    before BaseEstimator, as scikit-learn expects.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Learning rate handed to the Adam optimizer.
    input_shape : int or None, default=None
        Number of input features. When None it is taken from ``X`` in ``fit``.
    epochs : int, default=10
        Number of training epochs.
    batch_size : int, default=10
        Mini-batch size used for training.
    verbose : int, default=1
        Keras verbosity level used for training.
    """

    def __init__(self, learning_rate=0.01, input_shape=None, epochs=10,
                 batch_size=10, verbose=1):
        self.learning_rate = learning_rate
        self.input_shape = input_shape
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose

    def create_model(self, input_shape=None):
        """Build and compile the network.

        ``input_shape`` overrides ``self.input_shape`` when given.

        Raises
        ------
        ValueError
            If neither the argument nor ``self.input_shape`` is set.
        """
        n_features = self.input_shape if input_shape is None else input_shape
        if n_features is None:
            raise ValueError("Input shape must be defined")
        model = Sequential([
            Dense(128, activation='relu', input_shape=(n_features,)),
            Dense(64, activation='relu'),
            Dense(32, activation='relu'),
            Dense(1)
        ])
        optimizer = Adam(learning_rate=self.learning_rate)
        model.compile(loss='mean_squared_error', optimizer=optimizer)
        return model

    def fit(self, X, y):
        """Train a freshly built network on ``X``/``y`` and return ``self``."""
        # Infer the width per call instead of overwriting the constructor
        # parameter, so a refit on data with a different number of columns
        # does not reuse a stale shape.
        n_features = X.shape[1] if self.input_shape is None else self.input_shape
        self.model = self.create_model(n_features)
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size,
                       verbose=self.verbose)
        return self

    def predict(self, X):
        """Return the network's predictions for ``X`` as produced by Keras."""
        return self.model.predict(X)

# Compare Adam learning rates for the wrapped regressor with a 3-fold grid
# search.
# NOTE(review): n_jobs=-1 trains the Keras models in parallel worker
# processes; confirm this behaves well with the TensorFlow setup in use.
param_grid = dict(learning_rate=[0.001, 0.01, 0.1])

input_shape = X_train_scaled.shape[1]
my_model = KerasRegressor(input_shape=input_shape)

grid = GridSearchCV(my_model, param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train_scaled, y_train)

# Report the best cross-validated score and the setting that produced it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Final network: three ReLU hidden layers (128, 64, 32 units) and a single
# linear output unit; the input width is the number of scaled feature columns.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

In [None]:
from tensorflow.keras.optimizers import Adam

# Train with Adam (learning rate 0.001) on a mean-squared-error loss.
model.compile(
    loss='mean_squared_error',
    optimizer=Adam(learning_rate=0.001),
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping: watch the validation loss, stop once it has failed to
# improve by at least 1e-4 for 10 epochs, and restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=1e-4,
    patience=10,
    restore_best_weights=True,
)

# Fit for at most 150 epochs in batches of 32, with 20% of the training data
# set aside as the validation set that drives early stopping.
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=32,
    epochs=150,
    validation_split=0.2,
    callbacks=[early_stopping],
)

In [None]:
from sklearn.metrics import mean_squared_log_error
from numpy import sqrt

def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y_true``."""
    msle = mean_squared_log_error(y_true, y_pred)
    return sqrt(msle)

# Predicting on the test set
y_pred = model.predict(X_test_scaled)

# Calculating RMSLE
# mean_squared_log_error rejects negative values, and the network's linear
# output unit can produce them. Clip at zero for the metric only (y_pred itself
# stays untouched for the plots) and say so when it happens, so the score is
# never silently altered.
n_negative = int((y_pred < 0).sum())
if n_negative:
    print('Clipped', n_negative, 'negative predictions to 0 before computing RMSLE')
test_rmsle = rmsle(y_test, np.clip(y_pred, 0, None))
print('Test RMSLE:', test_rmsle)

In [None]:
import matplotlib.pyplot as plt

# Training and validation loss per epoch, drawn on one explicit Axes.
fig, ax = plt.subplots()
for key, label in (('loss', 'Training loss'), ('val_loss', 'Validation loss')):
    ax.plot(history.history[key], label=label)
ax.set_title('Model Loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Predicted against true test targets; the dashed diagonal marks perfect
# predictions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'k--', lw=4)
ax.set_xlabel('True Values')
ax.set_ylabel('Predicted Values')
ax.set_title('True vs. Predicted Values')
plt.show()