In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.ensemble import StackingRegressor

In [2]:
# Reading the Data
df =  pd.read_csv('house_prices.csv',encoding='gbk',low_memory=False)#, errors='ignore')
# Shape of the Data
print ('DATA',df.shape)
df.head(1)

DATA (318851, 26)


Unnamed: 0,url,id,Lng,Lat,Cid,tradeTime,DOM,followers,totalPrice,price,...,buildingType,constructionTime,renovationCondition,buildingStructure,ladderRatio,elevator,fiveYearsProperty,subway,district,communityAverage
0,https://bj.lianjia.com/chengjiao/101084782030....,101084782030,116.475489,40.01952,1111027376244,2016-08-09,1464.0,106,415.0,31680,...,1.0,2005,3,6,0.217,1.0,0.0,1.0,7,56021.0


In [3]:
# Columns or Features or Variables
df.columns

Index(['url', 'id', 'Lng', 'Lat', 'Cid', 'tradeTime', 'DOM', 'followers',
       'totalPrice', 'price', 'square', 'livingRoom', 'drawingRoom', 'kitchen',
       'bathRoom', 'floor', 'buildingType', 'constructionTime',
       'renovationCondition', 'buildingStructure', 'ladderRatio', 'elevator',
       'fiveYearsProperty', 'subway', 'district', 'communityAverage'],
      dtype='object')

In [4]:
# Finding missing Values in each column or feature
df.isnull().sum()

url                         0
id                          0
Lng                         0
Lat                         0
Cid                         0
tradeTime                   0
DOM                    157977
followers                   0
totalPrice                  0
price                       0
square                      0
livingRoom                  0
drawingRoom                 0
kitchen                     0
bathRoom                    0
floor                       0
buildingType             2021
constructionTime            0
renovationCondition         0
buildingStructure           0
ladderRatio                 0
elevator                   32
fiveYearsProperty          32
subway                     32
district                    0
communityAverage          463
dtype: int64

In [5]:
# Step 1: Remove variables with more than 50% missing data
df = df.drop(['DOM'], axis=1)
# Step 2: Remove observations with any missing values and with values '未知' = None
df = df.dropna()
df = df[df['constructionTime']!='未知']
# Step 3: Removing the columns 'kitchens', 'bathrooms', and 'drawingRooms'
df.drop(['kitchen', 'bathRoom', 'drawingRoom', 'url', 'id', 'Cid', 'floor', 'buildingType', 'ladderRatio'], axis=1, inplace=True)
# Step 4: Setting the number of living rooms to be within the range of 1 to 4
df['livingRoom'] = pd.to_numeric(df['livingRoom'], errors='coerce')
df['livingRoom'] = df['livingRoom'].clip(lower=1, upper=4)
df.columns
print ("DATA", df.shape)

DATA (297701, 16)


In [6]:
# Creating 'distance' feature
# To calculate Distance Between Two Points on Earth 
from math import radians, cos, sin, asin, sqrt

def haversine(lat1, lon1, lat2=39.916668, lon2=116.383331):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return c * r

df['distance'] = df.apply(lambda x: haversine(x['Lat'], x['Lng']), axis=1)
df['constructionTime'] = df['constructionTime'].astype(int)
df['age'] = 2023 - pd.to_numeric(df['constructionTime'], errors='coerce')
# Drop the original 'constructionTime' column
df.drop('constructionTime', axis=1, inplace=True)

# Set minimum values
min_price = 10000
min_square = 20

df['price'] = df['price'].clip(lower=min_price)
df['square'] = df['square'].clip(lower=min_square)

print ('DATA',df.shape)
df.head(1)

DATA (297701, 17)


Unnamed: 0,Lng,Lat,tradeTime,followers,totalPrice,price,square,livingRoom,renovationCondition,buildingStructure,elevator,fiveYearsProperty,subway,district,communityAverage,distance,age
0,116.475489,40.01952,2016-08-09,106,415.0,31680,131.0,2,3,6,1.0,0.0,1.0,7,56021.0,13.873607,18


In [7]:
# 'timeTrade' feature to year base only.
df['tradeTime'] = pd.DatetimeIndex(df['tradeTime']).year

# Converting features datatype to see outliers
df['livingRoom'] = df['livingRoom'].astype(int)
df['tradeTime'] = df['tradeTime'].astype(int)
df['renovationCondition'] = df['renovationCondition'].astype(int)
df['buildingStructure'] = df['buildingStructure'].astype(int)
df['elevator'] = df['elevator'].astype(int)
df['fiveYearsProperty'] = df['fiveYearsProperty'].astype(int)
df['subway'] = df['subway'].astype(int)
df['followers']  = df['followers'].astype(int)
df['totalPrice']  = df['totalPrice'].astype(int)
df['elevator']  = df['elevator'].astype(int)
df['fiveYearsProperty']  = df['fiveYearsProperty'].astype(int)
df['subway']  = df['subway'].astype(int)
df['age']  = df['age'].astype(int)

# Reseting the index
df.reset_index(inplace=True)
df.drop(['index'],axis=1,inplace=True)
# Now the remaining data
print ("DATA", df.shape)
df.head()

df.dtypes

DATA (297701, 17)
The DATA (297701, 17)


Lng                    float64
Lat                    float64
tradeTime                int32
followers                int32
totalPrice               int32
price                    int64
square                 float64
livingRoom               int32
renovationCondition      int32
buildingStructure        int32
elevator                 int32
fiveYearsProperty        int32
subway                   int32
district                 int64
communityAverage       float64
distance               float64
age                      int32
dtype: object

In [8]:
# Function to remove outliers from a DataFrame
def remove_outliers(df):
    for column in df.select_dtypes(include=['number']).columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Filter out outliers from the DataFrame
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

# Remove outliers
df_clean = remove_outliers(df)

# Display the first few rows of the updated DataFrame
print ('DATA',df_clean.shape)
df_clean.head(1)

DATA (180574, 17)


Unnamed: 0,Lng,Lat,tradeTime,followers,totalPrice,price,square,livingRoom,renovationCondition,buildingStructure,elevator,fiveYearsProperty,subway,district,communityAverage,distance,age
15,116.284755,39.93453,2016,6,212,32981,64.28,1,3,6,1,0,1,8,70141.0,8.637332,14


In [9]:
# Selecting features and target variable
X = df_clean[['tradeTime', 'followers', 'square', 'livingRoom',
       'renovationCondition', 'buildingStructure',
       'elevator', 'fiveYearsProperty', 'subway', 'district',
       'communityAverage', 'distance', 'age']]

# Standardized the Features
X = np.asarray(X)
X = preprocessing.StandardScaler().fit(X).transform(X)  

# Selecting Target Feature
y = df_clean['price']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Initialize the Random Forest Regressor
rf = RandomForestRegressor(random_state=42,n_estimators=900,max_depth=20,min_samples_split=10)

# Train the model
rf.fit(X_train, y_train)

# Making predictions
y_pred = rf.predict(X_test)

# Evaluating the model
print ("RMSLE {:,.5f}".format(np.sqrt(mean_squared_log_error( y_pred, y_test))))

RMSLE 0.14309


In [11]:
# Initialize the XGBoost
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', min_child_weight = 2, subsample = 1,
                          colsample_bytree = 0.8,
                          learning_rate = 0.2, n_estimators = 500,
                         reg_lambda = 0.45, reg_alpha = 0, gamma = 0.5)

# Train the model
xg_reg.fit(X_train,y_train)

# Making predictions
y_pred = xg_reg.predict(X_test)

# Evaluating the model
print ("RMSLE {:,.5f}".format(np.sqrt(mean_squared_log_error( y_pred, y_test))))

RMSLE 0.14165


In [12]:
# Initialize the Light GBM Model
import lightgbm as lgb
gbm = lgb.LGBMRegressor(objective='regression',num_leaves=36,learning_rate=0.15,
                        n_estimators=64,min_child_weight = 2, colsample_bytree = 0.8,
                        reg_lambda = 0.4)

# Train the model
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='l2_root', callbacks=[lgb.early_stopping(50)])

# Making predictions
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)

print ("RMSLE {:,.5f}".format(np.sqrt(mean_squared_log_error( y_pred, y_test))))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000995 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 901
[LightGBM] [Info] Number of data points in the train set: 144459, number of used features: 13
[LightGBM] [Info] Start training from score 38756.797950
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[64]	valid_0's rmse: 5586.3	valid_0's l2: 3.12068e+07
RMSLE 0.15102


In [13]:
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Define the base models
models = [
    ('xgb', XGBRegressor(objective ='reg:squarederror', min_child_weight = 2, subsample = 1, colsample_bytree = 0.8, learning_rate = 0.2, n_estimators = 500, reg_lambda = 0.45, reg_alpha = 0, gamma = 0.5)),
    ('lgb', LGBMRegressor(objective='regression', num_leaves=36, learning_rate=0.15, n_estimators=64, min_child_weight = 2, colsample_bytree = 0.8, reg_lambda = 0.45)),
    ('rf', RandomForestRegressor(random_state=42, n_estimators=900, max_depth=20, n_jobs=-1, min_samples_split=10))
]

# Create a voting regressor
voting_model = VotingRegressor(estimators=models)


# Training the voting regressor
voting_model.fit(X_train, y_train)

# Making predictions and evaluating the model
predictions = voting_model.predict(X_test)

print ("RMSLE {:,.5f}".format(np.sqrt(mean_squared_log_error(predictions, y_test))))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 901
[LightGBM] [Info] Number of data points in the train set: 144459, number of used features: 13
[LightGBM] [Info] Start training from score 38756.797950
RMSLE 0.14180


In [14]:
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Define the base models
models = [
    ('xgb', XGBRegressor(objective ='reg:squarederror', min_child_weight = 2, subsample = 1, colsample_bytree = 0.8, learning_rate = 0.2, n_estimators = 500, reg_lambda = 0.45, reg_alpha = 0, gamma = 0.5)),
    ('lgb', LGBMRegressor(objective='regression', num_leaves=36, learning_rate=0.15, n_estimators=64, min_child_weight = 2, colsample_bytree = 0.8, reg_lambda = 0.45))
]

# Stacking the models with a final RandomForest Regressor as a meta-learner
stacked_model = StackingRegressor(
    estimators=models,
    final_estimator=RandomForestRegressor(random_state=42, n_estimators=900, max_depth=20, n_jobs=-1, min_samples_split=10)
)


# Training the voting regressor
stacked_model.fit(X_train, y_train)

# Making predictions and evaluating the model
predictions = stacked_model.predict(X_test)

print ("RMSLE {:,.5f}".format(np.sqrt(mean_squared_log_error(predictions, y_test))))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 901
[LightGBM] [Info] Number of data points in the train set: 144459, number of used features: 13
[LightGBM] [Info] Start training from score 38756.797950
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003250 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 901
[LightGBM] [Info] Number of data points in the train set: 115567, number of used features: 13
[LightGBM] [Info] Start training from score 38725.864062
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000897 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] 