In [5]:
# Import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from math import radians, cos, sin, asin, sqrt
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Load the data
train_data = pd.read_csv('data/X_train.csv')
Y_train = pd.read_csv('data/y_train.csv')
test_data = pd.read_csv('data/X_test.csv')

# Preprocessing
train_data['constructionTime'].replace({'未知': 0}, inplace=True)
test_data['constructionTime'].replace({'未知': 0}, inplace=True)
train_data['livingRoom'].replace({'#NAME?': 2}, inplace=True)
test_data['livingRoom'].replace({'#NAME?': 2}, inplace=True)

floor_col_train = train_data.floor.apply(lambda x: str(x)[-2:])
floor_col_test = test_data.floor.apply(lambda x: str(x)[-2:])
train_data['floor'] = floor_col_train
test_data['floor'] = floor_col_test

# Function to calculate distance from Beijing
def distance(lat2, lon2, lat1=39.916668, lon1=116.383331): 
    lon1, lon2, lat1, lat2 = map(radians, [lon1, lon2, lat1, lat2])
    dlon = lon2 - lon1  
    dlat = lat2 - lat1 
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * asin(sqrt(a))  
    r = 6371
    return c * r

train_data['distance'] = train_data.apply(lambda x: distance(x['Lat'], x['Lng']), axis=1)
train_data['constructionTime'] = train_data['constructionTime'].astype(int)
train_data['building_age'] = 2024 - train_data['constructionTime']

test_data['distance'] = test_data.apply(lambda x: distance(x['Lat'], x['Lng']), axis=1)
test_data['constructionTime'] = test_data['constructionTime'].astype(int)
test_data['building_age'] = 2024 - test_data['constructionTime']

# Replace 'buildingType' with more meaningful values
building_type_mapping = {1: 'Tower', 2: 'Bungalow', 3: 'Tower and Plate', 4: 'Plate'}
train_data['buildingType'].replace(building_type_mapping, inplace=True)
test_data['buildingType'].replace(building_type_mapping, inplace=True)

# Feature Engineering
train_data['price_per_sq'] = train_data['communityAverage'] / train_data['square']

test_data['price_per_sq'] = test_data['communityAverage'] / test_data['square']

# Select features for training
features = ['Lng', 'Lat', 'followers', 'square', 'livingRoom', 'drawingRoom', 'kitchen', 'bathRoom', 'floor',
            'buildingType', 'constructionTime', 'renovationCondition', 'buildingStructure', 'ladderRatio', 'elevator',
            'fiveYearsProperty', 'subway', 'district', 'communityAverage', 'distance', 'building_age', 
            'price_per_sq']
train_data = train_data[features]
test_data = test_data[features]

# Standardize numerical features
all_features = pd.concat((train_data.iloc[:, 1:], test_data.iloc[:, 1:]))
numerical_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numerical_features] = all_features[numerical_features].apply(lambda x: (x - x.mean()) / x.std())
all_features = pd.get_dummies(all_features, dummy_na=True)
all_features = all_features.fillna(0)

# Split back into train and test sets
n_train = train_data.shape[0]
train_features = all_features[:n_train]
test_features = all_features[n_train:]
train_labels = Y_train.values

# Split training data into train/validation sets
x_train, x_valid, y_train, y_valid = train_test_split(train_features, train_labels, test_size=0.25, random_state=27)

# Hyperparameter tuning for RandomForest
param_grid = {
    'n_estimators': [100, 300, 500, 900],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

rf = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=10, cv=3, verbose=2, n_jobs=-1, random_state=42)
random_search.fit(x_train, y_train)

# Use best parameters to fit the model
best_rf = random_search.best_estimator_
best_rf.fit(x_train, y_train)

# Stacking Regressor
base_models = [
    ('rf', RandomForestRegressor(n_estimators=300, max_depth=20, random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=200, learning_rate=0.1)),
    ('xgb', xgb.XGBRegressor(n_estimators=200, learning_rate=0.1))
]
stacked_model = StackingRegressor(estimators=base_models, final_estimator=RandomForestRegressor(n_estimators=100))

stacked_model.fit(x_train, y_train.ravel())

# Cross-Validation to evaluate the model
cv_scores = cross_val_score(stacked_model, train_features, train_labels.ravel(), cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-cv_scores)

print(f"Cross-Validation RMSE scores: {rmse_scores}")
print(f"Mean RMSE: {rmse_scores.mean()}, Standard deviation: {rmse_scores.std()}")

# Predict on test data
test_predictions = stacked_model.predict(test_features)

# Save predictions to CSV
results_df = pd.DataFrame({
    'ID': range(len(test_predictions)), 
    'TARGET': test_predictions
})

results_df.to_csv('predictions.csv', index=False)
print("Predictions have been saved to 'predictions.csv'")


  train_data = pd.read_csv('data/X_train.csv')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['constructionTime'].replace({'未知': 0}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['constructionTime'].replace({'未知': 0}, inplace=True)
The behavior will change in pandas 3.0. This inplace metho

Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyboardInterrupt: 

In [2]:
# Load the data
train_data = pd.read_csv('data/X_train.csv')
Y_train = pd.read_csv('data/y_train.csv')
test_data = pd.read_csv('data/X_test.csv')

  train_data = pd.read_csv('data/X_train.csv')


In [3]:

# Chuyen ki tu khong xac dinh bang so "0" hoặc 2
train_data['constructionTime'].replace({'未知': 0}, inplace=True)
test_data['constructionTime'].replace({'未知': 0}, inplace=True)
train_data['livingRoom'].replace({'#NAME?': 2}, inplace=True)
test_data['livingRoom'].replace({'#NAME?': 2}, inplace=True)

# Cat 2 ky tu cuoi va gan gia tri moi vao cot floor
floor_col_train = train_data.floor.apply(lambda x: str(x)[-2:])
floor_col_test = test_data.floor.apply(lambda x: str(x)[-2:])

train_data['floor'] = floor_col_train
test_data['floor'] = floor_col_test

# We will find distance agnaist each lat and lng from Beijing (lat:39.916668,lon:116.383331)
def distance(lat2, lon2,lat1=39.916668,lon1=116.383331): 
      
    # The math module contains a function named 
    # radians which converts from degrees to radians. 
    lon1 = radians(lon1) 
    lon2 = radians(lon2) 
    lat1 = radians(lat1) 
    lat2 = radians(lat2) 
       
    # Haversine formula  
    dlon = lon2 - lon1  
    dlat = lat2 - lat1 
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
  
    c = 2 * asin(sqrt(a))  
     
    # Radius of earth in kilometers. Use 3956 for miles 
    r = 6371
       
    # calculate the result 
    return(c * r) 

train_data['distance'] = train_data.apply(lambda x: distance (x['Lat'],x['Lng']),axis=1)
train_data['constructionTime'] = train_data['constructionTime'].astype(int)
train_data['building_age'] = 2024 - train_data['constructionTime']

test_data['distance'] = test_data.apply(lambda x: distance (x['Lat'],x['Lng']),axis=1)
test_data['constructionTime'] = test_data['constructionTime'].astype(int)
test_data['building_age'] = 2024 - test_data['constructionTime']
# Converting 'buildingType' feature to object or string type
train_data['buildingType'].replace(1,'Tower',inplace=True)
train_data['buildingType'].replace(2,'Bungalow',inplace=True)
train_data['buildingType'].replace(3,'Tower and Plate',inplace=True)
train_data['buildingType'].replace(4,'Plate',inplace=True)

test_data['buildingType'].replace(1,'Tower',inplace=True)
test_data['buildingType'].replace(2,'Bungalow',inplace=True)
test_data['buildingType'].replace(3,'Tower and Plate',inplace=True)
test_data['buildingType'].replace(4,'Plate',inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['constructionTime'].replace({'未知': 0}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['constructionTime'].replace({'未知': 0}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate obje

In [53]:
train_data.columns

Index(['ID', 'Lng', 'Lat', 'tradeTime', 'followers', 'square', 'livingRoom',
       'drawingRoom', 'kitchen', 'bathRoom', 'floor', 'buildingType',
       'constructionTime', 'renovationCondition', 'buildingStructure',
       'ladderRatio', 'elevator', 'fiveYearsProperty', 'subway', 'district',
       'communityAverage', 'distance', 'Age'],
      dtype='object')

In [4]:
#lấy những columns cần thiết để training
train = ['Lng','Lat','followers','square', 'livingRoom',
       'drawingRoom', 'kitchen', 'bathRoom', 'floor', 'buildingType',
       'constructionTime', 'renovationCondition', 'buildingStructure','ladderRatio', 'elevator', 'fiveYearsProperty', 'subway', 'district',
       'communityAverage','distance','building_age']
train_data = train_data[train]
test_data = test_data[train]

all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

#obtain all the numerical features
numerical_features = all_features.dtypes[all_features.dtypes != 'object'].index
#apply standardization to each feature
all_features[numerical_features] = all_features[numerical_features].apply( lambda x: (x-x.mean()) / x.std() )
#replace missing values with 0
all_features[numerical_features] = all_features[numerical_features].fillna(0)
#one-hot encoding consider missing values as a category.
all_features = pd.get_dummies(all_features, dummy_na=True)
all_features.shape

pd.isna(all_features).any()

Lat                             False
followers                       False
square                          False
kitchen                         False
constructionTime                False
                                ...  
buildingType_Bungalow           False
buildingType_Plate              False
buildingType_Tower              False
buildingType_Tower and Plate    False
buildingType_nan                False
Length: 134, dtype: bool

In [56]:
#Tách dữ liệu thành tập huấn luyện và tập kiểm tra
n_train = train_data.shape[0]
train_features = all_features[:n_train]
test_features = all_features[n_train:]
train_labels = Y_train.values

# Tách tập huấn luyện thành tập train và validation
x_train, x_valid,y_train, y_valid = train_test_split(train_features, train_labels, test_size=0.250001, random_state = 27)


In [57]:
rf = RandomForestRegressor(random_state=42,n_estimators=900,max_depth=20,
                                              n_jobs=-1,min_samples_split=10,)

In [58]:
# Fit Model

rf.fit(x_train, y_train)

# Make validation predictions

y_pred = rf.predict(x_valid)

In [59]:

# Step 1: Predict using RandomForest on test data
rf_test_pred = rf.predict(test_features)
rf_test_pred.shape
target_predictions = rf_test_pred[:, 1]
# Step 4: Create a DataFrame with ID and TARGET
results_df = pd.DataFrame({
    'ID': range(len(rf_test_pred)), 
    'TARGET': target_predictions
})

# Step 5: Export to CSV
results_df.to_csv('predictions.csv', index=False)

print("Predictions have been saved to 'predictions.csv'")

Predictions have been saved to 'predictions.csv'
