In [88]:
import pandas as pd
import numpy as np
import tensorflow as tf
from math import radians, cos, sin, asin, sqrt
from tensorflow.keras import layers, models, callbacks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

In [89]:
# Load the data
train_data = pd.read_csv('data/X_train.csv')
Y_train = pd.read_csv('data/y_train.csv')
test_data = pd.read_csv('data/X_test.csv')

  train_data = pd.read_csv('data/X_train.csv')


In [90]:
# Data cleaning functions
def clean_construction_time_data(construction_time_series):
    def process_construction_time(x):
        try:
            year = int(x)
            if 1900 <= year <= 2024:  
                return year
            else:
                return np.nan
        except ValueError:
            return np.nan
    cleaned_series = construction_time_series.apply(process_construction_time)
    mode_value = cleaned_series.mode().iloc[0]
    cleaned_series = cleaned_series.fillna(mode_value)
    return cleaned_series.astype(int)

def distance(lat2, lon2, lat1=39.916668, lon1=116.383331): 
    lon1, lon2, lat1, lat2 = map(radians, [lon1, lon2, lat1, lat2])
    dlon = lon2 - lon1  
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))  
    r = 6371
    return c * r

def clean_building_type_data(building_type_series):
    def process_building_type(x):
        if pd.isna(x):
            return 'Unknown'
        elif isinstance(x, (int, float)):
            if x == 1 or x == 1.0:
                return 'Tower'
            elif x == 2 or x == 2.0:
                return 'Bungalow'
            elif x == 3 or x == 3.0:
                return 'Tower and Plate'
            elif x == 4 or x == 4.0:
                return 'Plate'
            elif 0 < x < 1:
                return 'Plate'
            else:
                return 'Plate'
        else:
            return str(x)
    return building_type_series.apply(process_building_type)

def clean_floor_data(floor_series):
    def process_floor(x):
        x = str(x)
        if x == '结构':
            return 6  
        else:
            return x[-2:].strip()
    cleaned_series = floor_series.apply(process_floor)
    cleaned_series = pd.to_numeric(cleaned_series, errors='coerce')
    median_value = cleaned_series.mode().iloc[0]
    cleaned_series = cleaned_series.fillna(median_value)
    return cleaned_series

def clean_bathroom_data(bathroom_series):
    bathroom_series = bathroom_series.astype(str)
    def process_value(x):
        try:
            value = int(float(x))
            if 0 <= value <= 6:
                return value
            else:
                return np.nan
        except ValueError:
            return np.nan
    cleaned_series = bathroom_series.apply(process_value)
    median_value = cleaned_series.mode().iloc[0]
    cleaned_series = cleaned_series.fillna(median_value)
    return cleaned_series

def clean_drawing_room_data(drawing_room_series):
    def process_drawing_room(x):
        x = str(x).lower()
        if x.isdigit():
            return int(x)
        else:
            parts = x.split()
            if len(parts) == 2 and parts[1].isdigit():
                return int(parts[1])
            else:
                return np.nan
    cleaned_series = drawing_room_series.apply(process_drawing_room)
    median_value = cleaned_series.mode().iloc[0]
    cleaned_series = cleaned_series.fillna(median_value)
    return cleaned_series

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['livingRoom'].replace({'#NAME?': 2}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['livingRoom'].replace({'#NAME?': 2}, inplace=True)


In [91]:
# Apply data cleaning
for df in [train_data, test_data]:
    df['constructionTime'] = clean_construction_time_data(df['constructionTime'])
    df['livingRoom'].replace({'#NAME?': 2}, inplace=True)
    df['distance'] = df.apply(lambda x: distance(x['Lat'], x['Lng']), axis=1)
    df['building_age'] = 2024 - df['constructionTime']
    df['buildingType'] = clean_building_type_data(df['buildingType'])
    df['floor'] = clean_floor_data(df['floor'])
    df['bathRoom'] = clean_bathroom_data(df['bathRoom'])
    df['drawingRoom'] = clean_drawing_room_data(df['drawingRoom'])

array([ 42,  20,  22,  25,  17,  24,  30,  16,  21,  13,  18,  26,  14,
        35,  19,  32,  39,  37,  15,  29,  34,  27,  11,  36,  43,  45,
        23,  31,  38,  49,  33,  40,  44,  12,  28,  60,  41,   9,  58,
        10,  54,  48,  59,  66,  68,  46,  69,  61,  50,  47,  64,  57,
        51,  70,  56,  67,   8,  62,  53,  52,  65,  71,  63,  74,  90,
        91,  72, 110,  55,  80])

In [92]:
# Select features for training
features = ['Lng', 'Lat', 'tradeTime', 'followers', 'square', 'livingRoom',
            'drawingRoom', 'kitchen', 'bathRoom', 'floor', 'buildingType',
            'constructionTime', 'renovationCondition', 'buildingStructure',
            'ladderRatio', 'elevator', 'fiveYearsProperty', 'subway', 'district',
            'communityAverage', 'distance', 'building_age']

train_data = train_data[features]
test_data = test_data[features]
# Combine features and apply preprocessing
all_features = pd.concat((train_data, test_data))
numerical_features = all_features.select_dtypes(include=[np.number]).columns
all_features[numerical_features] = all_features[numerical_features].apply(lambda x: (x - x.mean()) / x.std())
all_features[numerical_features] = all_features[numerical_features].fillna(0)
all_features = pd.get_dummies(all_features, dummy_na=True)

ID                       int64
Lng                    float64
Lat                    float64
tradeTime               object
followers                int64
square                 float64
livingRoom              object
drawingRoom             object
kitchen                  int64
bathRoom                object
floor                   object
buildingType           float64
constructionTime         int32
renovationCondition      int64
buildingStructure        int64
ladderRatio            float64
elevator               float64
fiveYearsProperty      float64
subway                 float64
district                 int64
communityAverage       float64
distance               float64
building_age             int32
dtype: object

In [93]:
# Split data
n_train = train_data.shape[0]
train_features = all_features[:n_train]
test_features = all_features[n_train:]
train_labels = Y_train.values

x_train, x_valid, y_train, y_valid = train_test_split(train_features, train_labels, test_size=0.25, random_state=27)


tradeTime: ['2012-11-12' '2014-11-29' '2015-02-05' ... '2010-02-26' '2010-03-05'
 '2010-04-11']

livingRoom: ['2' '1' '3' '4' '5' '6' '9' '7' '0' '8' 2]

drawingRoom: ['1' '2' '0' '3' '4' '中 16' '中 24' '中 14' '底 28' '中 15' '底 11' '5' '低 15'
 '中 22' '中 6' '低 6' '高 14' '底 20' '低 16' '高 12']

bathRoom: ['1' '2' '3' '0' '4' '5' '6' '2006' '未知' '2003' '2005' 1 2 3 0 5 4 2003
 2005 6 2000 1990 2006 1994 7 2004 1996 2011]

floor: ['顶 5' '高 24' '高 12' '低 24' '中 6' '中 14' '底 5' '低 22' '顶 6' '中 9' '低 18'
 '中 17' '低 19' '中 11' '高 23' '高 32' '中 18' '底 6' '低 6' '中 12' '低 7' '低 20'
 '低 14' '中 8' '高 10' '底 7' '高 6' '底 22' '低 28' '底 18' '未知 6' '顶 26' '高 7'
 '中 5' '高 9' '中 16' '高 16' '中 21' '低 21' '低 11' '中 31' '高 18' '高 15'
 '底 28' '低 17' '低 30' '高 27' '底 13' '底 4' '低 12' '高 25' '中 19' '低 16'
 '低 29' '高 22' '顶 4' '中 7' '顶 22' '中 15' '中 22' '底 11' '顶 16' '中 10'
 '中 28' '低 9' '底 10' '高 26' '顶 14' '顶 9' '中 29' '顶 17' '低 26' '高 11'
 '高 30' '低 25' '高 17' '低 10' '顶 7' '低 32' '中 13' '中 25' '底 26' '中 27'
 '顶 

In [94]:
# RandomForest model
rf = RandomForestRegressor(random_state=42, n_estimators=900, max_depth=20, n_jobs=-1, min_samples_split=10)
rf.fit(x_train, y_train)

6


In [98]:
train_data.columns

Index(['ID', 'Lng', 'Lat', 'tradeTime', 'followers', 'square', 'livingRoom',
       'drawingRoom', 'kitchen', 'bathRoom', 'floor', 'buildingType',
       'constructionTime', 'renovationCondition', 'buildingStructure',
       'ladderRatio', 'elevator', 'fiveYearsProperty', 'subway', 'district',
       'communityAverage', 'distance', 'building_age'],
      dtype='object')

In [99]:
#lấy những columns cần thiết để training
train = ['Lng', 'Lat', 'tradeTime', 'followers', 'square', 'livingRoom',
       'drawingRoom', 'kitchen', 'bathRoom', 'floor', 'buildingType',
       'constructionTime', 'renovationCondition', 'buildingStructure',
       'ladderRatio', 'elevator', 'fiveYearsProperty', 'subway', 'district',
       'communityAverage', 'distance', 'building_age']
train_data = train_data[train]
test_data = test_data[train]

all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

#obtain all the numerical features
numerical_features = all_features.dtypes[all_features.dtypes != 'object'].index
#apply standardization to each feature
all_features[numerical_features] = all_features[numerical_features].apply( lambda x: (x-x.mean()) / x.std() )
#replace missing values with 0
all_features[numerical_features] = all_features[numerical_features].fillna(0)
#one-hot encoding consider missing values as a category.
all_features = pd.get_dummies(all_features, dummy_na=True)
all_features.shape

pd.isna(all_features).any()

Lat                             False
followers                       False
square                          False
drawingRoom                     False
kitchen                         False
                                ...  
buildingType_Plate              False
buildingType_Tower              False
buildingType_Tower and Plate    False
buildingType_Unknown            False
buildingType_nan                False
Length: 2597, dtype: bool

In [100]:
#Tách dữ liệu thành tập huấn luyện và tập kiểm tra
n_train = train_data.shape[0]
train_features = all_features[:n_train]
test_features = all_features[n_train:]
train_labels = Y_train.values

# Tách tập huấn luyện thành tập train và validation
x_train, x_valid,y_train, y_valid = train_test_split(train_features, train_labels, test_size=0.250001, random_state = 27)


In [101]:
# RandomForest model
rf = RandomForestRegressor(random_state=42, n_estimators=900, max_depth=20, n_jobs=-1, min_samples_split=10)
rf.fit(x_train, y_train)

In [None]:

# Get RandomForest predictions
rf_train_pred = rf.predict(x_train)
rf_valid_pred = rf.predict(x_valid)
rf_test_pred = rf.predict(test_features)

In [59]:
# MLP model
def create_mlp_model(input_shape):
    model = models.Sequential([
        layers.Dense(128, activation='relu', input_shape=(input_shape,)),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(1)
    ])
    return model

Predictions have been saved to 'predictions.csv'


In [None]:
# Prepare data for MLP
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
test_features_scaled = scaler.transform(test_features)

# Create and compile the MLP model
mlp_model = create_mlp_model(x_train.shape[1])
mlp_model.compile(optimizer='adam', loss='mse')

In [None]:
# Train the MLP model
early_stopping = callbacks.EarlyStopping(patience=10, restore_best_weights=True)
history = mlp_model.fit(
    x_train_scaled, rf_train_pred,
    validation_data=(x_valid_scaled, rf_valid_pred),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

In [None]:
# Make predictions using MLP
mlp_predictions = mlp_model.predict(test_features_scaled).flatten()

# Save MLP predictions
mlp_results_df = pd.DataFrame({
    'ID': range(len(mlp_predictions)),
    'TARGET': mlp_predictions
})
mlp_results_df.to_csv('mlp_predictions.csv', index=False)
print("MLP predictions have been saved to 'mlp_predictions.csv'")