In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from math import radians, cos, sin, asin, sqrt
from tensorflow.keras import layers, models, callbacks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

In [2]:
# Load the data
train_data = pd.read_csv('data/X_train.csv')
Y_train = pd.read_csv('data/y_train.csv')
test_data = pd.read_csv('data/X_test.csv')

  train_data = pd.read_csv('data/X_train.csv')


In [3]:
# Data preprocessing
def preprocess_data(df):
    df['constructionTime'] = df['constructionTime'].replace({'未知': 0})
    df['livingRoom'] = df['livingRoom'].replace({'#NAME?': 2})
    df['floor'] = df['floor'].apply(lambda x: str(x)[-2:])
    return df

train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Distance calculation function
def distance(lat2, lon2, lat1=39.916668, lon1=116.383331):
    lon1, lon2, lat1, lat2 = map(radians, [lon1, lon2, lat1, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of earth in kilometers
    return c * r

# Add distance and building age features
def add_features(df):
    df['distance'] = df.apply(lambda x: distance(x['Lat'], x['Lng']), axis=1)
    df['constructionTime'] = df['constructionTime'].astype(int)
    df['building_age'] = 2024 - df['constructionTime']
    return df

train_data = add_features(train_data)
test_data = add_features(test_data)

# Convert buildingType to categorical
building_type_map = {1: 'Tower', 2: 'Bungalow', 3: 'Tower and Plate', 4: 'Plate'}
train_data['buildingType'] = train_data['buildingType'].map(building_type_map)
test_data['buildingType'] = test_data['buildingType'].map(building_type_map)

In [4]:
# Select features for training
features = ['Lng', 'Lat', 'followers', 'square', 'livingRoom', 'drawingRoom', 'kitchen', 'bathRoom', 
            'floor', 'buildingType', 'constructionTime', 'renovationCondition', 'buildingStructure',
            'ladderRatio', 'elevator', 'fiveYearsProperty', 'subway', 'district', 'communityAverage',
            'distance', 'building_age']

train_data = train_data[features]
test_data = test_data[features]

# Combine train and test for preprocessing
all_features = pd.concat((train_data, test_data))

In [5]:
# Preprocessing
numerical_features = all_features.select_dtypes(include=[np.number]).columns
all_features[numerical_features] = all_features[numerical_features].apply(lambda x: (x - x.mean()) / x.std())
all_features = pd.get_dummies(all_features, dummy_na=True)

# Split back into train and test
n_train = train_data.shape[0]
train_features = all_features[:n_train]
test_features = all_features[n_train:]
train_labels = Y_train.values

# Split into train and validation sets
x_train, x_valid, y_train, y_valid = train_test_split(train_features, train_labels, test_size=0.25, random_state=27)

In [6]:
# RandomForest model
rf = RandomForestRegressor(random_state=42, n_estimators=900, max_depth=20, n_jobs=-1, min_samples_split=10)
rf.fit(x_train, y_train)


In [7]:

# Step 1: Predict using RandomForest on test data
rf_test_pred = rf.predict(test_features)
rf_test_pred.shape
target_predictions = rf_test_pred[:, 1]
# Step 4: Create a DataFrame with ID and TARGET
results_df = pd.DataFrame({
    'ID': range(len(rf_test_pred)), 
    'TARGET': target_predictions
})

# Step 5: Export to CSV
results_df.to_csv('rfr_predictions.csv', index=False)


In [8]:
# Make predictions with RandomForest
rf_train_pred = rf.predict(x_train)
rf_valid_pred = rf.predict(x_valid)
rf_test_pred = rf.predict(test_features)



In [9]:
# MLP model
def create_mlp_model(input_shape):
    model = models.Sequential([
        layers.Dense(256, activation='relu', input_shape=(input_shape,)),
        layers.BatchNormalization(),
        layers.Dropout(0.4),
        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(1)
    ])
    return model


In [10]:
# Prepare data for MLP
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
test_features_scaled = scaler.transform(test_features)

# Create and compile the MLP model
mlp_model = create_mlp_model(x_train.shape[1])
mlp_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='mse',
                  metrics=['mae'])

# Define callbacks
early_stopping = callbacks.EarlyStopping(patience=15, restore_best_weights=True)
reduce_lr = callbacks.ReduceLROnPlateau(factor=0.2, patience=5, min_lr=1e-6)


history = mlp_model.fit(
    x_train_scaled, y_train,
    validation_data=(x_valid_scaled, y_valid),
    epochs=50,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m2990/2990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 16ms/step - loss: 16933894144.0000 - mae: 79829.4922 - val_loss: 16851350528.0000 - val_mae: 79594.1328 - learning_rate: 0.0010
Epoch 2/50
[1m2990/2990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 13ms/step - loss: 16791229440.0000 - mae: 79535.8750 - val_loss: 16620838912.0000 - val_mae: 79571.7266 - learning_rate: 0.0010
Epoch 3/50
[1m2990/2990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 15ms/step - loss: 16607357952.0000 - mae: 79690.6484 - val_loss: 16318090240.0000 - val_mae: 79589.7266 - learning_rate: 0.0010
Epoch 4/50
[1m2990/2990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 13ms/step - loss: 16179773440.0000 - mae: 79488.1953 - val_loss: 15917821952.0000 - val_mae: 79634.4766 - learning_rate: 0.0010
Epoch 5/50
[1m2990/2990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 12ms/step - loss: 15779381248.0000 - mae: 79571.8359 - val_loss: 15458545664.0000 - val_

In [11]:
# Make predictions using MLP
mlp_predictions = mlp_model.predict(test_features_scaled).flatten()

# Save MLP predictions
mlp_results_df = pd.DataFrame({
    'ID': range(len(mlp_predictions)),
    'TARGET': mlp_predictions
})
mlp_results_df.to_csv('mlp_predictions.csv', index=False)
print("MLP predictions have been saved to 'mlp_predictions.csv'")

[1m1993/1993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
MLP predictions have been saved to 'mlp_predictions.csv'
