In [1]:
from math import *
import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd

#### Loading CSV files
Some of the functions below are adapted from the Kaggle notebook "50sec runtime GPU-based real estate demand" (https://www.kaggle.com/code/muhammadqasimshabbir/50sec-runtime-gpu-based-real-estate-demand), which was very helpful for the feature engineering.

In [36]:
def load_all_data():
    """Read every project CSV into a dict of DataFrames keyed by a short name.

    Each file is read with ``pd.read_csv`` and its shape is printed on
    success. Loading is best-effort: a file that cannot be read is reported
    with a printed message and left out of the result, so the returned dict
    may contain fewer keys than the list below.

    Returns:
        dict: short dataset name -> loaded DataFrame.
    """
    # (short name, path) pairs, in the order they are loaded and reported
    sources = (
        ('new', 'data/train/new_house_transactions.csv'),
        ('new_nb', 'data/train/new_house_transactions_nearby_sectors.csv'),
        ('pre', 'data/train/pre_owned_house_transactions.csv'),
        ('pre_nb', 'data/train/pre_owned_house_transactions_nearby_sectors.csv'),
        ('land', 'data/train/land_transactions.csv'),
        ('land_nb', 'data/train/land_transactions_nearby_sectors.csv'),
        ('city_idx', 'data/train/city_indexes.csv'),
        ('city_search', 'data/train/city_search_index.csv'),
        ('poi', 'data/train/sector_POI.csv'),
        ('test', 'data/test.csv'),
    )

    loaded = {}
    for name, path in sources:
        try:
            frame = pd.read_csv(path)
            loaded[name] = frame
            print(f"Loaded {name}: {frame.shape}")
        except Exception as e:
            # Deliberately non-fatal: report the failure and continue with the rest.
            print(f"Error loading {name}: {e}")

    return loaded




#### FEATURE ENGINEERING

In [56]:
def extract_datetime_features(df, date_col='month'):
    """Add calendar features derived from a "YYYY-Mon" column (e.g. "2021-Jan").

    When ``date_col`` is present, the following columns are written onto
    ``df`` itself (the frame is modified in place and also returned):

    - ``date``: parsed timestamp
    - ``Year``, ``Month_num``: calendar year and month number (1-12)
    - ``time_index``: months counted from the earliest year found in *this*
      frame, i.e. ``(Year - min Year) * 12 + Month_num``
    - ``sin_month``, ``cos_month``: cyclic encoding of the month

    Frames without ``date_col`` are returned untouched.
    """
    if date_col not in df.columns:
        return df

    # Parse the whole column in one go; no manual string splitting required.
    parsed = pd.to_datetime(df[date_col], format='%Y-%b')
    df['date'] = parsed
    df['Year'] = parsed.dt.year
    df['Month_num'] = parsed.dt.month

    years_since_first = df['Year'] - df['Year'].min()
    df['time_index'] = years_since_first * 12 + df['Month_num']

    # Map the month onto the unit circle so December and January end up adjacent.
    month_angle = 2 * np.pi * df['Month_num'] / 12
    df['sin_month'] = np.sin(month_angle)
    df['cos_month'] = np.cos(month_angle)

    return df

def extract_sector_features(df, col='sector'):
    """Replace 'sector N' labels in ``col`` with the integer N.

    The first run of digits in each label is extracted and the column is
    overwritten on ``df`` itself (the frame is modified in place and also
    returned). Frames without ``col`` are returned untouched.

    The call is idempotent: a column that is already numeric (for example
    because this function has already been applied to the frame) is left
    as-is instead of raising from the ``.str`` accessor.

    Raises:
        ValueError: if a label contains no digits, since the resulting
            missing value cannot be cast to ``int``.
    """
    if col not in df.columns:
        return df

    # Already converted on an earlier call: `.str` only works on string data.
    if pd.api.types.is_numeric_dtype(df[col]):
        return df

    # expand=False yields a Series, so a plain column (not a 1-col frame) is assigned.
    df[col] = df[col].str.extract(r'(\d+)', expand=False).astype(int)
    return df

def extract_keyword(df, col='keyword'):
    """Encode the search keywords in ``col`` as integer codes 1-30.

    The column is overwritten on ``df`` itself (the frame is modified in
    place and also returned). A keyword that is not in the lookup table
    becomes NaN. Frames without ``col`` are returned untouched.

    The call is idempotent: a column that is already numeric (i.e. already
    encoded) is left as-is. Without this check, a second call would look the
    integer codes up in the string-keyed table and silently turn the whole
    column into NaN.
    """
    keyword_dict = {'买房':1, '二手房市场':2, '公积金':3, '利率上调':4, '去库存':5, '取消限购':6, '契税':7,
                    '学区房':8, '安置':9, '房产税':10, '房价':11, '房价上涨':12, '房价下跌':13, '房价调控':14,
                    '房价走势':15, '房地产开发':16, '房地产税':17, '房屋装修':18, '房贷':19, '棚户区':20,
                    '棚户区改造':21, '租购':22, '税费':23, '落户':24, '融资':25, '购房':26, '贷款利率':27,
                    '限售':28, '限购':29, '首付':30}

    if col not in df.columns:
        return df

    # Skip columns that were already encoded; re-mapping them would wipe the codes.
    if pd.api.types.is_numeric_dtype(df[col]):
        return df

    df[col] = df[col].map(keyword_dict)
    return df

def extract_source(df, col='source'):
    """Encode the search-source labels in ``col`` as integers.

    'PC端' becomes 1 and '移动端' becomes 2; any other value becomes NaN.
    The column is overwritten on ``df`` itself (the frame is modified in
    place and also returned). Frames without ``col`` are returned untouched.

    The call is idempotent: a column that is already numeric (i.e. already
    encoded) is left as-is. Without this check, a second call would look the
    integer codes up in the string-keyed table and silently turn the whole
    column into NaN.
    """
    source_codes = {'PC端':1, '移动端':2}

    if col not in df.columns:
        return df

    # Skip columns that were already encoded; re-mapping them would wipe the codes.
    if pd.api.types.is_numeric_dtype(df[col]):
        return df

    df[col] = df[col].map(source_codes)
    return df

In [58]:
# Load every raw table, then run the same four feature-extraction steps on each.
all_data = load_all_data()
dfname_key = [*all_data]

# Each step returns its frame unchanged when the column it targets is absent,
# so the full pipeline can be applied to every table regardless of its schema.
feature_steps = (
    (extract_datetime_features, {'date_col': 'month'}),
    (extract_sector_features, {'col': 'sector'}),
    (extract_keyword, {'col': 'keyword'}),
    (extract_source, {'col': 'source'}),
)

for key in dfname_key:
    for step, step_kwargs in feature_steps:
        all_data[key] = step(all_data[key], **step_kwargs)

Loaded new: (5433, 11)
Loaded new_nb: (5360, 11)
Loaded pre: (5360, 6)
Loaded pre_nb: (5427, 6)
Loaded land: (5896, 6)
Loaded land_nb: (5025, 6)
Loaded city_idx: (7, 74)
Loaded city_search: (4020, 4)
Loaded poi: (86, 142)
Loaded test: (1152, 2)


In [68]:
# Sanity check: how many rows does each sector-keyed table hold for sector 2?
for key in dfname_key:
    table = all_data[key]
    if 'sector' not in table.columns:
        continue  # table has no sector column
    filtered_df = table[table['sector'] == 2]
    print(key, filtered_df.shape)


new (67, 17)
new_nb (67, 17)
pre (67, 12)
pre_nb (67, 12)
land (67, 12)
land_nb (67, 12)
poi (1, 142)


In [77]:
# Plain-text dump of the nearby-sector new-house transactions table.
new_nb_table = all_data['new_nb']
print(new_nb_table)

         month  sector  num_new_house_transactions_nearby_sectors  \
0     2019-Jan      35                                 129.250000   
1     2019-Jan      23                                  27.400000   
2     2019-Jan      80                                  81.285714   
3     2019-Jan      53                                  28.500000   
4     2019-Jan      84                                   8.857143   
...        ...     ...                                        ...   
5355  2024-Jul      28                                  80.600000   
5356  2024-Jul      20                                 112.857143   
5357  2024-Jul      56                                  44.250000   
5358  2024-Jul      47                                 167.000000   
5359  2024-Jul      48                                  50.500000   

      area_new_house_transactions_nearby_sectors  \
0                                   13212.500000   
1                                    2822.400000   
2              