In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans
# Custom preprocessor
class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Feature-engineering transformer for the bike-demand data set.

    Steps applied to a copy of the input frame (the caller's frame is never
    modified):

    1. ``log1p`` on ``visibility``, ``snowdepth`` and ``precip``.
    2. Sin/cos encoding of ``month``, ``day_of_week`` and ``hour_of_day``;
       the raw time columns are then dropped.
    3. KMeans (3 clusters) on the weather columns; the cluster id is mapped to
       a quality name and one-hot encoded with the first category dropped.
    4. Standardisation of the remaining numeric columns.

    ``transform`` returns ``holiday``, ``weekday``, the weather dummy columns
    and the scaled numeric columns. The raw weather columns are used only for
    clustering and are not part of the output.

    Attributes set by ``fit``:
        weather_dummies_columns: column index of the weather dummy variables.
        categorical_features: names of the pass-through / dummy columns.
        numeric_features: names of the columns handed to the scaler.
    """

    # Columns compressed with log1p.
    LOG_FEATURES = ('visibility', 'snowdepth', 'precip')
    # (column, period) pairs encoded as a sin/cos pair.
    CYCLIC_FEATURES = (('month', 12), ('day_of_week', 7), ('hour_of_day', 24))
    # Columns fed to KMeans; they are excluded from the scaled numeric output.
    WEATHER_FEATURES = ('summertime', 'temp', 'dew', 'humidity', 'snowdepth',
                        'windspeed', 'cloudcover', 'visibility', 'precip', 'snow')
    # Columns passed through without scaling.
    PASSTHROUGH_FEATURES = ('holiday', 'weekday')
    # Fixed category order for the one-hot encoding. The first entry is the
    # dropped baseline, which matches what get_dummies(drop_first=True) yields
    # when all three qualities are present.
    WEATHER_QUALITIES = ('bad_weather', 'good_weather', 'neutral_weather')

    def __init__(self):
        self.scaler = StandardScaler()
        self.kmeans = KMeans(n_clusters=3, random_state=42)
        self.weather_dummies_columns = None
        self.categorical_features = None
        self.numeric_features = None

    def fit(self, X, y=None):
        """Fit the KMeans model and the scaler on training data.

        Args:
            X: DataFrame containing the log, time, weather and pass-through
                columns named in the class constants.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        X = self._engineer_base_features(X)
        # Fit KMeans on the training data.
        self.kmeans.fit(X[list(self.WEATHER_FEATURES)])
        X, weather_dummies = self._attach_weather_quality(X, self.kmeans.labels_)
        # Remember the dummy column names.
        self.weather_dummies_columns = weather_dummies.columns

        # Select features: everything that is neither pass-through, target,
        # cluster bookkeeping, dummy nor raw weather column gets scaled.
        self.categorical_features = (
            list(self.PASSTHROUGH_FEATURES) + list(self.weather_dummies_columns)
        )
        excluded = (
            set(self.PASSTHROUGH_FEATURES)
            | {'increase_stock', 'weather_cluster', 'weather_quality'}
            | set(self.weather_dummies_columns)
            | set(self.WEATHER_FEATURES)
        )
        self.numeric_features = [col for col in X.columns if col not in excluded]

        # Fit the StandardScaler on the training data.
        self.scaler.fit(X[self.numeric_features])
        return self

    def transform(self, X):
        """Apply the fitted preprocessing to ``X``.

        Args:
            X: DataFrame with the same columns that ``fit`` received.

        Returns:
            DataFrame indexed like ``X`` holding the categorical columns
            followed by the standardised numeric columns.
        """
        X = self._engineer_base_features(X)
        # Assign clusters with the already-fitted KMeans model.
        cluster_labels = self.kmeans.predict(X[list(self.WEATHER_FEATURES)])
        X, _ = self._attach_weather_quality(X, cluster_labels)
        # Standardise.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.numeric_features]),
            columns=self.numeric_features,
            index=X.index,
        )
        # Combine features.
        return pd.concat([X[self.categorical_features], X_scaled], axis=1)

    def label_weather_cluster(self, cluster_label):
        """Map a KMeans cluster id (0, 1 or 2) to a weather-quality name.

        Returns ``None`` for any other value.
        """
        if cluster_label == 1:
            return 'bad_weather'
        if cluster_label == 0:
            return 'good_weather'
        if cluster_label == 2:
            return 'neutral_weather'
        return None

    def _engineer_base_features(self, X):
        """Return a copy of ``X`` with log and cyclic time features applied."""
        X = X.copy()
        # Log transform.
        for col in self.LOG_FEATURES:
            X[col] = np.log1p(X[col])
        # Cyclic time encoding.
        for col, period in self.CYCLIC_FEATURES:
            X[f'{col}_sin'] = np.sin(2 * np.pi * X[col] / period)
            X[f'{col}_cos'] = np.cos(2 * np.pi * X[col] / period)
        # Drop the raw time columns.
        return X.drop(columns=[col for col, _ in self.CYCLIC_FEATURES])

    def _attach_weather_quality(self, X, cluster_labels):
        """Add cluster id, quality name and dummy columns; cast bools to int.

        Returns the extended frame together with the dummy-column frame.
        """
        X['weather_cluster'] = cluster_labels
        X['weather_quality'] = X['weather_cluster'].apply(self.label_weather_cluster)
        weather_dummies = self._weather_dummies(X['weather_quality'])
        X = pd.concat([X, weather_dummies], axis=1)
        # Convert boolean columns to integers.
        for col in X.select_dtypes(include=['bool']).columns:
            X[col] = X[col].astype(int)
        return X, weather_dummies

    def _weather_dummies(self, quality):
        """One-hot encode weather quality against the fixed category list.

        Encoding against fixed categories keeps the dropped baseline the same
        for every batch. A plain ``get_dummies(..., drop_first=True)`` drops
        whichever category sorts first among those *present*, so a batch
        without ``bad_weather`` rows would lose its ``good_weather`` column.
        """
        encoded = pd.Series(
            pd.Categorical(quality, categories=list(self.WEATHER_QUALITIES)),
            index=quality.index,
        )
        return pd.get_dummies(encoded, prefix='weather', drop_first=True).astype(int)

# Model evaluation function
def evaluate_model(y_true, y_pred):
    """Score binary predictions against the ground truth.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.

    Returns:
        Tuple ``(accuracy, f1, report)`` where ``report`` is the text produced
        by ``classification_report`` using the two demand class names.
    """
    class_names = ['low_bike_demand', 'high_bike_demand']
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        classification_report(y_true, y_pred, target_names=class_names),
    )

# Hyperparameter tuning function
def tune_random_forest_rs(X_train, y_train, cv=5, scoring='f1', n_iter=100):
    """Randomised hyperparameter search for a preprocessing + random-forest pipeline.

    Args:
        X_train: Training features (DataFrame accepted by ``CustomPreprocessor``).
        y_train: Binary targets encoded as 0 / 1; any array-like (ndarray,
            list, pandas Series) of any shape that flattens to one label per row.
        cv: Cross-validation specification forwarded to ``RandomizedSearchCV``.
        scoring: Scoring name forwarded to ``RandomizedSearchCV``.
        n_iter: Number of sampled parameter settings.

    Returns:
        The best pipeline found, as exposed by ``best_estimator_``.

    Raises:
        ValueError: If ``y_train`` does not contain both class 0 and class 1,
            because the class-weight ratio would be undefined.
    """
    # Normalise once so Series / list / column-vector inputs all work.
    y_train = np.asarray(y_train).ravel()
    num_pos = int(np.sum(y_train == 1))
    num_neg = int(np.sum(y_train == 0))
    if num_pos == 0 or num_neg == 0:
        raise ValueError(
            "y_train must contain both classes 0 and 1 "
            f"(found {num_neg} negatives and {num_pos} positives)"
        )
    ratio = num_neg / num_pos
    param_dist = {
        'classifier__n_estimators': list(range(500, 750, 5)),
        'classifier__max_depth': list(range(15, 25)),
        'classifier__min_samples_split': list(range(3, 32)),
        'classifier__min_samples_leaf': list(range(3, 32)),
        'classifier__max_features': ['sqrt', 'log2', 0.1, 0.2, 0.3, 0.4, 0.5],
        'classifier__bootstrap': [True, False],
        # Candidate weightings: built-in balancing, up-weighting the positive
        # class, down-weighting the negative class, and no weighting.
        'classifier__class_weight': ['balanced', {0: 1, 1: ratio}, {0: 1 / ratio, 1: 1}, {0: 1, 1: 1}],
        'classifier__criterion': ["gini", "entropy", "log_loss"],
        'classifier__warm_start': [True, False],
    }
    # Build the pipeline so preprocessing is refit inside every CV fold.
    pipeline = Pipeline([
        ('preprocessor', CustomPreprocessor()),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        random_state=42,
    )
    random_search.fit(X_train, y_train)
    return random_search.best_estimator_

# Main program
if __name__ == "__main__":
    # Load data
    data = pd.read_csv('data/training_data_fall2024.csv')

    # Define the features and the target variable. Unknown labels would map
    # to NaN and silently yield float targets, so fail loudly instead.
    label_map = {'low_bike_demand': 0, 'high_bike_demand': 1}
    y_mapped = data['increase_stock'].map(label_map)
    if y_mapped.isna().any():
        unexpected = sorted(
            data.loc[y_mapped.isna(), 'increase_stock'].astype(str).unique()
        )
        raise ValueError(f"Unexpected increase_stock values: {unexpected}")
    y_all = y_mapped.astype(int).to_numpy()
    X_all = data.drop(columns=['increase_stock'])

    # Hyperparameter tuning
    print("Tuning Random Forest...")
    best_model = tune_random_forest_rs(X_all, y_all, cv=10, scoring='f1', n_iter=100)

    # Cross-validated predictions.
    # NOTE(review): the hyperparameters were selected on this same data, so
    # the scores below are an optimistic estimate; a held-out split or nested
    # cross-validation would be needed for an unbiased one.
    y_pred = cross_val_predict(best_model, X_all, y_all, cv=10)

    # Evaluate the model
    accuracy, f1, report = evaluate_model(y_all, y_pred)
    print(f"Cross-validated Accuracy: {accuracy:.2f}")
    print(f"Cross-validated F1 Score: {f1:.2f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", confusion_matrix(y_all, y_pred))


Tuning Random Forest...




Cross-validated Accuracy: 0.86
Cross-validated F1 Score: 0.68
Classification Report:
                   precision    recall  f1-score   support

 low_bike_demand       0.95      0.88      0.91      1312
high_bike_demand       0.58      0.80      0.68       288

        accuracy                           0.86      1600
       macro avg       0.77      0.84      0.79      1600
    weighted avg       0.89      0.86      0.87      1600

Confusion Matrix:
 [[1148  164]
 [  57  231]]
