In [10]:
import json
import os

import joblib
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score

In [11]:
def _to_floats(mapping, missing=-1.0):
    """Convert a mapping's values to floats, encoding ""/None as ``missing``."""
    return [
        missing if (value is None or value == "") else float(value)
        for value in mapping.values()
    ]


def process_data(path):
    """Load a JSON dataset and convert it to numeric input/target arrays.

    The file must contain a list of records, each with a ``"user_info"``
    mapping (model inputs) and a ``"parameter"`` mapping (regression targets).
    Every value is converted with ``float``; a missing value (empty string or
    JSON ``null``) is encoded as ``-1.0``.

    Args:
        path: Path to the JSON file.

    Returns:
        A tuple ``(inputs, outputs)`` of float arrays shaped
        ``(num_samples, num_user_info)`` and ``(num_samples, num_features)``.
        Column counts are taken from the first record and column order follows
        each record's own key order. An empty dataset yields two arrays of
        shape ``(0, 0)``.

    Raises:
        ValueError: If a value is a non-numeric string, or a record's field
            count cannot be assigned into a row sized from the first record.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(path, encoding="utf-8") as f:
        datas = json.load(f)

    # Nothing to size the arrays from, so return empty arrays instead of
    # failing with an IndexError on datas[0].
    if not datas:
        return np.empty((0, 0)), np.empty((0, 0))

    num_samples = len(datas)
    inputs = np.empty((num_samples, len(datas[0]["user_info"])))
    outputs = np.empty((num_samples, len(datas[0]["parameter"])))

    for i, data in enumerate(datas):
        inputs[i] = _to_floats(data["user_info"])
        outputs[i] = _to_floats(data["parameter"])

    return inputs, outputs

In [12]:
# Load the type_1 training and validation splits; process_data returns an
# (inputs, outputs) pair of arrays for each file.
x_train, y_train = process_data('../train_data/type_1/train.json')
x_val, y_val = process_data('../train_data/type_1/valid.json')

In [13]:
# Sanity check: (samples, user_info fields) and (samples, parameter fields).
x_train.shape, y_train.shape

((6087, 61), (6087, 119))

In [14]:
# Hyperparameter search space; scipy's randint(low, high) samples integers
# from the half-open range [low, high).
param_distributions = dict(
    n_estimators=randint(50, 200),
    max_depth=randint(3, 10),
    min_samples_split=randint(2, 20),
)

# Randomized search: 50 sampled configurations, each scored by 5-fold
# cross-validated R2 on the training split. Seeds are fixed for repeatability.
rf_regressor = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(
    estimator=rf_regressor,
    param_distributions=param_distributions,
    n_iter=50,
    cv=5,
    scoring='r2',
    random_state=42,
)
random_search.fit(x_train, y_train)

In [15]:
# Save the best estimator found by the search.
best_model = random_search.best_estimator_
model_path = './models/rf_model_type_1.pkl'
# joblib.dump does not create missing directories, so make sure the target
# folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(best_model, model_path)

['rf_regressor.pkl']

In [16]:
# Reload the persisted model so the evaluation runs on the saved artifact.
# NOTE: joblib.load unpickles the file; only load model files you trust.
best_model = joblib.load('./models/rf_model_type_1.pkl')

# Predict on the validation set with the best model.
y_val_pred = best_model.predict(x_val)

# Compute and print the R2 score on the validation set.
val_r2 = r2_score(y_val, y_val_pred)
print(f'Validation R2 Score: {val_r2}')

Validation R2 Score: 0.17555657854959658


In [17]:
# Randomly choose one validation sample to inspect its prediction.
# With a single argument, randint draws from [0, len(y_val)).
random_index = np.random.randint(len(y_val))

real, pred = y_val[random_index], y_val_pred[random_index]

In [21]:
# Show every row when a DataFrame is displayed (global pandas option).
pd.set_option('display.max_rows', None)

# Side-by-side table of ground truth ('真实值') and prediction ('预测值') for
# the selected sample. Both columns are rounded the same way so the truth is
# not truncated while the prediction is rounded to nearest.
results = pd.DataFrame({
    '真实值': [round(i) for i in real],
    '预测值': [round(i) for i in pred]
})

results

Unnamed: 0,真实值,预测值
0,3,7
1,42,33
2,24,26
3,24,26
4,24,26
5,24,26
6,24,26
7,24,26
8,45,37
9,45,38
