In [5]:
import pandas as pd


train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')


missing_values_train = train_data.isnull().sum()
missing_values_test = test_data.isnull().sum()

print("Training Data Missing Values:\n", missing_values_train[missing_values_train > 0])
print("\nTest Data Missing Values:\n", missing_values_test[missing_values_test > 0])




Training Data Missing Values:
 Series([], dtype: int64)

Test Data Missing Values:
 Series([], dtype: int64)


In [6]:
for column in train_data.columns:
    if train_data[column].dtype == 'object':
        train_data[column].fillna(train_data[column].mode()[0], inplace=True)
        test_data[column].fillna(test_data[column].mode()[0], inplace=True)
    else:
        train_data[column].fillna(train_data[column].median(), inplace=True)
        test_data[column].fillna(test_data[column].median(), inplace=True)

In [8]:
from sklearn.preprocessing import OneHotEncoder

# 确保数据的数据类型是一致的
X_train = X_train.astype(str)
X_test = X_test.astype(str)

# 明确指定分类特征
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

# 初始化OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# 使用训练数据拟合encoder
encoder.fit(X_train[categorical_features])

# 对训练和测试数据进行编码
X_train_encoded = encoder.transform(X_train[categorical_features])
X_test_encoded = encoder.transform(X_test[categorical_features])

# 将编码后的数据与原始数据合并
X_train = pd.concat([X_train.drop(categorical_features, axis=1), pd.DataFrame(X_train_encoded)], axis=1)
X_test = pd.concat([X_test.drop(categorical_features, axis=1), pd.DataFrame(X_test_encoded)], axis=1)





In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# 线性回归
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_predictions))

# 随机森林回归
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))

print(f"Linear Regression RMSE: {lr_rmse}")
print(f"Random Forest Regression RMSE: {rf_rmse}")


Linear Regression RMSE: 39469.1550526339
Random Forest Regression RMSE: 41440.85938693507
