In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 1. Load the data
# Both CSV files are expected under ./dataset/ relative to the notebook.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)

# Sanity check on the shapes: expected (1460, 81) for train and (1459, 80) for
# test -- the test set has one column fewer because it lacks the target, SalePrice.
print("训练集形状:", train_df.shape)
print("测试集形状:", test_df.shape)

# Preview the first rows of the training set.
train_df.head()

训练集形状: (1460, 81)
测试集形状: (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# 2. Prepare the data
# Use a single feature, 'GrLivArea' (living area), to predict 'SalePrice'.
feature_cols = ['GrLivArea']
X_train_full = train_df[feature_cols]
y_train_full = train_df['SalePrice']

# Features of the rows we actually have to predict (the Kaggle test set).
# LinearRegression raises on NaN input, so fill any gaps. The fill value is the
# TRAINING-set mean rather than the test-set mean: preprocessing statistics
# should be learned from the training data only, so that a test row's
# prediction does not depend on which other rows happen to be in the test file.
X_test_final = test_df[feature_cols].fillna(X_train_full.mean())

# 3. Train the model
model = LinearRegression()
model.fit(X_train_full, y_train_full)

# Report the fitted line: price = coef * area + intercept.
print("模型训练完毕！")
print(f"房价 = {model.coef_[0]:.2f} * 面积 + {model.intercept_:.2f}")

模型训练完毕！
房价 = 107.13 * 面积 + 18569.03


In [7]:
# 4. Predict sale prices for the test set (test.csv)
predictions = model.predict(X_test_final)

# 5. Build the submission table
# Kaggle expects exactly two columns: Id, SalePrice
output = test_df[['Id']].assign(SalePrice=predictions)

# Write it out as CSV (without the DataFrame index) and confirm to the user.
output.to_csv('my_submission.csv', index=False)
print(
    "✅ 文件已生成: my_submission.csv",
    "快去文件夹里看看有没有这个文件！",
    sep="\n",
)

✅ 文件已生成: my_submission.csv
快去文件夹里看看有没有这个文件！
