In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from scipy import stats

# Load the housing-price table from the CSV next to the notebook, then pull
# the "Year" column (regressor) and "Price" column (response) out as arrays.
data = pd.read_csv("房价预测.csv")
x, y = (data[column].values for column in ("Year", "Price"))

In [None]:
# Fit Price ~ Year with scikit-learn's LinearRegression.
# The estimator is given a 2-D feature matrix, so the year vector is
# reshaped into a single column once and reused for fitting and prediction.
year_column = x.reshape(-1, 1)
model = LinearRegression()
model.fit(year_column, y)
y_result = model.predict(year_column)

In [None]:
# Run the same regression through scipy to obtain inference statistics
# (correlation coefficient, p-value and the standard error of the slope).
linreg = stats.linregress(x, y)
slope, intercept = linreg.slope, linreg.intercept
r_value, p_value, std_err = linreg.rvalue, linreg.pvalue, linreg.stderr

# Fitted values on the observed years, used for the plot.
y_fit = intercept + slope * x

# t statistic of the slope: estimate divided by its standard error.
t_value = slope / std_err

# Mean squared error of the scikit-learn predictions against the observed prices.
mse = data['Price'].sub(y_result).pow(2).mean()


In [None]:
# Scatter of the observed prices with the fitted regression line on top,
# drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(5, 3))
ax.scatter(x, y, label='Original Data')
ax.plot(x, y_fit, color='red', label='Fitted Line')
ax.set_xlabel('Year')
ax.set_ylabel('Price (Unit: Thousand Yuan/m²)')
ax.set_title('Nanjing Housing Price Trend')
ax.legend()
ax.grid(True)

In [None]:
# Print the regression summary statistics.
# stats.linregress returns the Pearson correlation coefficient r, not R^2,
# so it is squared here to report the coefficient of determination that the
# "R^2" label announces.
print(f"R^2为: {r_value ** 2}")
print(f"t变量为: {t_value}")
print(f"p变量为: {p_value}")
print(f"均方误差为: {mse}")
print(f"斜率为: {slope}")
print(f"截距为: {intercept}")
print(f"斜率标准差为: {std_err}")

In [None]:
# Forecast future housing prices and the 95% interval around each forecast.
future_years = np.array([2014, 2015, 2016, 2017, 2018, 2019, 2020]).reshape(-1, 1)
future_prices = model.predict(future_years)


def tinv(p, df):
    """Return the two-sided Student-t critical value for significance level p with df degrees of freedom."""
    return abs(stats.t.ppf(p / 2, df))


# A simple linear regression estimates two parameters (slope and intercept),
# which leaves n - 2 degrees of freedom for the residuals.
n_obs = len(x)
dof = n_obs - 2
if dof <= 0:
    raise ValueError("at least 3 observations are required to compute the interval")
ts = tinv(0.05, dof)  # 95% level

# Residual variance. It must be SSE / (n - 2); using the plain mean squared
# error (SSE / n) underestimates the variance and makes the interval too narrow.
residuals = y - model.predict(x.reshape(-1, 1))
resid_var = np.sum(residuals ** 2) / dof

# Standard error of a forecast for a new observation at each future year.
# The leading "1 +" term accounts for the noise of the new observation itself,
# so strictly speaking this is a prediction interval.
x_mean = x.mean()
s_xx = np.sum((x - x_mean) ** 2)
pred_std = np.sqrt(
    resid_var * (1 + 1 / n_obs + (future_years - x_mean) ** 2 / s_xx)
).reshape(-1, 1)

# Half-width of the interval for every forecast year (shape: (n_years, 1)).
interval = pred_std * ts

print(f"2014-2020预测值：{future_prices}")
print(f"置信区间：")
# Print each forecast together with its lower and upper bound.
for i in range(len(future_years)):
    half_width = interval[i][0]
    print(f"{future_years[i][0]}年预测均价：{future_prices[i]} 置信区间：[{future_prices[i]-half_width},{future_prices[i]+half_width}]")