In [None]:
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Load the California housing dataset through scikit-learn's fetcher.
# Later cells read its `DESCR`, `data`, `feature_names` and `target` attributes.
dataset = fetch_california_housing()

In [None]:
# Print the dataset's DESCR attribute (its bundled description text) for reference.
print(dataset.DESCR)

In [None]:
# Explanatory variables: the feature matrix, with columns named by
# `dataset.feature_names`.
df_exp = pl.DataFrame(
    dataset.data,
    dataset.feature_names,
)

# Objective variable: the regression target as a single named column.
df_obj = pl.DataFrame(
    dataset.target,
    ["HousingPrices"],
)

# Put features and target side by side in one frame, then preview it.
df_total = pl.concat(
    [df_exp, df_obj],
    how="horizontal",
)
df_total.head()

In [None]:
# Pairwise correlation between every column (features and target),
# rendered as an annotated heatmap on a fresh 12x9 figure.
plt.figure(figsize=(12, 9))
corr_matrix = df_total.to_pandas().corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="cividis",
    linewidths=0.5,
)

In [None]:
# Column names reused by the following cells for the single-feature regression:
# one explanatory variable and the objective (target) variable.
col_exp_var = "MedInc"
col_obj_var = "HousingPrices"

# Scatter plot of the explanatory variable against the objective variable
# on the unfiltered data.
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot()
# Hand matplotlib plain 1-D arrays instead of single-column DataFrames, so the
# plot does not depend on how a 2-D frame happens to be coerced.
ax.scatter(
    df_total.get_column(col_exp_var).to_numpy(),
    df_total.get_column(col_obj_var).to_numpy(),
)
ax.set_xlabel(col_exp_var)
ax.set_ylabel(col_obj_var)
ax.set_title("California housing scatter")  # fixed misspelling "Califolnia"

# Last expression of the cell: summary statistics of the two plotted columns.
df_total.select([col_exp_var, col_obj_var]).describe()

In [None]:
# Trim the upper tail of the explanatory variable: keep rows strictly below its
# 95th percentile.
# Take the quantile from the Series so `q_95` is a plain scalar. The
# DataFrame-level `quantile` returns a 1x1 DataFrame, which is not a valid
# right-hand operand for the expression comparison inside `filter`.
q_95 = df_total.get_column(col_exp_var).quantile(0.95)
df_total_prc = df_total.filter(pl.col(col_exp_var) < q_95)
print("95%点の分位数: ", q_95)


# Same scatter as before, now on the filtered data.
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot()
ax.scatter(
    df_total_prc.get_column(col_exp_var).to_numpy(),
    df_total_prc.get_column(col_obj_var).to_numpy(),
)
ax.set_xlabel(col_exp_var)
ax.set_ylabel(col_obj_var)
ax.set_title("California housing scatter")  # fixed misspelling "Califolnia"

# Last expression of the cell: summary statistics after filtering.
df_total_prc.select([col_exp_var, col_obj_var]).describe()

In [None]:
# Two-column array: column 0 holds the explanatory variable, column 1 the
# objective variable (order fixed by the select list).
XY = df_total_prc.select([col_exp_var, col_obj_var]).to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
train_ds, test_ds = train_test_split(
    XY,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Fit ordinary least squares on the training rows.
# `train_ds[:, :1]` keeps column 0 as an (n, 1) feature matrix; column 1 is the target.
simple_lr = LinearRegression()
simple_lr.fit(train_ds[:, :1], train_ds[:, 1])

In [None]:
# Pull the fitted parameters out of the model: slope `a`, intercept `b`.
a = simple_lr.coef_[0]
b = simple_lr.intercept_
# R^2 as reported by `score`, evaluated on the training rows.
r2 = simple_lr.score(train_ds[:, :1], train_ds[:, 1])

# Report each quantity on its own line, then the fitted line equation.
for label, value in (
    ("Intercept: ", b),
    ("Regression Coefficient: ", a),
    ("Coefficient of determination: ", r2),
    ("Regression Line: ", f"y = {a}x + {b}"),
):
    print(label, value)

In [None]:
# Overlay the fitted regression line on the train/test scatter.
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot()
# Hollow markers: red circles for test rows, blue squares for training rows.
ax.scatter(test_ds[:, 0], test_ds[:, 1], marker="o", facecolor="None", edgecolors="red")
ax.scatter(train_ds[:, 0], train_ds[:, 1], marker="s", facecolor="None", edgecolors="blue")
ax.set_xlabel(col_exp_var)
ax.set_ylabel(col_obj_var)
ax.set_title("California housing scatter")  # fixed misspelling "Califolnia"

# Read the x-limits after the scatters are drawn so the line spans the
# plotted data range; build the x grid once and reuse it for both the
# prediction and the plot call.
x_range = ax.get_xlim()
x_line = np.arange(x_range[0], x_range[1], 0.1)
reg_line = a * x_line + b

ax.plot(x_line, reg_line, color="red")

# Annotate the plot with the R^2 value at data coordinates (0, 0).
ax.text(0, 0, f"$R^2={r2:0.3f}$")

