In [None]:
import pandas as pd

# Read the housing CSV (path is relative to the notebook's working directory)
# into the DataFrame that every later cell starts from.
datos = pd.read_csv(filepath_or_buffer="housing.csv")

In [None]:
# Preview the first five rows of the raw data.
datos.head(n=5)

In [None]:
# Count how many rows fall into each ocean_proximity category.
datos.ocean_proximity.value_counts()

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
datos.info()

In [None]:
# Summary statistics of the raw frame.
datos.describe()

In [None]:
# One histogram per column: 30 bins, black bar outlines, on a 15x8 figure.
datos.hist(
    bins=30,
    figsize=(15, 8),
    edgecolor="black",
)

In [None]:
import seaborn as sb

# Geographic scatter of every row in `datos`: colour encodes
# median_house_value and marker size encodes population (marker sizes are
# mapped to the 10-300 range).
# Longitude goes on the x-axis and latitude on the y-axis so the points are
# drawn in the usual map orientation; with the axes swapped the map comes out
# transposed.
sb.scatterplot(
    data=datos,
    x="longitude",
    y="latitude",
    hue="median_house_value",
    palette="coolwarm",
    size="population",
    sizes=(10, 300),
)

In [None]:
# Same geographic scatter, restricted to the rows whose median_income is
# greater than 14 (NOTE(review): the units of median_income are not shown in
# this notebook; confirm what 14 means).
# Longitude is on x and latitude on y so the plot keeps the usual map
# orientation instead of being transposed.
sb.scatterplot(
    data=datos[datos["median_income"] > 14],
    x="longitude",
    y="latitude",
    hue="median_house_value",
    palette="coolwarm",
)

In [None]:
# Re-check the non-null counts of the raw frame before the rows with missing
# values are dropped in the next cell.
datos.info()

In [None]:
# Keep only the rows that have no missing value in any column; the raw
# `datos` frame is left untouched.
datos_na = datos.dropna(axis=0, how="any")

In [None]:
# Print dtypes and non-null counts of the frame produced by dropna().
datos_na.info()

In [None]:
# Convert the categorical feature to numeric.
# The feature in question is the proximity to the ocean; display it first.
datos_na.loc[:, "ocean_proximity"]


In [None]:
# Category frequencies of ocean_proximity after dropping incomplete rows.
# Original note: "1,2,3,4,5" - presumably the option of coding the categories
# as consecutive integers; confirm with the author.
datos_na.ocean_proximity.value_counts()

In [None]:
# Dummies / one-hot encoding: every category of ocean_proximity becomes its
# own integer 0/1 column, for example
#   NEAR BAY   INLAND   NEAR OCEAN
#      1          0         0
#      0          0         1
dummies = pd.get_dummies(
    data=datos_na["ocean_proximity"],
    dtype=int,
)

In [None]:
# Append the one-hot columns to the cleaned frame, matching rows by index.
datos_na = datos_na.join(dummies, how="left")

In [None]:
# Preview the first five rows to check that the dummy columns were added.
datos_na.head(n=5)

In [None]:
# Remove the original text column now that its one-hot columns are present.
datos_na = datos_na.drop(columns=["ocean_proximity"])

In [None]:
# Preview the first five rows after removing ocean_proximity.
datos_na.head(n=5)

In [None]:
# Analysis: new features

In [None]:
# Pairwise Pearson correlation between all columns of the cleaned frame.
datos_na.corr(method="pearson")

In [None]:
# Set the default figure size to 15x8, then draw the correlation matrix as an
# annotated heatmap.
sb.set(rc={'figure.figsize': (15, 8)})
sb.heatmap(
    data=datos_na.corr(),
    annot=True,
    cmap="YlGnBu",
)

In [None]:
# Correlation of every column with median_house_value, strongest positive
# first.
datos_na.corr().loc[:, "median_house_value"].sort_values(ascending=False)

In [None]:
# Scatter of median_house_value (x) against median_income (y).
sb.scatterplot(
    data=datos_na,
    x="median_house_value",
    y="median_income",
)

In [None]:
# New feature: total_bedrooms divided by total_rooms, computed row by row.
datos_na["bedroom_ratio"] = datos_na["total_bedrooms"].div(datos_na["total_rooms"])

In [None]:
# Redraw the annotated correlation heatmap (15x8 figure) so that it includes
# the bedroom_ratio column.
sb.set(rc={'figure.figsize': (15, 8)})
sb.heatmap(
    data=datos_na.corr(),
    cmap="YlGnBu",
    annot=True,
)

In [None]:
# Separate the features from the label: X holds every column except
# median_house_value, y holds median_house_value only.
X = datos_na.drop(columns=["median_house_value"])
y = datos_na.loc[:, "median_house_value"]

In [None]:
# Split the data into two parts: a training set and a test set (20% of the
# rows go to the test set).
from sklearn.model_selection import train_test_split

# random_state fixes the shuffle so the same rows land in each set on every
# run; without it the scores and errors computed below change from run to run.
X_ent, X_pru, y_ent, y_pru = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Create the (still unfitted) linear regression estimator.
modelo = LinearRegression(fit_intercept=True)

In [None]:
# Fit the model on the training features and labels.
modelo.fit(X=X_ent, y=y_ent)

In [None]:
# Predict the label for every row of the test features.
predicciones = modelo.predict(X=X_pru)

In [None]:
# Put the predictions next to the real test labels for a side-by-side look.
comparativa = dict(
    [
        ("Prediccion", predicciones),
        ("Valor Real", y_pru),
    ]
)
pd.DataFrame(data=comparativa)

In [None]:
# Overfitting check: print the model's score on the training set and then on
# the test set, one per line, so the two can be compared.
print(
    modelo.score(X_ent, y_ent),
    modelo.score(X_pru, y_pru),
    sep="\n",
)

In [None]:
# Error: mean squared error between the real test labels and the predictions.
from sklearn.metrics import mean_squared_error
import numpy as np

mse = mean_squared_error(y_true=y_pru, y_pred=predicciones)

In [None]:
# Display the mean squared error computed in the previous cell.
mse

In [None]:
# Root mean squared error: the square root of the MSE.
rmse = np.sqrt(mse)

In [None]:
# Display the root mean squared error.
rmse

In [None]:
# Scaling the features


In [None]:
# Summary statistics of the cleaned frame, shown before scaling.
datos_na.describe()

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Learn the per-column mean and standard deviation from the training set only
# and scale it with them.
X_ent_esc = scaler.fit_transform(X_ent)
# Scale the test set with the statistics learned from the training set.
# Calling fit_transform here would refit the scaler on the test data, so the
# two sets would be scaled differently and test information would leak into
# the preprocessing.
X_pru_esc = scaler.transform(X_pru)

In [None]:
# Display the unscaled training features for comparison with the scaled ones.
X_ent

In [None]:
# Wrap the scaled training array in a DataFrame so it is displayed as a table.
pd.DataFrame(data=X_ent_esc)