<a href="https://colab.research.google.com/github/parkjh21c/ML-DL/blob/main/%EC%8B%A4%EC%8A%B51.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [4]:
# Load the California housing dataset object and show its bundled
# description text so the reader can see what each feature means.
dataset = fetch_california_housing()

dataset_description = dataset.DESCR
print(dataset_description)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [5]:
# Wrap the raw arrays in pandas objects: the feature matrix becomes a
# DataFrame labelled with the dataset's feature names, and the target
# becomes a Series.
feature_columns = dataset.feature_names
x_data = pd.DataFrame(data=dataset.data, columns=feature_columns)
y_data = pd.Series(data=dataset.target)

# Show the features first, then the target.
for table in (x_data, y_data):
    print(table)

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  
0        -122.23  
1

In [6]:
# Report the feature table's dimensions as a (rows, columns) tuple.
n_rows, n_columns = x_data.shape
print((n_rows, n_columns))

(20640, 8)


In [7]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
feature_summary = x_data.describe()
print(feature_summary)

             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.333333      3.000000   
25%        2.563400     18.000000      4.440716      1.006079    787.000000   
50%        3.534800     29.000000      5.229129      1.048780   1166.000000   
75%        4.743250     37.000000      6.052381      1.099526   1725.000000   
max       15.000100     52.000000    141.909091     34.066667  35682.000000   

           AveOccup      Latitude     Longitude  
count  20640.000000  20640.000000  20640.000000  
mean       3.070655     35.631861   -119.569704  
std       10.386050      2.135952      2.003532  
min        0.692308     32.540000   -124.350000  
25%        2.429741     33.930000   -1

In [8]:
# Hold out 30% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same partition (and therefore the same
# R^2 scores); without random_state every run shuffles the rows differently.
TEST_SIZE = 0.3
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=TEST_SIZE, random_state=SPLIT_SEED
)

# Sanity checks: the two partitions together account for every row, and
# features and targets stay the same length within each partition.
assert len(x_train) + len(x_test) == len(x_data)
assert len(y_train) == len(x_train) and len(y_test) == len(x_test)

print(x_train)
print(y_train)

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
5614   2.7308      38.0  4.080189   1.084906      1724.0  4.066038     33.79   
20411  5.5224      23.0  5.366995   1.041872      1338.0  3.295567     34.19   
7695   4.3882      29.0  5.061818   1.058182      1278.0  2.323636     33.96   
13268  3.6667      26.0  6.122340   1.063830       593.0  3.154255     34.09   
147    3.2632      52.0  4.745652   1.010870      1129.0  2.454348     37.80   
...       ...       ...       ...        ...         ...       ...       ...   
19734  3.3594      28.0  5.729904   1.073955       873.0  2.807074     40.20   
8283   7.5000      45.0  6.233129   1.055215       361.0  2.214724     33.78   
8649   5.6870      34.0  5.402746   1.011442       915.0  2.093822     33.86   
640    2.8125      29.0  4.333680   1.088358      2024.0  2.103950     37.72   
14536  4.5086      16.0  5.715640   0.992891      1612.0  3.819905     32.93   

       Longitude  
5614     -118.25  
2

In [10]:
# Ordinary least-squares linear model; nothing is learned until .fit() runs.
# fit_intercept=True is the library default, spelled out here for the reader.
estimator = LinearRegression(fit_intercept=True)

In [11]:
# Fit the model on the training partition only; the held-out rows stay unseen.
estimator.fit(X=x_train, y=y_train)

In [12]:
# R^2 on the same rows the model was fitted to (in-sample fit quality).
y_predict = estimator.predict(X=x_train)
score = r2_score(y_true=y_train, y_pred=y_predict)
print(score)

0.6090105623860487


In [13]:
# R^2 on the held-out rows. Note this overwrites the y_predict / score
# values computed for the training partition.
y_predict = estimator.predict(X=x_test)
score = r2_score(y_true=y_test, y_pred=y_predict)
print(score)

0.5956963045184962
