In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
import os
from urllib import request
import tarfile
import gzip

In [None]:
# Download the housing archive and unpack it under ./data.
url = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz"
response = request.urlretrieve(url, "housing.tgz")
with tarfile.open('housing.tgz', 'r') as file:
    # The archive comes from the network, so no member may escape ./data.
    if hasattr(tarfile, 'data_filter'):
        # Extraction filters are available: 'data' rejects absolute paths,
        # '..' traversal, links pointing outside the target and device files.
        file.extractall(path='./data', filter='data')
    else:
        # Interpreter without extraction filters: verify every member path
        # by hand before extracting anything (link members are not inspected).
        target_dir = os.path.realpath('./data')
        for member in file.getmembers():
            member_path = os.path.realpath(os.path.join(target_dir, member.name))
            if os.path.commonpath([target_dir, member_path]) != target_dir:
                raise ValueError("Unsafe path in archive: %r" % (member.name,))
        file.extractall(path='./data')

In [None]:
# Re-pack the extracted CSV as ./data/housing.csv.gz, copying it line by line.
with open('./data/housing.csv', 'rb') as raw_csv:
    with gzip.open('./data/housing.csv.gz', 'wb') as packed_csv:
        for line in raw_csv:
            packed_csv.write(line)

In [None]:
# Only the .gz copy is kept: delete the downloaded archive and the raw CSV.
for leftover in ('housing.tgz', './data/housing.csv'):
    os.remove(leftover)

In [None]:
# Load the compressed CSV; 'infer' (the default) picks gzip from the .gz suffix.
df = pd.read_csv('./data/housing.csv.gz', compression='infer')

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Print the summary that DataFrame.info() produces for the loaded frame.
df.info()

In [None]:
# How many rows fall into each ocean_proximity value.
proximity = df['ocean_proximity']
proximity.value_counts()

In [None]:
# Summary statistics for the ocean_proximity column.
df.loc[:, 'ocean_proximity'].describe()

In [None]:
# Histograms of the frame's columns (50 bins each), saved to obraz1.png.
hist_options = {'bins': 50, 'figsize': (20, 15)}
df.hist(**hist_options)
plt.savefig('obraz1.png')

In [None]:
# Latitude against longitude; the low alpha makes overlapping points show up
# as darker regions. The figure is written to obraz2.png.
df.plot.scatter(x="longitude", y="latitude", alpha=0.1, figsize=(7, 4))
plt.savefig('obraz2.png')

In [None]:
# Same longitude/latitude scatter, with marker size taken from population / 100
# and marker colour taken from median_house_value on the "jet" colormap.
# The figure is written to obraz3.png.
df.plot.scatter(
    x="longitude",
    y="latitude",
    s=df['population'] / 100,
    c='median_house_value',
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=0.4,
    figsize=(7, 3),
    label='population',
)
plt.savefig('obraz3.png')

In [None]:
# Correlation of every numeric attribute with median_house_value, strongest
# first, exported to korelacja.csv with columns atrybut / wspolczynnik_korelacji.
# The frame still contains the text column ocean_proximity, and pandas >= 2.0
# raises on non-numeric data inside corr() instead of silently dropping it,
# so the frame is narrowed to numeric columns before correlating.
(
    df.select_dtypes(include="number")
    .corr()["median_house_value"]
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={'index': 'atrybut', 'median_house_value': 'wspolczynnik_korelacji'})
    .to_csv('korelacja.csv', index=False)
)

In [None]:
# Grid of pairwise plots over the frame's columns.
sns.pairplot(data=df)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split repeatable. The last line shows the size of each part.
train_set, test_set = train_test_split(df, random_state=42, test_size=0.2)
tuple(len(part) for part in (train_set, test_set))

In [None]:
# First ten rows of the training split.
train_set.iloc[:10]

In [None]:
# Last ten rows of the test split.
test_set.iloc[-10:]

In [None]:
# Correlation matrix of the training split. Only numeric columns are kept:
# the text column ocean_proximity makes a bare corr() raise on pandas >= 2.0.
train_set.select_dtypes(include="number").corr()

In [None]:
# Correlation matrix of the test split. Only numeric columns are kept:
# the text column ocean_proximity makes a bare corr() raise on pandas >= 2.0.
test_set.select_dtypes(include="number").corr()

In [None]:
# Persist the training split to train_set.pkl.
with open('train_set.pkl', 'wb') as train_file:
    pickle.Pickler(train_file).dump(train_set)

In [None]:
# Persist the test split to test_set.pkl.
with open('test_set.pkl', 'wb') as test_file:
    pickle.Pickler(test_file).dump(test_set)