# House Prices Prediction Usando ML

Fonte: Canal Eureca and livro "Hands-On Machine Learning" de Aurélien Géron

## Bibliotecas

In [1]:
# bibibliotecas genéricas
import pandas as pd
import numpy as np

# Visualização
import matplotlib.pyplot as plt

# Dataset
from sklearn.datasets import fetch_california_housing

In [2]:
# importando o data set
dados = fetch_california_housing()

In [3]:
print(dados.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [4]:
dados.data

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [5]:
dados.data.shape

(20640, 8)

In [6]:
df =pd.DataFrame(dados.data)
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25
7,3.12,52.0,4.797527,1.061824,1157.0,1.788253,37.84,-122.25
8,2.0804,42.0,4.294118,1.117647,1206.0,2.026891,37.84,-122.26
9,3.6912,52.0,4.970588,0.990196,1551.0,2.172269,37.84,-122.25


In [7]:
# importadandos os dados explicativos
df = pd.DataFrame(data=dados.data, columns=dados.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [8]:
#importando o targer e adiconando-o no data frame
df['Target'] = dados.target

In [9]:
dados.target

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [10]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


## Analise dos Dados

In [11]:
#!pip install sweetviz

In [12]:
"""
import sweetviz as sv

report = sv.analyze(df, target_feat='Target')

report.show_html("./report.html")
"""

'\nimport sweetviz as sv\n\nreport = sv.analyze(df, target_feat=\'Target\')\n\nreport.show_html("./report.html")\n'

## Pré-processamento dos dados

### Tratando Feartures de localização

In [13]:
#!pip install geopy

In [14]:
from geopy.geocoders import Nominatim

geolocalizacao = Nominatim(user_agent="geoapiExercises")

In [15]:
geolocalizacao.reverse("37.88"+", "+"-122.23").raw["address"]

{'leisure': 'Ecological Study Area',
 'road': 'Panoramic Way',
 'neighbourhood': 'Panoramic Hill',
 'city': 'Berkeley',
 'county': 'Alameda County',
 'state': 'California',
 'postcode': '94720-1076',
 'country': 'United States',
 'country_code': 'us'}

In [16]:
loc_update = {"County":[],
              "Road": []}


for indice, cord in enumerate(df.iloc[:1000,6:-1].values):
    Latitude = str(cord[0])
    Longitude = str(cord[1])
    localizacao = geolocalizacao.reverse(Latitude+","+Longitude).raw['address'] # raw retorna um dict
    
    if localizacao.get('county') is None:
        localizacao['county'] = None
    if localizacao.get('road') is None:
        localizacao['road'] = None
    
    loc_update['County'].append(localizacao['county'])
    loc_update['Road'].append(localizacao['road'])
  
    
    if indice%100 == 0:
        print(indice)

loc = pd.DataFrame(loc_update)
loc

0
100
200
300
400
500
600
700
800
900


Unnamed: 0,County,Road
0,Alameda County,Panoramic Way
1,Alameda County,Caldecott Lane
2,Alameda County,Grove Shafter Freeway
3,Alameda County,Florio Street
4,Alameda County,Florio Street
...,...,...
995,Alameda County,Redwood Road
996,Alameda County,North Livermore Avenue
997,Alameda County,Arthur H. Breed Junior Freeway
998,Alameda County,Wright Brothers Avenue


In [17]:
#Criando função para pegar as coordenadas
"""
def location(cord):
    Latitude = str(cord[0])
    Longitude = str(cord[1])
    
    localizacao = geolocalizacao.reverse(Latitude+","+Longitude).raw['address'] # raw retorna um dict
    
    if localizacao.get('county') is None:
        localizacao['county'] = None
    if localizacao.get('road') is None:
        localizacao['road'] = None
    
    loc_update['County'].append(localizacao['county'])
    loc_update['Road'].append(localizacao['road'])
"""

'\ndef location(cord):\n    Latitude = str(cord[0])\n    Longitude = str(cord[1])\n    \n    localizacao = geolocalizacao.reverse(Latitude+","+Longitude).raw[\'address\'] # raw retorna um dict\n    \n    if localizacao.get(\'county\') is None:\n        localizacao[\'county\'] = None\n    if localizacao.get(\'road\') is None:\n        localizacao[\'road\'] = None\n    \n    loc_update[\'County\'].append(localizacao[\'county\'])\n    loc_update[\'Road\'].append(localizacao[\'road\'])\n'

In [18]:
"""
import pickle
loc_update = {"County":[],
              "Road": []}

for indice, cord in enumerate(df.iloc[:,6:-1].values):
    location(cord)
    # lendo continuamente nossos dados e salvando-os!!
    pickle.dump(loc_update, open('loc_update.pickle', 'wb'))
    
    if indice%100 == 0:
        print(indice)
"""

'\nimport pickle\nloc_update = {"County":[],\n              "Road": []}\n\nfor indice, cord in enumerate(df.iloc[:,6:-1].values):\n    location(cord)\n    # lendo continuamente nossos dados e salvando-os!!\n    pickle.dump(loc_update, open(\'loc_update.pickle\', \'wb\'))\n    \n    if indice%100 == 0:\n        print(indice)\n'

### Importando o arquivo pickle

In [19]:
loc_update = pickle.load(open("loc_update.pickle", "rb"))

NameError: name 'pickle' is not defined

In [None]:
loc = pd.DataFrame(loc_update)

In [None]:
loc.head()