In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
from sklearn.datasets import fetch_california_housing

In [6]:
data=fetch_california_housing()

In [7]:
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [8]:
print(data['DESCR'])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [9]:
data['feature_names']

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [10]:
df=pd.DataFrame(data=data.data, columns=data.feature_names)

In [11]:
data.data.shape

(20640, 8)

In [12]:
df.head(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [13]:
df['Target']=data.target

In [14]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


# Exploratory Data Analysis

In [15]:
!pip install sweetviz

Collecting sweetviz
  Downloading sweetviz-2.1.3-py3-none-any.whl (15.1 MB)
Collecting importlib-resources>=1.2.0
  Downloading importlib_resources-5.4.0-py3-none-any.whl (28 kB)
Installing collected packages: importlib-resources, sweetviz
Successfully installed importlib-resources-5.4.0 sweetviz-2.1.3


In [16]:
import sweetviz as sv

In [17]:
report=sv.analyze(df)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=10.0), HTML(value='')), layout=Layout(dis…




In [18]:
report.show_html("./report.html")

Report ./report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [20]:
!pip install geopy

Collecting geopy
  Downloading geopy-2.2.0-py3-none-any.whl (118 kB)
Collecting geographiclib<2,>=1.49
  Downloading geographiclib-1.52-py3-none-any.whl (38 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.52 geopy-2.2.0


# Data Preprocessing

In [22]:
from geopy.geocoders import Nominatim

In [23]:
geolocaters=Nominatim(user_agent='geoapiExcersises')

In [26]:
geolocaters.reverse("37.88"+" , "+"-122.23")[0]

'Ecological Study Area, Centennial Drive, Oakland, Alameda County, California, 94720-1076, United States'

In [27]:
def location(cord):
    Latitude=str(cord[0])
    Longitude=str(cord[1])
    
    location=geolocator.reverse(Latitude+","+Longitude).raw['address']
    
    if location.get('road')is None:
        location['road']=None
    if location.get('county')is None:
        location['county']=None
    loc_update['County'].append(location['county'])
    loc_update['Road'].append(location['road'])

In [30]:
import pickle

In [28]:
loc_update={"County":[],
           "Road":[],
           "Neighbourhood":[]}
for i,cord in enumerate(df.iloc[:,6:-1].values):
    location(cord)
    pickle.dump(loc_update,open('loc_update.pickle','wb'))
    if i%100==0:
        print(i)

NameError: name 'geolocator' is not defined