In [None]:
#Generic Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Data Set
from sklearn.datasets import fetch_california_housing

In [None]:
# Fetch the California housing dataset; the returned object exposes
# .data, .target, .feature_names and .DESCR, all of which are used below.
data = fetch_california_housing()

In [None]:
# Print the dataset's bundled description (attribute list, target definition, source).
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [None]:
# (rows, columns) of the feature matrix
data.data.shape

(20640, 8)

In [None]:
# Independent variables: one DataFrame column per dataset feature
df = pd.DataFrame(data.data, columns=data.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [None]:
# Dependent variable: add the dataset's target values to the frame as the 'Target' column.
# Note: this mutates df in place, so the earlier df.head() output no longer reflects its columns.

df['Target'] = data.target

In [None]:
# Preview the frame again to confirm the Target column is present.
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


# EDA

In [None]:
!pip install sweetviz

Collecting sweetviz
  Downloading sweetviz-2.3.1-py3-none-any.whl.metadata (24 kB)
Downloading sweetviz-2.3.1-py3-none-any.whl (15.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sweetviz
Successfully installed sweetviz-2.3.1


In [None]:
# Build an automated EDA report over the whole frame (features + Target)
# and write it to ./report.html.
import sweetviz as sv
report = sv.analyze(df)
report.show_html("./report.html")


                                             |          | [  0%]   00:00 -> (? left)

Report ./report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Data Pre-Processing

In [None]:
# Feature engineering: set up the Nominatim client used below to reverse-geocode
# latitude/longitude pairs into address components.
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent='geoapi Exercises')

In [None]:
# Sanity check: reverse-geocode the first row's coordinates and inspect the address dict.
geolocator.reverse("37.88 , -122.23").raw['address']

{'road': 'Convict Trail',
 'city': 'Oakland',
 'county': 'Alameda County',
 'state': 'California',
 'ISO3166-2-lvl4': 'US-CA',
 'postcode': '94720',
 'country': 'United States',
 'country_code': 'us'}

In [None]:
def location(cord, geocoder=None, store=None):
  """Reverse-geocode one coordinate pair and record its county and road.

  Args:
    cord: Indexable pair; cord[0] is the latitude and cord[1] the longitude.
    geocoder: Object with a ``reverse(query)`` method. Defaults to the
      notebook-level ``geolocator``.
    store: Dict with list values under "County" and "Road" that receive one
      entry per call. Defaults to the notebook-level ``loc_update``.

  Returns:
    None. The result is appended to ``store``; a component missing from the
    response (or a lookup that yields no result) is recorded as None.
  """
  # Fall back to the notebook globals so existing `location(cord)` calls keep working.
  if geocoder is None:
    geocoder = geolocator
  if store is None:
    store = loc_update

  latitude = str(cord[0])
  longitude = str(cord[1])

  result = geocoder.reverse(latitude + "," + longitude)

  # reverse() may return None when nothing is found; treat that as an empty address.
  address = result.raw.get('address', {}) if result is not None else {}

  # Nominatim's address keys are lowercase ('county', 'road'); looking up the
  # capitalised names never matches and would record None for every row.
  store['County'].append(address.get('county'))
  store['Road'].append(address.get('road'))
  # NOTE(review): the "Neighbourhood" list in loc_update is not populated here — confirm intent.


In [None]:
"""
loc_update = {"County":[],
              "Road":[],
              "Neighbourhood":[]}

for i, cord in enumerate(df.iloc[:,6:-1].values):

  location(cord)
  pickle.dump(loc_update, open('loc_update.pickle', 'wb'))

  if i%100 == 0:
    print(i)
"""

'\nloc_update = {"County":[], \n              "Road":[],\n              "Neighbourhood":[]}\n\nfor i, cord in enumerate(df.iloc[:,6:-1].values):\n\n  location(cord)\n  pickle.dump(loc_update, open(\'loc_update.pickle\', \'wb\'))\n\n  if i%100 == 0:\n    print(i)\n'

In [None]:
# Load the cached reverse-geocoding results (a dict of lists, not a model).
import pickle

# NOTE(security): pickle.load can execute arbitrary code — only load a file you created yourself.
# NOTE(review): the absolute /content path only exists on Colab — confirm before running elsewhere.
# The context manager closes the file handle, which the bare open() call leaked.
with open("/content/loc_update.pickle", "rb") as cache_file:
  loc_update = pickle.load(cache_file)

In [None]:
# Inspect the loaded cache.
loc_update

{'County': [None, None, None, None, None, None, None, None, None],
 'Road': [None, None, None, None, None, None, None, None, None],
 'Neighbourhood': []}

In [None]:
# to be continued