In [1]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib import rcParams
import scipy.stats as sts
import os
from collections import Counter
import requests
import json


## 2012 US census data

In [3]:
# 2012 census data for all US zipcodes
#reading 2012 combined census data
path_comb_2012=os.path.join('output_census', 'census_comb_2012.csv')
usa_2012=pd.read_csv(path_comb_2012)

In [4]:
#looking for usa_2012 dataframe
usa_2012.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,City,County,Lat,Lng,Housing_units,State
0,2655,3846.0,54.6,73323.0,50951.0,5.460218,4.056162,443500.0,1971.0,2566.0,...,96.515861,0.780031,1.586063,0.0,Osterville,Barnstable County,41.63,-70.39,3053.0,MA
1,2657,2974.0,52.9,46031.0,45142.0,14.122394,8.675185,467100.0,1945.0,2118.0,...,89.845326,5.749832,2.958978,0.63887,Provincetown,Barnstable County,42.05,-70.18,4494.0,MA
2,2659,741.0,61.0,51466.0,36133.0,5.668016,4.183536,469800.0,1972.0,2082.0,...,95.546559,2.564103,2.564103,0.0,South Chatham,Barnstable County,41.68,-70.02,1297.0,MA
3,2660,5881.0,51.3,48617.0,28784.0,13.977215,2.941677,342800.0,1976.0,1685.0,...,89.899677,6.86958,0.850196,0.170039,South Dennis,Barnstable County,41.71,-70.15,4551.0,MA
4,2663,96.0,34.7,21667.0,18307.0,0.0,16.666667,1000001.0,1959.0,-666666666.0,...,100.0,0.0,0.0,0.0,South Wellfleet,Barnstable County,41.915,-70.0267,412.0,MA


In [5]:
#columns of 2012  dataframe
usa_2012.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language Rate',
       'Spanish Language Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate', 'City', 'County', 'Lat', 'Lng',
       'Housing_units', 'State'],
      dtype='object')

In [6]:
#shape of the dataframe
usa_2012.shape

(33120, 30)

In [7]:
#removing the rows with house value less than 1
usa_2012=(usa_2012[(usa_2012['House Value']>0)&
                (usa_2012['Household Income']>0)&
                (usa_2012['Monthly Owner Cost']>0)])

In [8]:
#looking for null value
usa_2012.isna().sum()

Zipcode                         0
Population                      0
Median Age                      0
Household Income                0
Per Capita Income               3
Poverty Rate                    0
Unemployment Rate               0
House Value                     0
House Construction Year         0
Monthly Owner Cost              0
Monthly Rent                    0
Public Transport Rate           5
Personal Transport Rate         5
Commute Time Public         21793
Commute Time Car            21793
High School Rate                0
College Rate                    0
Uneducated Rate                 0
English Language Rate           0
Spanish Language Rate           0
White Population Rate           0
Black Population Rate           0
Hispanic Population Rate        0
Asian Population Rate           0
City                            0
County                          1
Lat                            16
Lng                            16
Housing_units                  16
State         

In [9]:
#removing the rows with lat and lng missing value
usa_2012.dropna(subset=['Lat', 'Lng','Housing_units','County','Public Transport Rate','Personal Transport Rate'], inplace=True)

In [10]:
#shape of the clean dataframe
usa_2012.shape

(30763, 30)

In [11]:
#info of the dataframe
usa_2012.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30763 entries, 0 to 33119
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Zipcode                   30763 non-null  int64  
 1   Population                30763 non-null  float64
 2   Median Age                30763 non-null  float64
 3   Household Income          30763 non-null  float64
 4   Per Capita Income         30760 non-null  float64
 5   Poverty Rate              30763 non-null  float64
 6   Unemployment Rate         30763 non-null  float64
 7   House Value               30763 non-null  float64
 8   House Construction Year   30763 non-null  float64
 9   Monthly Owner Cost        30763 non-null  float64
 10  Monthly Rent              30763 non-null  float64
 11  Public Transport Rate     30763 non-null  float64
 12  Personal Transport Rate   30763 non-null  float64
 13  Commute Time Public       8981 non-null   float64
 14  Commut

In [12]:
#describing the dataframe
usa_2012.describe()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,Uneducated Rate,English Language Rate,Spanish Language Rate,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,Lat,Lng,Housing_units
count,30763.0,30763.0,30763.0,30763.0,30760.0,30763.0,30763.0,30763.0,30763.0,30763.0,...,30763.0,30763.0,30763.0,30763.0,30763.0,30763.0,30763.0,30763.0,30763.0,30763.0
mean,49374.979456,10109.399473,41.158538,52169.760296,26077.669863,14.223829,4.210818,174057.6,-496465.0,1367.633911,...,0.676337,83.798281,6.225756,84.360171,7.59512,8.640487,1.958591,38.863919,-90.790806,4316.204109
std,27361.004439,14067.145097,7.567818,22035.691145,11341.588262,10.067415,2.734409,143731.7,18222340.0,605.295337,...,0.988541,15.818083,12.560562,20.294066,15.841998,16.050894,5.099756,5.267132,14.868471,5682.205545
min,602.0,13.0,10.8,2499.0,2455.0,0.0,0.0,9999.0,-666666700.0,181.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.96,-171.69,11.0
25%,27011.5,950.0,36.4,38399.0,19492.0,7.019203,2.544529,86000.0,1960.0,964.0,...,0.0,81.522698,0.441464,79.056719,0.0,0.54257,0.0,35.47,-96.925,486.0
50%,49664.0,3333.0,40.8,47679.0,23671.5,12.166489,3.897764,127500.0,1973.0,1188.0,...,0.410003,89.184995,1.743615,92.901235,0.918079,2.629137,0.287356,39.6,-88.13,1587.0
75%,71355.5,14360.5,45.4,60519.0,29468.75,18.969324,5.479452,205700.0,1981.0,1583.0,...,0.902935,92.617816,5.520981,97.679402,6.33343,8.308888,1.58307,42.18,-80.2,6269.5
max,99929.0,115538.0,85.3,250001.0,171604.0,87.959866,50.49505,1000001.0,2006.0,4001.0,...,23.287671,100.0,100.0,100.0,100.0,100.0,70.586701,71.0,-65.28,47617.0


In [13]:
usa_2012.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language Rate',
       'Spanish Language Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate', 'City', 'County', 'Lat', 'Lng',
       'Housing_units', 'State'],
      dtype='object')

In [14]:
usa_2012.drop(['Commute Time Car','Commute Time Public'], axis=1, inplace=True)

In [None]:
usa_2012.shape

In [16]:
#selecting only required columns
usa_2012_redu=usa_2012[['Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
        'High School Rate','English Language Rate','Spanish Language Rate',
       'College Rate', 'Uneducated Rate', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 
       'Lat', 'Lng', 'Housing_units']]

In [18]:
#saving csv file for machine learning
usa_2012_redu.to_csv('output_census/usa_2012_ml.csv', index=False)

## Looking for 2012 California census data

In [None]:
#reading 2012 combined california census data
path_ca_2012=os.path.join('output_census', 'census_ca_2012.csv')
ca_2012=pd.read_csv(path_ca_2012)

In [None]:
#looking for ca_2012 dataframe
ca_2012.head()

In [None]:
#columns of 2012  dataframe
ca_2012.columns

In [None]:
#shape of the dataframe
ca_2012.shape

In [None]:
#removing the rows with house value less than 1
#removing the rows with house value less than 1
ca_2012=(ca_2012[(ca_2012['House Value']>0)&
                (ca_2012['Household Income']>0)&
                (ca_2012['Monthly Owner Cost']>0)])

In [None]:
#looking for null value
ca_2012.isna().sum()

In [None]:
#removing the rows with lat and lng missing value
ca_2012.dropna(subset=['Public Transport Rate',
                        'Personal Transport Rate'], inplace=True)

In [None]:
#shape of the clean dataframe
ca_2012.shape

In [None]:
#converting minutes to hour

ca_2012['Commute Time Public']=ca_2012['Commute Time Public']/60
ca_2012['Commute Time Car']=ca_2012['Commute Time Car']/60

In [None]:
#converting to int
ca_2012['Population']=ca_2012['Population'].astype(int)

In [None]:
#info of the dataframe
ca_2012.info()

In [None]:
#describing the dataframe
ca_2012.describe()

In [None]:
#selecting only important columns
ca_12=ca_2012[['City','Population', 'House Value','Household Income',
       'Poverty Rate', 'Unemployment Rate','Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Uneducated Rate' ]]

In [None]:
#only selecting rows with +ve owner cost and rent
ca_12=(ca_12[(ca_12['Monthly Owner Cost']>0)&
             (ca_12['Monthly Rent']>0)])

In [None]:
ca_12.shape

In [None]:
#formating columns with 2 decimal place
ca_12[['House Value','Household Income',
       'Poverty Rate', 'Unemployment Rate','Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Commute Time Public', 'Commute Time Car', 
       'Uneducated Rate' ]]=(ca_12[[ 'House Value','Household Income',
                                   'Poverty Rate', 'Unemployment Rate',
                                     'Monthly Owner Cost', 'Monthly Rent',
                                    'Public Transport Rate', 'Commute Time Public',
                                     'Commute Time Car', 'Uneducated Rate']]
                                                   .applymap('{:.2f}'.format))

In [None]:
#using groupby method to groupby the data accordint to city
ca_12_city=ca_12.groupby('City').agg({'Population':'sum',
                              'House Value':'mean',
                               'Household Income':'mean',
                               'Poverty Rate':'mean',
                               'Unemployment Rate':'mean',
                               'Monthly Owner Cost':'mean',
                               'Monthly Rent':'mean',
                               'Public Transport Rate':'mean',
                               'Uneducated Rate':'mean'}).reset_index()

In [None]:
#sorting data according to the population
ca_12_city=ca_12_city.sort_values(by=['House Value',
                                'Monthly Owner Cost','Household Income',
                                'Poverty Rate','Unemployment Rate',
                                'Public Transport Rate','Monthly Rent', 
                                'Uneducated Rate' ], ascending=False)

In [None]:
#printing the new sorted datasets
ca_12_city.head()

In [None]:
#city with higher house value
ca_12_city.nlargest(10, 'House Value')

In [None]:
#City with smallest poverty rate
ca_12_city.nsmallest(10, 'Poverty Rate')

In [None]:
#city with smallest unemployment rate
ca_12_city.nsmallest(10, 'Unemployment Rate')

In [None]:
#city with larger commute time
#ca_12_city.nsmallest(10, 'Commute Time Car')

In [None]:
#city with smallest monthly owner cost
ca_12_city.nsmallest(10, 'Monthly Owner Cost')

In [None]:
#city with largest household income
ca_12_city.nlargest(10, 'Household Income')

## Plotting 2012 california data

In [None]:
#plotting all correlation using seaborn heatmap
corr_12=ca_12.corr()
import seaborn as sns
fig, ax=plt.subplots(figsize=(15,15))
sns.heatmap(corr_12,vmin=-1, vmax=1, ax=ax, cmap='BrBG')#annot=True,
plt.show()

In [None]:
#plotting scatter plot of lat and lng
fig, ax=plt.subplots(figsize=(15,15))
ca_2012.plot(kind='scatter', x='Lng', y='Lat', alpha=0.4, 
                s=ca_2012['Population']/100,label='Population',
                 c='House Value', cmap=plt.get_cmap('rainbow'),
                colorbar=True, ax=ax)

plt.show()

In [None]:
#plotting scatter plot of lat and lng
fig, ax=plt.subplots(figsize=(15,15))
ca_2012.plot(kind='scatter', x='Lng', y='Lat', alpha=0.3, 
                s=ca_2012['Population']/100,label='Population',
                 c='Household Income', cmap=plt.get_cmap('hsv'),
                colorbar=True, ax=ax)

plt.show()

## 2014 US census data

In [44]:
# 2014 census data for all US zipcodes
#reading 2014 combined census data
path_comb_2014=os.path.join('output_census', 'census_comb_2014.csv')
usa_2014=pd.read_csv(path_comb_2014)

In [45]:
#looking for usa_2014 dataframe
usa_2014.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,City,County,Lat,Lng,Housing_units,State
0,601,18088.0,37.1,10833.0,7229.0,60.32176,11.012826,105400.0,1982.0,722.0,...,96.240602,0.967492,99.806502,0.0,Adjuntas,Adjuntas Municipio,,,,PR
1,602,40859.0,39.0,16353.0,9048.0,53.168213,10.639027,91200.0,1980.0,843.0,...,58.540346,2.684843,93.203456,0.183558,Aguada,Aguada Municipio,18.36,-67.18,18073.0,PR
2,603,53162.0,39.2,16323.0,9888.0,48.957902,7.623867,128700.0,1977.0,841.0,...,73.001392,3.598435,96.275535,1.183176,Aguadilla,Aguadilla Municipio,18.45,-67.11,25653.0,PR
3,606,6415.0,39.2,14138.0,6385.0,58.893219,2.681216,105800.0,1977.0,569.0,...,87.622759,2.400624,99.890881,0.0,Maricao,Maricao Municipio,18.2,-66.9,2877.0,PR
4,610,28805.0,39.7,17265.0,8197.0,49.119944,4.481861,113700.0,1979.0,752.0,...,67.127235,3.478563,99.12168,0.090262,Anasco,Aasco Municipio,18.28,-67.13,12618.0,PR


In [46]:
#columns of 2017  dataframe
usa_2014.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language Rate',
       'Spanish Language Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate', 'City', 'County', 'Lat', 'Lng',
       'Housing_units', 'State'],
      dtype='object')

In [47]:
#shape of the dataframe
usa_2014.shape

(33120, 30)

In [48]:
#removing the rows with house value less than 1
usa_2014=(usa_2014[(usa_2014['House Value']>0)&
                (usa_2014['Household Income']>0)&
                (usa_2014['Monthly Owner Cost']>0)])

In [49]:
#looking for null value
usa_2014.isna().sum()

Zipcode                         0
Population                      0
Median Age                      0
Household Income                0
Per Capita Income               0
Poverty Rate                    0
Unemployment Rate               0
House Value                     0
House Construction Year         0
Monthly Owner Cost              0
Monthly Rent                    0
Public Transport Rate           0
Personal Transport Rate         0
Commute Time Public         21504
Commute Time Car            21504
High School Rate                0
College Rate                    0
Uneducated Rate                 0
English Language Rate           0
Spanish Language Rate           0
White Population Rate           0
Black Population Rate           0
Hispanic Population Rate        0
Asian Population Rate           0
City                            0
County                          1
Lat                            16
Lng                            16
Housing_units                  16
State         

In [50]:
#removing the rows with lat and lng missing value
usa_2014.dropna(subset=['Lat', 'Lng','Housing_units','County'], inplace=True)

In [52]:
#shape of the clean dataframe
usa_2014.shape

(30279, 30)

In [53]:
#info of the dataframe
usa_2014.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30279 entries, 1 to 33119
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Zipcode                   30279 non-null  int64  
 1   Population                30279 non-null  float64
 2   Median Age                30279 non-null  float64
 3   Household Income          30279 non-null  float64
 4   Per Capita Income         30279 non-null  float64
 5   Poverty Rate              30279 non-null  float64
 6   Unemployment Rate         30279 non-null  float64
 7   House Value               30279 non-null  float64
 8   House Construction Year   30279 non-null  float64
 9   Monthly Owner Cost        30279 non-null  float64
 10  Monthly Rent              30279 non-null  float64
 11  Public Transport Rate     30279 non-null  float64
 12  Personal Transport Rate   30279 non-null  float64
 13  Commute Time Public       8785 non-null   float64
 14  Commut

In [54]:
#describing the dataframe
usa_2014.describe()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,Uneducated Rate,English Language Rate,Spanish Language Rate,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,Lat,Lng,Housing_units
count,30279.0,30279.0,30279.0,30279.0,30279.0,30279.0,30279.0,30279.0,30279.0,30279.0,...,30279.0,30279.0,30279.0,30279.0,30279.0,30279.0,30279.0,30279.0,30279.0,30279.0
mean,49334.417748,10426.459229,41.611344,53063.861158,26804.471746,14.742626,4.140698,171188.9,-372326.2,1355.15846,...,0.700768,83.763007,6.309543,84.036088,7.655989,8.965161,2.062962,38.866127,-90.74707,4382.167971
std,27325.345993,14417.289993,7.563017,22132.645093,11631.510169,9.978317,2.629677,139765.7,15792430.0,579.772253,...,0.945483,15.84425,12.569965,20.246507,15.707045,16.110106,5.248675,5.262738,14.832344,5702.991737
min,602.0,21.0,11.0,2499.0,2054.0,0.0,0.0,9999.0,-666666700.0,191.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.96,-171.69,15.0
25%,27117.0,1007.0,36.7,39080.5,20108.5,7.579045,2.514045,86800.0,1962.0,975.0,...,0.032589,81.456571,0.460736,78.58597,0.0,0.746153,0.0,35.46,-96.9,515.0
50%,49646.0,3488.0,41.3,48676.0,24409.0,12.689394,3.824522,127000.0,1974.0,1183.0,...,0.446429,89.290225,1.758004,92.499753,1.04639,2.916228,0.34188,39.6,-88.1,1647.0
75%,71249.0,14872.0,45.9,61524.5,30302.0,19.659492,5.33429,199600.0,1983.0,1551.0,...,0.955696,92.734983,5.691485,97.432878,6.554124,8.796821,1.699295,42.18,-80.2,6398.5
max,99929.0,115013.0,84.4,250001.0,392835.0,88.498403,52.678571,1000001.0,2011.0,4001.0,...,33.108108,100.0,94.440094,100.0,100.0,100.0,76.744186,71.0,-65.28,47617.0


In [55]:
usa_2014.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language Rate',
       'Spanish Language Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate', 'City', 'County', 'Lat', 'Lng',
       'Housing_units', 'State'],
      dtype='object')

In [57]:
usa_2014.drop(['Commute Time Car','Commute Time Public'], axis=1, inplace=True)

In [58]:
usa_2014.shape

(30279, 28)

In [59]:
#selecting only required columns
usa_2014_redu=usa_2014[['Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
        'High School Rate','English Language Rate','Spanish Language Rate',
       'College Rate', 'Uneducated Rate', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 
       'Lat', 'Lng', 'Housing_units']]

In [60]:
#saving csv file for machine learning
usa_2014_redu.to_csv('output_census/usa_2014_ml.csv', index=False)

## 2014 California census data

In [None]:
#reading 2014 combined california census data
path_ca_2014=os.path.join('output_census', 'census_ca_2014.csv')
ca_2014=pd.read_csv(path_ca_2014)

In [None]:
#looking for ca_2014 dataframe
ca_2014.head()

In [None]:
#columns of 2014  dataframe
ca_2014.columns

In [None]:
#shape of the dataframe
ca_2014.shape

In [None]:
#removing the rows with house value less than 1
ca_2014=ca_2014[ca_2014['House Value']>0]

In [None]:
#looking for null value
ca_2014.isna().sum()

In [None]:
#shape of the clean dataframe
ca_2014.shape

In [None]:
#info of the dataframe
ca_2014.info()

In [None]:
#describing the dataframe
ca_2014.describe()

## 2015 US census data

In [61]:
# 2015 census data for all US zipcodes
#reading 2015 combined census data
path_comb_2015=os.path.join('output_census', 'census_comb_2015.csv')
usa_2015=pd.read_csv(path_comb_2015)

In [62]:
#looking for usa_2015 dataframe
usa_2015.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,City,County,Lat,Lng,Housing_units,State
0,12810,724.0,46.5,57500.0,25551.0,10.773481,2.762431,116200.0,1975.0,1054.0,...,99.033149,0.0,1.933702,0.276243,Athol,Warren County,43.48,-73.88,363.0,NY
1,12811,67.0,45.9,-666666666.0,11590.0,0.0,29.850746,-666666666.0,1976.0,-666666666.0,...,100.0,0.0,0.0,0.0,Bakers Mills,Warren County,43.622,-74.035,65.0,NY
2,12812,58.0,64.1,49583.0,23600.0,0.0,0.0,122500.0,1950.0,,...,100.0,0.0,0.0,0.0,Blue Mountain Lake,Hamilton County,43.9,-74.3,324.0,NY
3,12814,1282.0,46.4,58176.0,35508.0,3.978159,0.936037,317100.0,1977.0,1670.0,...,89.859594,2.808112,2.4961,1.092044,Bolton Landing,Warren County,43.6,-73.6,1822.0,NY
4,12815,1103.0,54.2,60458.0,30685.0,14.415231,2.629193,227300.0,1976.0,1149.0,...,91.296464,1.359927,0.543971,0.0,Brant Lake,Warren County,43.69,-73.71,1112.0,NY


In [63]:
#columns of 2017  dataframe
usa_2015.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language Rate',
       'Spanish Language Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate', 'City', 'County', 'Lat', 'Lng',
       'Housing_units', 'State'],
      dtype='object')

In [64]:
#shape of the dataframe
usa_2015.shape

(33120, 30)

In [65]:
#removing the rows with house value less than 1
usa_2015=(usa_2015[(usa_2015['House Value']>0)&
                (usa_2015['Household Income']>0)&
                (usa_2015['Monthly Owner Cost']>0)])

In [66]:
#looking for null value
usa_2015.isna().sum()

Zipcode                         0
Population                      0
Median Age                      0
Household Income                0
Per Capita Income               0
Poverty Rate                    0
Unemployment Rate               0
House Value                     0
House Construction Year         0
Monthly Owner Cost              0
Monthly Rent                  384
Public Transport Rate           0
Personal Transport Rate         0
Commute Time Public         20378
Commute Time Car            20378
High School Rate                0
College Rate                    0
Uneducated Rate                 0
English Language Rate           0
Spanish Language Rate           0
White Population Rate           0
Black Population Rate           0
Hispanic Population Rate        0
Asian Population Rate           0
City                            0
County                          1
Lat                            16
Lng                            16
Housing_units                  16
State         

In [68]:
#removing the rows with lat and lng missing value
usa_2015.dropna(subset=['Lat', 'Lng','Housing_units','County','Monthly Rent'], inplace=True)

In [69]:
#shape of the clean dataframe
usa_2015.shape

(28536, 30)

In [70]:
#info of the dataframe
usa_2015.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28536 entries, 0 to 32153
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Zipcode                   28536 non-null  int64  
 1   Population                28536 non-null  float64
 2   Median Age                28536 non-null  float64
 3   Household Income          28536 non-null  float64
 4   Per Capita Income         28536 non-null  float64
 5   Poverty Rate              28536 non-null  float64
 6   Unemployment Rate         28536 non-null  float64
 7   House Value               28536 non-null  float64
 8   House Construction Year   28536 non-null  float64
 9   Monthly Owner Cost        28536 non-null  float64
 10  Monthly Rent              28536 non-null  float64
 11  Public Transport Rate     28536 non-null  float64
 12  Personal Transport Rate   28536 non-null  float64
 13  Commute Time Public       8494 non-null   float64
 14  Commut

In [71]:
#describing the dataframe
usa_2015.describe()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,Uneducated Rate,English Language Rate,Spanish Language Rate,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,Lat,Lng,Housing_units
count,28536.0,28536.0,28536.0,28536.0,28536.0,28536.0,28536.0,28536.0,28536.0,28536.0,...,28536.0,28536.0,28536.0,28536.0,28536.0,28536.0,28536.0,28536.0,28536.0,28536.0
mean,49155.317494,11097.450308,41.628627,53875.452096,27251.652439,14.536845,3.746196,176455.6,-278376.3,1343.261705,...,0.713976,83.494467,6.501781,83.643806,7.826074,9.308119,2.194418,38.853435,-90.61702,4625.416982
std,27298.825244,14790.178461,7.341023,22206.816349,11612.344819,9.534711,2.275871,153261.9,13668480.0,566.568303,...,0.929513,15.948903,12.716065,20.236977,15.720425,16.259979,5.427007,5.272376,14.771781,5781.116505
min,602.0,15.0,13.9,6278.0,3049.0,0.0,0.0,9999.0,-666666700.0,350.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.96,-171.69,13.0
25%,27241.25,1206.0,36.8,39874.25,20484.75,7.630184,2.299091,88900.0,1962.0,974.0,...,0.1221,81.160116,0.550327,77.896722,0.0,0.971741,0.0,35.4,-96.7,609.0
50%,49456.5,4078.5,41.4,49494.0,24809.0,12.620713,3.448276,130800.0,1975.0,1172.0,...,0.479007,89.208815,1.888473,92.05298,1.191837,3.227883,0.419633,39.6845,-88.03,1878.0
75%,70818.25,16190.0,45.9,62179.0,30752.0,19.320275,4.800213,202800.0,1983.0,1526.0,...,0.97089,92.641098,5.870331,97.149897,6.935731,9.219832,1.86036,42.2,-80.17,6925.25
max,99929.0,114982.0,82.0,250001.0,285783.0,86.538462,33.974359,2000001.0,2010.0,4001.0,...,44.700461,100.0,100.0,100.0,100.0,100.0,86.635945,71.0,-65.28,47617.0


In [72]:
usa_2015.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language Rate',
       'Spanish Language Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate', 'City', 'County', 'Lat', 'Lng',
       'Housing_units', 'State'],
      dtype='object')

In [73]:
usa_2015.drop(['Commute Time Car','Commute Time Public'], axis=1, inplace=True)

In [74]:
usa_2015.shape

(28536, 28)

In [75]:
#selecting only required columns
usa_2015_redu=usa_2015[['Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
        'High School Rate','English Language Rate','Spanish Language Rate',
       'College Rate', 'Uneducated Rate', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 
       'Lat', 'Lng', 'Housing_units']]

In [76]:
#saving csv file for machine learning
usa_2015_redu.to_csv('output_census/usa_2015_ml.csv', index=False)

## 2015 California census data

In [None]:
#reading 2015 combined california census data
path_ca_2015=os.path.join('output_census', 'census_ca_2015.csv')
ca_2015=pd.read_csv(path_ca_2015)

In [None]:
#looking for ca_2015 dataframe
ca_2015.head()

In [None]:
#columns of 2015  dataframe
ca_2015.columns

In [None]:
#shape of the dataframe
ca_2015.shape

In [None]:
#removing the rows with house value less than 1
ca_2015=ca_2015[ca_2015['House Value']>0]

In [None]:
#looking for null value
ca_2015.isna().sum()

In [None]:
#removing the rows with lat and lng missing value
ca_2015.dropna(subset=['Household Income', 'Monthly Rent', 
                        'Monthly Owner Cost','Personal Transport Rate',
                      'Public Transport Rate'],inplace=True)

In [None]:
#shape of the clean dataframe
ca_2015.shape

In [None]:
#info of the dataframe
ca_2015.info()

In [None]:
#describing the dataframe
ca_2015.describe()

## 2017 US census data

In [29]:
# 2017 census data for all US zipcodes
#reading 2017 combined census data
path_comb_2017=os.path.join('output_census', 'census_comb_2017.csv')
usa_2017=pd.read_csv(path_comb_2017)

In [30]:
#looking for usa_2017 dataframe
usa_2017.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,City,County,Lat,Lng,Housing_units,State
0,601,17599.0,38.9,11757.0,7041.0,64.105915,13.943974,82500.0,1981.0,748.0,...,77.765782,0.681857,99.624979,0.0,Adjuntas,Adjuntas Municipio,,,,PR
1,602,39209.0,40.9,16190.0,8978.0,52.100283,6.473004,87300.0,1979.0,846.0,...,66.854549,2.785075,93.692775,0.0,Aguada,Aguada Municipio,18.36,-67.18,18073.0,PR
2,603,50135.0,40.4,16645.0,10897.0,50.216416,7.156677,122300.0,1977.0,867.0,...,71.225691,3.95931,97.46684,1.111,Aguadilla,Aguadilla Municipio,18.45,-67.11,25653.0,PR
3,606,6304.0,42.8,13387.0,5960.0,64.911168,3.236041,92700.0,1979.0,538.0,...,48.302665,2.538071,99.809645,0.0,Maricao,Maricao Municipio,18.2,-66.9,2877.0,PR
4,610,27590.0,41.4,18741.0,9266.0,45.498369,5.342515,90300.0,1979.0,733.0,...,61.754259,3.062704,97.317869,0.0,Anasco,Aasco Municipio,18.28,-67.13,12618.0,PR


In [31]:
#columns of 2017  dataframe
usa_2017.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language',
       'Spanish Language', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 'County',
       'Lat', 'Lng', 'Housing_units', 'State'],
      dtype='object')

In [32]:
#shape of the dataframe
usa_2017.shape

(33120, 30)

In [33]:
#removing the rows with house value less than 1
usa_2017=(usa_2017[(usa_2017['House Value']>0)&
                (usa_2017['Household Income']>0)&
                (usa_2017['Monthly Owner Cost']>0)])

In [34]:
#looking for null value
usa_2017.isna().sum()

Zipcode                         0
Population                      0
Median Age                      0
Household Income                0
Per Capita Income               0
Poverty Rate                    0
Unemployment Rate               0
House Value                     0
House Construction Year         0
Monthly Owner Cost              0
Monthly Rent                    0
Public Transport Rate           0
Personal Transport Rate         0
Commute Time Public         20602
Commute Time Car            20602
High School Rate                0
College Rate                    0
Uneducated Rate                 0
English Language            29110
Spanish Language            29110
White Population Rate           0
Black Population Rate           0
Hispanic Population Rate        0
Asian Population Rate           0
City                            0
County                          1
Lat                            16
Lng                            16
Housing_units                  16
State         

In [35]:
#removing the rows with lat and lng missing value
usa_2017.dropna(subset=['Lat', 'Lng','Housing_units','County'], inplace=True)

In [36]:
#shape of the clean dataframe
usa_2017.shape

(29093, 30)

In [37]:
#info of the dataframe
usa_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29093 entries, 1 to 33119
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Zipcode                   29093 non-null  int64  
 1   Population                29093 non-null  float64
 2   Median Age                29093 non-null  float64
 3   Household Income          29093 non-null  float64
 4   Per Capita Income         29093 non-null  float64
 5   Poverty Rate              29093 non-null  float64
 6   Unemployment Rate         29093 non-null  float64
 7   House Value               29093 non-null  float64
 8   House Construction Year   29093 non-null  float64
 9   Monthly Owner Cost        29093 non-null  float64
 10  Monthly Rent              29093 non-null  float64
 11  Public Transport Rate     29093 non-null  float64
 12  Personal Transport Rate   29093 non-null  float64
 13  Commute Time Public       8502 non-null   float64
 14  Commut

In [38]:
#describing the dataframe
usa_2017.describe()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,Uneducated Rate,English Language,Spanish Language,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,Lat,Lng,Housing_units
count,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0,...,29093.0,0.0,0.0,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0
mean,49133.22734,11063.88987,42.144512,57222.865878,29194.840099,13.905609,3.000631,187936.3,-318838.4,1364.261437,...,0.719213,,,83.446066,7.798114,9.510061,2.253109,38.856601,-90.622085,4549.981301
std,27367.633916,15010.032395,7.66211,23747.660188,12563.431705,9.327151,2.045864,167848.9,14621190.0,577.959805,...,0.92401,,,20.344314,15.635153,16.335293,5.524311,5.257513,14.798852,5755.615274
min,602.0,20.0,15.4,5902.0,3534.0,0.0,0.0,13900.0,-666666700.0,331.0,...,0.0,,,0.0,0.0,0.0,0.0,17.96,-171.69,16.0
25%,27013.0,1149.0,37.1,41932.0,21830.0,7.172161,1.764706,93300.0,1963.0,987.0,...,0.096012,,,77.47858,0.0,1.025783,0.0,35.4,-96.71,585.0
50%,49412.0,3871.0,41.8,52188.0,26535.0,11.917423,2.713026,139000.0,1975.0,1188.0,...,0.472888,,,91.905444,1.198963,3.387892,0.44069,39.67,-88.0,1797.0
75%,71044.0,16038.0,46.5,66181.0,32940.0,18.479779,3.837719,216400.0,1984.0,1543.0,...,0.97116,,,97.059294,6.948303,9.551657,1.916236,42.2,-80.14,6748.0
max,99929.0,119204.0,83.1,250001.0,298129.0,81.612713,43.396226,2000001.0,2012.0,4001.0,...,25.46729,,,100.0,100.0,100.0,74.248386,71.0,-65.28,47617.0


In [39]:
usa_2017.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language',
       'Spanish Language', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 'County',
       'Lat', 'Lng', 'Housing_units', 'State'],
      dtype='object')

In [40]:
usa_2017.drop(['Commute Time Car','Commute Time Public',
               'English Language','Spanish Language'], axis=1, inplace=True)

In [41]:
usa_2017.shape

(29093, 26)

In [42]:
#selecting only required columns
usa_2017_redu=usa_2017[['Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
        'High School Rate',
       'College Rate', 'Uneducated Rate', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 
       'Lat', 'Lng', 'Housing_units']]

In [43]:
#saving csv file for machine learning
#usa_2017_redu.to_csv('output_census/usa_2017_ml.csv', index=False)

## 2017 California census data

In [19]:
#reading 2017 combined california census data
path_ca_2017=os.path.join('output_census', 'census_ca_2017.csv')
ca_2017=pd.read_csv(path_ca_2017)

In [20]:
#looking for usa_2017 dataframe
ca_2017.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,City,County,Lat,Lng,Housing_units,State
0,90001,58738.0,28.7,35660.0,11882.0,31.757976,4.974633,305500.0,1949.0,1695.0,...,32.367462,9.164425,89.841329,0.224727,Los Angeles,Los Angeles County,33.97,-118.25,13788.0,CA
1,90002,52856.0,27.7,34000.0,11807.0,33.666944,5.032541,272400.0,1951.0,1678.0,...,37.683517,21.569926,76.420842,0.565688,Los Angeles,Los Angeles County,33.95,-118.25,12598.0,CA
2,90003,70490.0,28.2,34397.0,11305.0,32.929494,5.440488,300700.0,1951.0,1688.0,...,29.767343,22.383317,76.766917,0.285147,Los Angeles,Los Angeles County,33.96,-118.27,17127.0,CA
3,90004,62733.0,35.1,46581.0,30590.0,19.104777,4.182806,918500.0,1943.0,3467.0,...,37.758437,4.055282,51.350964,25.074522,Los Angeles,Los Angeles County,34.08,-118.31,24278.0,CA
4,90005,39562.0,35.4,32461.0,21566.0,28.562762,5.156463,713400.0,1949.0,3067.0,...,20.33517,6.228199,49.519741,34.823821,Los Angeles,Los Angeles County,34.06,-118.31,16345.0,CA


In [21]:
#columns of 2017  dataframe
ca_2017.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language',
       'Spanish Language', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 'County',
       'Lat', 'Lng', 'Housing_units', 'State'],
      dtype='object')

In [22]:
#shape of the dataframe
ca_2017.shape

(1763, 30)

In [23]:
#removing the rows with house value less than 1
ca_2017=ca_2017[ca_2017['House Value']>0]

In [24]:
#looking for null value
ca_2017.isna().sum()

Zipcode                        0
Population                     0
Median Age                     0
Household Income               0
Per Capita Income              0
Poverty Rate                   0
Unemployment Rate              0
House Value                    0
House Construction Year        0
Monthly Owner Cost             0
Monthly Rent                   0
Public Transport Rate          0
Personal Transport Rate        0
Commute Time Public          950
Commute Time Car             950
High School Rate               0
College Rate                   0
Uneducated Rate                0
English Language            1603
Spanish Language            1603
White Population Rate          0
Black Population Rate          0
Hispanic Population Rate       0
Asian Population Rate          0
City                           0
County                         0
Lat                            0
Lng                            0
Housing_units                  0
State                          0
dtype: int

In [25]:
#removing the rows with lat and lng missing value


In [26]:
#shape of the clean dataframe
ca_2017.shape

(1603, 30)

In [27]:
#info of the dataframe
ca_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1603 entries, 0 to 1762
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Zipcode                   1603 non-null   int64  
 1   Population                1603 non-null   float64
 2   Median Age                1603 non-null   float64
 3   Household Income          1603 non-null   float64
 4   Per Capita Income         1603 non-null   float64
 5   Poverty Rate              1603 non-null   float64
 6   Unemployment Rate         1603 non-null   float64
 7   House Value               1603 non-null   float64
 8   House Construction Year   1603 non-null   float64
 9   Monthly Owner Cost        1603 non-null   float64
 10  Monthly Rent              1603 non-null   float64
 11  Public Transport Rate     1603 non-null   float64
 12  Personal Transport Rate   1603 non-null   float64
 13  Commute Time Public       653 non-null    float64
 14  Commute 

In [28]:
#describing the dataframe
ca_2017.describe()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,Uneducated Rate,English Language,Spanish Language,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,Lat,Lng,Housing_units
count,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,...,1603.0,0.0,0.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0
mean,93615.858391,24255.411104,40.699439,-11576610.0,34704.905178,15.069615,3.758846,480489.8,-829802.5,-22455800.0,...,1.380712,,,69.891782,4.140529,30.870654,9.906122,36.347634,-119.908611,8513.565814
std,1815.079634,22409.72975,9.258523,87372600.0,19231.863089,9.774355,2.091052,344943.7,23540880.0,120319100.0,...,1.865957,,,20.034087,6.839564,24.580061,13.176867,2.417502,2.122225,7221.991293
min,90001.0,21.0,16.5,-666666700.0,4804.0,0.0,0.0,39000.0,-666666700.0,-666666700.0,...,0.0,,,8.955224,0.0,0.0,0.0,32.55,-124.3,21.0
25%,92235.0,3117.5,34.1,44932.5,22010.0,7.782075,2.589084,241300.0,1965.0,1495.5,...,0.304018,,,56.454612,0.477819,11.320723,1.160332,34.05,-121.885,1478.5
50%,93641.0,20521.0,39.0,61801.0,30130.0,12.911282,3.486169,386400.0,1975.0,2003.0,...,0.964467,,,74.309912,1.864598,23.381443,4.955789,36.63,-120.0,7496.0
75%,95352.5,38483.0,46.05,84916.5,42448.0,20.369123,4.757319,624600.0,1982.0,2610.0,...,1.949272,,,86.061824,4.936286,45.864916,12.930861,38.2,-118.06435,13804.0
max,96161.0,108051.0,75.3,250001.0,154723.0,63.384064,20.521542,2000001.0,2009.0,4001.0,...,49.253731,,,100.0,76.8075,98.388541,91.044776,41.94,-114.3,37182.0


## 2019 US census data

In [None]:
# 2019 census data for all US zipcodes
#reading 2019 combined census data
path_comb_2019=os.path.join('output_census', 'census_comb_2019.csv')
usa_2019=pd.read_csv(path_comb_2019)

In [None]:
#looking for usa_2014 dataframe
usa_2019.head()

In [None]:
#columns of 2014  dataframe
usa_2019.columns

In [None]:
#shape of the dataframe
usa_2019.shape

In [None]:
#removing the rows with house value less than 1
usa_2019=(usa_2019[(usa_2019['House Value']>0)&
                (usa_2019['Household Income']>0)&
                (usa_2019['Monthly Owner Cost']>0)])

In [None]:
#looking for null value
usa_2019.isna().sum()

In [None]:
#removing the rows with lat and lng missing value
usa_2019.dropna(subset=['Lat', 'Lng','Housing_units','County', 
                        'Public Transport Rate','Personal Transport Rate'], 
                         inplace=True)

In [None]:
#shape of the clean dataframe
usa_2019.shape

In [None]:
#info of the dataframe
usa_2019.info()

In [None]:
#describing the dataframe
usa_2019.describe()

In [None]:
usa_2019.columns

In [None]:
usa_2019.drop(['Commute Time Car','Commute Time Public'], axis=1, inplace=True)

In [None]:
usa_2019.shape

In [None]:
#selecting only required columns
usa_2019_redu=usa_2019[['Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
        'High School Rate',
       'College Rate', 'Uneducated Rate', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 
       'Lat', 'Lng', 'Housing_units']]

In [None]:
#saving csv file for machine learning
#usa_2019_redu.to_csv('output_census/usa_2019_ml.csv', index=False)

## 2019 California census data

In [None]:
#reading 2019 combined california census data
path_ca_2019=os.path.join('output_census', 'census_ca_2019.csv')
ca_2019=pd.read_csv(path_ca_2019)

In [None]:
#looking for usa_2019 dataframe
ca_2019.head()

In [None]:
#columns of 2019  dataframe
ca_2019.columns

In [None]:
#shape of the dataframe
ca_2019.shape

In [None]:
#removing the rows with house value less than 1
ca_2019=(ca_2019[(ca_2019['House Value']>0)&
                (ca_2019['Household Income']>0)&
                (ca_2019['Monthly Owner Cost']>0)])

In [None]:
#looking for null value
ca_2019.isna().sum()

In [None]:
#removing the rows with lat and lng missing value


In [None]:
#shape of the clean dataframe
ca_2019.shape

In [None]:
#info of the dataframe
ca_2019.info()

In [None]:
#describing the dataframe
ca_2019.describe()

In [None]:
#selecting only important columns
ca_19=ca_2019[['City','Population', 'House Value','Household Income',
       'Poverty Rate', 'Unemployment Rate','Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate','College Rate','White Population Rate',
       'Uneducated Rate' ]]

In [None]:
#only selecting rows with +ve owner cost and rent
ca_19=(ca_19[(ca_19['Monthly Owner Cost']>0)&
             (ca_19['Monthly Rent']>0)])

In [None]:
#saving csv file for machine learning
ca_19.to_csv('output_census/ca_2019_ml.csv', index=False)

In [None]:
#formating columns with 2 decimal place
ca_19[['House Value','Household Income',
       'Poverty Rate', 'Unemployment Rate','Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Commute Time Public', 'Commute Time Car', 
       'Uneducated Rate' ]]=(ca_19[[ 'House Value','Household Income',
                                   'Poverty Rate', 'Unemployment Rate',
                                     'Monthly Owner Cost', 'Monthly Rent',
                                    'Public Transport Rate', 'Commute Time Public',
                                     'Commute Time Car', 'Uneducated Rate']]
                                                   .applymap('{:.2f}'.format))

In [None]:
#using groupby method to groupby the data accordint to city
ca_19_city=ca_19.groupby('City').agg({'Population':'sum',
                              'House Value':'mean',
                               'Household Income':'mean',
                               'Poverty Rate':'mean',
                               'Unemployment Rate':'mean',
                               'Monthly Owner Cost':'mean',
                               'Monthly Rent':'mean',
                               'Public Transport Rate':'mean',
                               'Personal Transport Rate':'mean',
                                'College Rate':'mean',
                                'White Population Rate':'mean',      
                               'Uneducated Rate':'mean'}).reset_index()

In [None]:
#sorting data according to the population
ca_19_city=ca_19_city.sort_values(by=['House Value',
                                'Monthly Owner Cost','Household Income',
                                'Poverty Rate','Unemployment Rate',
                                'Public Transport Rate','Monthly Rent', 
                                'Uneducated Rate' ], ascending=False)

In [None]:
#saving csv file for plotting
ca_19.to_csv('output_census/ca_2019_fg.csv', index=False)

In [None]:
ca_19_city.shape

In [None]:
#printing the new sorted datasets
ca_19_city.head()

In [None]:
#city with higher house value
ca_19_city.nlargest(10, 'House Value')

In [None]:
#City with smallest poverty rate
ca_19_city.nsmallest(10, 'Poverty Rate')

In [None]:
#city with smallest unemployment rate
ca_19_city.nsmallest(10, 'Unemployment Rate')

In [None]:
#city with larger commute time
ca_19_city.nsmallest(10, 'Commute Time Car')

In [None]:
#city with smallest monthly owner cost
ca_19_city.nsmallest(10, 'Monthly Owner Cost')

In [None]:
#city with largest household income
ca_19_city.nlargest(10, 'Household Income')

## Ploting 2019 California Data

In [None]:
#plotting all correlation using seaborn heatmap
corr_19=ca_19.corr()
import seaborn as sns
fig, ax=plt.subplots(figsize=(15,15))
sns.heatmap(corr_19,vmin=-1, vmax=1, ax=ax, cmap='BrBG')#annot=True,
plt.show()

In [None]:
corr_19=ca_19.corr()
corr_19.style.background_gradient(cmap='coolwarm')

In [None]:
c1 = corr_19.abs().unstack()
c1.sort_values(ascending = False)[12:40]

In [None]:
#plotting
fig, ax=plt.subplots(figsize=(15,15))
ca_2019.plot(kind='scatter', x='Lng', y='Lat', alpha=0.2, 
                s=ca_2019['Population']/100,label='Population',
                 c='House Value', cmap=plt.get_cmap('jet'),
                colorbar=True, ax=ax)

plt.show()