In [1]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib import rcParams
import scipy.stats as sts
import os
from collections import Counter
import requests
import json


## 2012 US census data

In [None]:
# Load the combined 2012 census extract (all US zip codes) from the output folder
path_comb_2012 = os.path.join('output_census', 'census_comb_2012.csv')
usa_2012 = pd.read_csv(path_comb_2012)

In [None]:
#looking for usa_2012 dataframe
usa_2012.head()

In [None]:
#columns of 2012  dataframe
usa_2012.columns

In [None]:
#shape of the dataframe
usa_2012.shape

In [None]:
#keeping only rows whose house value, household income and monthly owner cost are all positive
#(rows where any of the three is NaN fail the comparison and are dropped as well);
#.copy() gives an independent frame so later cells can modify it without
#pandas chained-assignment (SettingWithCopy) ambiguity
usa_2012 = usa_2012.loc[
    (usa_2012['House Value'] > 0)
    & (usa_2012['Household Income'] > 0)
    & (usa_2012['Monthly Owner Cost'] > 0)
].copy()

In [None]:
#looking for null value
usa_2012.isna().sum()

In [None]:
#dropping rows that are missing any of: Lat, Lng, Housing_units, County or the two transport rates
#(reassigning the result instead of inplace=True keeps the step explicit)
usa_2012 = usa_2012.dropna(subset=['Lat', 'Lng', 'Housing_units', 'County',
                                   'Public Transport Rate', 'Personal Transport Rate'])

In [None]:
#shape of the clean dataframe
usa_2012.shape

In [None]:
#info of the dataframe
usa_2012.info()

In [None]:
#describing the dataframe
usa_2012.describe()

In [None]:
usa_2012.columns

In [None]:
#dropping the two commute-time columns; they are not part of the reduced column set selected below
usa_2012 = usa_2012.drop(columns=['Commute Time Car', 'Commute Time Public'])

In [None]:
usa_2012.shape

In [None]:
#reduced 2012 frame: only the columns kept for the machine-learning export
usa_2012_redu = usa_2012.loc[:, [
    'Population', 'Median Age', 'Household Income', 'Per Capita Income',
    'Poverty Rate', 'Unemployment Rate', 'House Value', 'House Construction Year',
    'Monthly Owner Cost', 'Monthly Rent',
    'Public Transport Rate', 'Personal Transport Rate',
    'High School Rate', 'English Language Rate', 'Spanish Language Rate',
    'College Rate', 'Uneducated Rate',
    'White Population Rate', 'Black Population Rate',
    'Hispanic Population Rate', 'Asian Population Rate',
    'City', 'Lat', 'Lng', 'Housing_units',
]]

In [None]:
#saving csv file for machine learning
#usa_2012_redu.to_csv('output_census/usa_2012_ml.csv', index=False)

## 2012 California census data

In [None]:
# Load the combined 2012 California census extract from the output folder
path_ca_2012 = os.path.join('output_census', 'census_ca_2012.csv')
ca_2012 = pd.read_csv(path_ca_2012)

In [None]:
#looking for ca_2012 dataframe
ca_2012.head()

In [None]:
#columns of 2012  dataframe
ca_2012.columns

In [None]:
#shape of the dataframe
ca_2012.shape

In [None]:
#keeping only rows whose house value, household income and monthly owner cost are all positive
#(rows where any of the three is NaN fail the comparison and are dropped as well);
#.copy() gives an independent frame so the later column updates do not write into
#a filtered slice of the raw data
ca_2012 = ca_2012.loc[
    (ca_2012['House Value'] > 0)
    & (ca_2012['Household Income'] > 0)
    & (ca_2012['Monthly Owner Cost'] > 0)
].copy()

In [None]:
#looking for null value
ca_2012.isna().sum()

In [None]:
#dropping rows that are missing either transport rate
#(reassigning the result instead of inplace=True keeps the step explicit)
ca_2012 = ca_2012.dropna(subset=['Public Transport Rate', 'Personal Transport Rate'])

In [None]:
#shape of the clean dataframe
ca_2012.shape

In [None]:
#replacing missing values with 0 in the two commute-time columns only; all other
#columns are left untouched. The filled frame is reassigned rather than written
#back column-by-column into a previously filtered frame.
#NOTE(review): after this step a filled 0 cannot be told apart from a genuine 0 - confirm that is acceptable downstream
ca_2012 = ca_2012.fillna({'Commute Time Public': 0, 'Commute Time Car': 0})

In [None]:
ca_2012['Commute Time Public']

In [None]:
#converting Population to int; astype with a column->dtype mapping returns a new frame,
#so nothing is assigned into a filtered frame
#NOTE(review): astype(int) raises if Population contains NaN - the null check above should show 0 for it
ca_2012 = ca_2012.astype({'Population': int})

In [None]:
#info of the dataframe
ca_2012.info()

In [None]:
#describing the dataframe
ca_2012.describe()

In [None]:
#saving csv file for plotting
#ca_2012.to_csv('output_census/ca_2012_fg.csv', index=False)

## 2014 US census data

In [2]:
# Load the combined 2014 census extract (all US zip codes) from the output folder
path_comb_2014 = os.path.join('output_census', 'census_comb_2014.csv')
usa_2014 = pd.read_csv(path_comb_2014)

In [3]:
#looking for usa_2014 dataframe
usa_2014.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,City,County,Lat,Lng,Housing_units,State
0,601,18088.0,37.1,10833.0,7229.0,60.32176,11.012826,105400.0,1982.0,722.0,...,96.240602,0.967492,99.806502,0.0,Adjuntas,Adjuntas Municipio,,,,PR
1,602,40859.0,39.0,16353.0,9048.0,53.168213,10.639027,91200.0,1980.0,843.0,...,58.540346,2.684843,93.203456,0.183558,Aguada,Aguada Municipio,18.36,-67.18,18073.0,PR
2,603,53162.0,39.2,16323.0,9888.0,48.957902,7.623867,128700.0,1977.0,841.0,...,73.001392,3.598435,96.275535,1.183176,Aguadilla,Aguadilla Municipio,18.45,-67.11,25653.0,PR
3,606,6415.0,39.2,14138.0,6385.0,58.893219,2.681216,105800.0,1977.0,569.0,...,87.622759,2.400624,99.890881,0.0,Maricao,Maricao Municipio,18.2,-66.9,2877.0,PR
4,610,28805.0,39.7,17265.0,8197.0,49.119944,4.481861,113700.0,1979.0,752.0,...,67.127235,3.478563,99.12168,0.090262,Anasco,Aasco Municipio,18.28,-67.13,12618.0,PR


In [4]:
#columns of the 2014 US dataframe
usa_2014.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language Rate',
       'Spanish Language Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate', 'City', 'County', 'Lat', 'Lng',
       'Housing_units', 'State'],
      dtype='object')

In [5]:
#shape of the dataframe
usa_2014.shape

(33120, 30)

In [6]:
#keeping only rows whose house value, household income and monthly owner cost are all positive
#(rows where any of the three is NaN fail the comparison and are dropped as well);
#.copy() gives an independent frame so later cells can modify it without
#pandas chained-assignment (SettingWithCopy) ambiguity
usa_2014 = usa_2014.loc[
    (usa_2014['House Value'] > 0)
    & (usa_2014['Household Income'] > 0)
    & (usa_2014['Monthly Owner Cost'] > 0)
].copy()

In [7]:
#looking for null value
usa_2014.isna().sum()

Zipcode                         0
Population                      0
Median Age                      0
Household Income                0
Per Capita Income               0
Poverty Rate                    0
Unemployment Rate               0
House Value                     0
House Construction Year         0
Monthly Owner Cost              0
Monthly Rent                    0
Public Transport Rate           0
Personal Transport Rate         0
Commute Time Public         21504
Commute Time Car            21504
High School Rate                0
College Rate                    0
Uneducated Rate                 0
English Language Rate           0
Spanish Language Rate           0
White Population Rate           0
Black Population Rate           0
Hispanic Population Rate        0
Asian Population Rate           0
City                            0
County                          1
Lat                            16
Lng                            16
Housing_units                  16
State         

In [8]:
#dropping rows that are missing any of: Lat, Lng, Housing_units or County
#(reassigning the result instead of inplace=True keeps the step explicit)
usa_2014 = usa_2014.dropna(subset=['Lat', 'Lng', 'Housing_units', 'County'])

In [9]:
#shape of the clean dataframe
usa_2014.shape

(30279, 30)

In [None]:
#info of the dataframe
usa_2014.info()

In [None]:
#describing the dataframe
usa_2014.describe()

In [None]:
usa_2014.columns

In [None]:
#dropping the two commute-time columns (21504 nulls each in the null check above);
#they are not part of the reduced column set selected below
usa_2014 = usa_2014.drop(columns=['Commute Time Car', 'Commute Time Public'])

In [None]:
usa_2014.shape

In [None]:
#reduced 2014 frame: only the columns kept for the machine-learning export
usa_2014_redu = usa_2014.loc[:, [
    'Population', 'Median Age', 'Household Income', 'Per Capita Income',
    'Poverty Rate', 'Unemployment Rate', 'House Value', 'House Construction Year',
    'Monthly Owner Cost', 'Monthly Rent',
    'Public Transport Rate', 'Personal Transport Rate',
    'High School Rate', 'English Language Rate', 'Spanish Language Rate',
    'College Rate', 'Uneducated Rate',
    'White Population Rate', 'Black Population Rate',
    'Hispanic Population Rate', 'Asian Population Rate',
    'City', 'Lat', 'Lng', 'Housing_units',
]]

In [None]:
#saving csv file for machine learning
#usa_2014_redu.to_csv('output_census/usa_2014_ml.csv', index=False)

## 2014 California census data

In [10]:
# Load the combined 2014 California census extract from the output folder
path_ca_2014 = os.path.join('output_census', 'census_ca_2014.csv')
ca_2014 = pd.read_csv(path_ca_2014)

In [11]:
#looking for ca_2014 dataframe
ca_2014.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,City,County,Lat,Lng,Housing_units,State
0,90001,56314.0,27.3,34050.0,11224.0,33.55116,5.645133,241800.0,1951.0,1649.0,...,49.893455,9.464787,89.814256,0.147388,Los Angeles,Los Angeles County,33.97,-118.25,13788.0,CA
1,90002,50098.0,26.2,30214.0,10497.0,36.159128,5.5611,218800.0,1951.0,1633.0,...,48.375185,24.583816,74.184598,0.301409,Los Angeles,Los Angeles County,33.95,-118.25,12598.0,CA
2,90003,66913.0,26.8,30016.0,9915.0,38.381182,6.874598,228600.0,1949.0,1683.0,...,22.385785,24.637963,74.111159,0.361664,Los Angeles,Los Angeles County,33.96,-118.27,17127.0,CA
3,90004,63547.0,35.8,38493.0,26575.0,25.135726,7.929564,724900.0,1949.0,3578.0,...,34.542937,3.573733,50.831668,25.648732,Los Angeles,Los Angeles County,34.08,-118.31,24278.0,CA
4,90005,38638.0,34.0,31214.0,19305.0,28.316683,7.00088,635500.0,1953.0,3252.0,...,19.550701,4.28076,52.792588,33.324706,Los Angeles,Los Angeles County,34.06,-118.31,16345.0,CA


In [12]:
#columns of 2014  dataframe
ca_2014.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language Rate',
       'Spanish Language Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate', 'City', 'County', 'Lat', 'Lng',
       'Housing_units', 'State'],
      dtype='object')

In [13]:
#shape of the dataframe
ca_2014.shape

(1763, 30)

In [14]:
#keeping only rows whose house value, household income and monthly owner cost are all positive
#(rows where any of the three is NaN fail the comparison and are dropped as well);
#.copy() gives an independent frame so the later column updates do not write into
#a filtered slice of the raw data
#NOTE(review): 'House Construction Year' still holds large negative values after this
#filter (describe() further down reports a minimum near -6.7e8) - presumably a
#missing-data sentinel; confirm whether those rows should be filtered here too
ca_2014 = ca_2014.loc[
    (ca_2014['House Value'] > 0)
    & (ca_2014['Household Income'] > 0)
    & (ca_2014['Monthly Owner Cost'] > 0)
].copy()

In [15]:
#looking for null value
ca_2014.isna().sum()

Zipcode                       0
Population                    0
Median Age                    0
Household Income              0
Per Capita Income             0
Poverty Rate                  0
Unemployment Rate             0
House Value                   0
House Construction Year       0
Monthly Owner Cost            0
Monthly Rent                  0
Public Transport Rate         0
Personal Transport Rate       0
Commute Time Public         986
Commute Time Car            986
High School Rate              0
College Rate                  0
Uneducated Rate               0
English Language Rate         0
Spanish Language Rate         0
White Population Rate         0
Black Population Rate         0
Hispanic Population Rate      0
Asian Population Rate         0
City                          0
County                        0
Lat                           0
Lng                           0
Housing_units                 0
State                         0
dtype: int64

In [16]:
#no rows are dropped here: only the commute time columns contain nulls, and those are filled with 0 below


In [18]:
#shape of the clean dataframe
ca_2014.shape

(1591, 30)

In [19]:
#replacing missing values with 0 in the two commute-time columns only; all other
#columns are left untouched. The filled frame is reassigned rather than written
#back column-by-column into a previously filtered frame.
#NOTE(review): after this step a filled 0 cannot be told apart from a genuine 0 - confirm that is acceptable downstream
ca_2014 = ca_2014.fillna({'Commute Time Public': 0, 'Commute Time Car': 0})

In [20]:
ca_2014['Commute Time Public']

0            0.0
1            0.0
2            0.0
3            0.0
4       324945.0
          ...   
1757         0.0
1758         0.0
1759         0.0
1760         0.0
1762      3510.0
Name: Commute Time Public, Length: 1591, dtype: float64

In [21]:
#converting Population (read from the CSV as float, e.g. 56314.0) to int;
#astype with a column->dtype mapping returns a new frame, so nothing is assigned
#into a filtered frame. Population has no nulls (see the null check above), so the cast is safe.
ca_2014 = ca_2014.astype({'Population': int})

In [22]:
#info of the dataframe
ca_2014.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1591 entries, 0 to 1762
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Zipcode                   1591 non-null   int64  
 1   Population                1591 non-null   int64  
 2   Median Age                1591 non-null   float64
 3   Household Income          1591 non-null   float64
 4   Per Capita Income         1591 non-null   float64
 5   Poverty Rate              1591 non-null   float64
 6   Unemployment Rate         1591 non-null   float64
 7   House Value               1591 non-null   float64
 8   House Construction Year   1591 non-null   float64
 9   Monthly Owner Cost        1591 non-null   float64
 10  Monthly Rent              1591 non-null   float64
 11  Public Transport Rate     1591 non-null   float64
 12  Personal Transport Rate   1591 non-null   float64
 13  Commute Time Public       1591 non-null   float64
 14  Commute 

In [23]:
#describing the dataframe
ca_2014.describe()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,Uneducated Rate,English Language Rate,Spanish Language Rate,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,Lat,Lng,Housing_units
count,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,...,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0
mean,93608.19736,23859.588309,39.799623,63338.534884,31638.92269,15.7996,5.34885,390728.7,-1255102.0,2150.230044,...,1.310003,62.382804,20.973316,70.809027,4.25109,30.239868,9.573416,36.339154,-119.898256,8578.024513
std,1815.620943,21839.2633,8.754959,28161.809543,17261.20338,10.052693,2.475556,241884.2,28930920.0,773.058022,...,1.628642,22.576559,19.742622,19.451062,7.296059,24.185344,12.65802,2.410507,2.112705,7210.895207
min,90001.0,50.0,19.3,11922.0,5180.0,0.0,0.0,22900.0,-666666700.0,427.0,...,0.0,2.068966,0.0,9.683246,0.0,0.0,0.0,32.55,-124.3,21.0
25%,92233.5,3171.5,33.4,42817.5,20276.5,8.318399,3.853681,205850.0,1964.5,1561.5,...,0.278107,46.525173,6.043723,58.657711,0.500798,10.671194,1.15741,34.05,-121.845,1558.0
50%,93636.0,20173.0,38.5,57287.0,27463.0,13.675235,5.174385,331300.0,1974.0,2029.0,...,0.89162,65.609986,14.003956,74.697646,1.885617,22.791519,4.850746,36.6,-120.0,7538.0
75%,95349.0,38008.0,45.0,77818.0,38396.5,20.980855,6.655864,523950.0,1981.0,2605.0,...,1.893405,81.001156,30.952544,86.095517,4.91266,44.050939,11.991283,38.2,-118.06935,13845.5
max,96161.0,106521.0,74.5,236912.0,142620.0,67.642753,24.84375,1000001.0,2011.0,4001.0,...,33.108108,100.0,89.793103,100.0,81.710857,100.0,73.655063,41.94,-114.3,37182.0


In [24]:
#saving csv file for plotting
#ca_2014.to_csv('output_census/ca_2014_fg.csv', index=False)

## 2015 US census data

In [None]:
# Load the combined 2015 census extract (all US zip codes) from the output folder
path_comb_2015 = os.path.join('output_census', 'census_comb_2015.csv')
usa_2015 = pd.read_csv(path_comb_2015)

In [None]:
#looking for usa_2015 dataframe
usa_2015.head()

In [None]:
#columns of the 2015 US dataframe
usa_2015.columns

In [None]:
#shape of the dataframe
usa_2015.shape

In [None]:
#keeping only rows whose house value, household income and monthly owner cost are all positive
#(rows where any of the three is NaN fail the comparison and are dropped as well);
#.copy() gives an independent frame so later cells can modify it without
#pandas chained-assignment (SettingWithCopy) ambiguity
usa_2015 = usa_2015.loc[
    (usa_2015['House Value'] > 0)
    & (usa_2015['Household Income'] > 0)
    & (usa_2015['Monthly Owner Cost'] > 0)
].copy()

In [None]:
#looking for null value
usa_2015.isna().sum()

In [None]:
#dropping rows that are missing any of: Lat, Lng, Housing_units, County or Monthly Rent
#(reassigning the result instead of inplace=True keeps the step explicit)
usa_2015 = usa_2015.dropna(subset=['Lat', 'Lng', 'Housing_units', 'County', 'Monthly Rent'])

In [None]:
#shape of the clean dataframe
usa_2015.shape

In [None]:
#info of the dataframe
usa_2015.info()

In [None]:
#describing the dataframe
usa_2015.describe()

In [None]:
usa_2015.columns

In [None]:
#dropping the two commute-time columns; they are not part of the reduced column set selected below
usa_2015 = usa_2015.drop(columns=['Commute Time Car', 'Commute Time Public'])

In [None]:
usa_2015.shape

In [None]:
#reduced 2015 frame: only the columns kept for the machine-learning export
usa_2015_redu = usa_2015.loc[:, [
    'Population', 'Median Age', 'Household Income', 'Per Capita Income',
    'Poverty Rate', 'Unemployment Rate', 'House Value', 'House Construction Year',
    'Monthly Owner Cost', 'Monthly Rent',
    'Public Transport Rate', 'Personal Transport Rate',
    'High School Rate', 'English Language Rate', 'Spanish Language Rate',
    'College Rate', 'Uneducated Rate',
    'White Population Rate', 'Black Population Rate',
    'Hispanic Population Rate', 'Asian Population Rate',
    'City', 'Lat', 'Lng', 'Housing_units',
]]

In [None]:
#saving csv file for machine learning
#usa_2015_redu.to_csv('output_census/usa_2015_ml.csv', index=False)

## 2015 California census data

In [25]:
# Load the combined 2015 California census extract from the output folder
path_ca_2015 = os.path.join('output_census', 'census_ca_2015.csv')
ca_2015 = pd.read_csv(path_ca_2015)

In [26]:
#looking for ca_2015 dataframe
ca_2015.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,City,County,Lat,Lng,Housing_units,State
0,95422,15476.0,42.3,25578.0,16355.0,36.792453,8.419488,86800.0,1976.0,1123.0,...,72.221504,6.216077,23.901525,0.394159,Clearlake,Lake County,38.97,-122.64,8546.0,CA
1,95423,3260.0,55.8,27889.0,18760.0,33.067485,7.239264,160100.0,1975.0,1307.0,...,89.969325,1.717791,6.349693,0.490798,Clearlake Oaks,Lake County,39.1,-122.6,2680.0,CA
2,95425,10597.0,42.3,60517.0,29688.0,8.106068,3.189582,353400.0,1982.0,2026.0,...,77.6918,0.990846,31.112579,4.916486,Cloverdale,Sonoma County,38.8,-123.0,4544.0,CA
3,95426,1544.0,50.9,62833.0,27425.0,24.740933,4.274611,195100.0,1979.0,1295.0,...,91.904145,0.0,7.707254,6.282383,Cobb,Lake County,38.81,-122.73,1315.0,CA
4,95428,2365.0,40.1,30975.0,16874.0,31.585624,7.906977,235800.0,1973.0,1250.0,...,57.589852,0.5074,16.490486,0.12685,Covelo,Mendocino County,39.7,-123.1,1160.0,CA


In [27]:
#columns of 2015  dataframe
ca_2015.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language Rate',
       'Spanish Language Rate', 'White Population Rate',
       'Black Population Rate', 'Hispanic Population Rate',
       'Asian Population Rate', 'City', 'County', 'Lat', 'Lng',
       'Housing_units', 'State'],
      dtype='object')

In [28]:
#shape of the dataframe
ca_2015.shape

(1763, 30)

In [29]:
#keeping only rows whose house value, household income and monthly owner cost are all positive
#(rows where any of the three is NaN fail the comparison and are dropped as well);
#.copy() gives an independent frame so the later column updates do not write into
#a filtered slice of the raw data
ca_2015 = ca_2015.loc[
    (ca_2015['House Value'] > 0)
    & (ca_2015['Household Income'] > 0)
    & (ca_2015['Monthly Owner Cost'] > 0)
].copy()

In [30]:
#looking for null value
ca_2015.isna().sum()

Zipcode                       0
Population                    0
Median Age                    0
Household Income              0
Per Capita Income             0
Poverty Rate                  0
Unemployment Rate             0
House Value                   0
House Construction Year       0
Monthly Owner Cost            0
Monthly Rent                 18
Public Transport Rate         0
Personal Transport Rate       0
Commute Time Public         908
Commute Time Car            908
High School Rate              0
College Rate                  0
Uneducated Rate               0
English Language Rate         0
Spanish Language Rate         0
White Population Rate         0
Black Population Rate         0
Hispanic Population Rate      0
Asian Population Rate         0
City                          0
County                        0
Lat                           0
Lng                           0
Housing_units                 0
State                         0
dtype: int64

In [31]:
#dropping the rows with a missing Monthly Rent (18 rows per the null check above);
#Lat and Lng have no missing values in this frame
#(reassigning the result instead of inplace=True keeps the step explicit)
ca_2015 = ca_2015.dropna(subset=['Monthly Rent'])

In [32]:
#replacing missing values with 0 in the two commute-time columns only; all other
#columns are left untouched. The filled frame is reassigned rather than written
#back column-by-column into a previously filtered frame.
#NOTE(review): after this step a filled 0 cannot be told apart from a genuine 0 - confirm that is acceptable downstream
ca_2015 = ca_2015.fillna({'Commute Time Public': 0, 'Commute Time Car': 0})

In [33]:
#shape of the clean dataframe
ca_2015.shape

(1497, 30)

In [34]:
#saving csv file for plotting
#ca_2015.to_csv('output_census/ca_2015_fg.csv', index=False)

## 2017 US census data

In [None]:
# Load the combined 2017 census extract (all US zip codes) from the output folder
path_comb_2017 = os.path.join('output_census', 'census_comb_2017.csv')
usa_2017 = pd.read_csv(path_comb_2017)

In [None]:
#looking for usa_2017 dataframe
usa_2017.head()

In [None]:
#columns of 2017  dataframe
usa_2017.columns

In [None]:
#shape of the dataframe
usa_2017.shape

In [None]:
#keeping only rows whose house value, household income and monthly owner cost are all positive
#(rows where any of the three is NaN fail the comparison and are dropped as well);
#.copy() gives an independent frame so later cells can modify it without
#pandas chained-assignment (SettingWithCopy) ambiguity
usa_2017 = usa_2017.loc[
    (usa_2017['House Value'] > 0)
    & (usa_2017['Household Income'] > 0)
    & (usa_2017['Monthly Owner Cost'] > 0)
].copy()

In [None]:
#looking for null value
usa_2017.isna().sum()

In [None]:
#dropping rows that are missing any of: Lat, Lng, Housing_units or County
#(reassigning the result instead of inplace=True keeps the step explicit)
usa_2017 = usa_2017.dropna(subset=['Lat', 'Lng', 'Housing_units', 'County'])

In [None]:
#shape of the clean dataframe
usa_2017.shape

In [None]:
#info of the dataframe
usa_2017.info()

In [None]:
#describing the dataframe
usa_2017.describe()

In [None]:
usa_2017.columns

In [None]:
#dropping the commute-time and language columns; none of them is part of the
#reduced column set selected below
usa_2017 = usa_2017.drop(columns=['Commute Time Car', 'Commute Time Public',
                                  'English Language', 'Spanish Language'])

In [None]:
usa_2017.shape

In [None]:
#reduced 2017 frame: only the columns kept for the machine-learning export
usa_2017_redu = usa_2017.loc[:, [
    'Population', 'Median Age', 'Household Income', 'Per Capita Income',
    'Poverty Rate', 'Unemployment Rate', 'House Value', 'House Construction Year',
    'Monthly Owner Cost', 'Monthly Rent',
    'Public Transport Rate', 'Personal Transport Rate',
    'High School Rate', 'College Rate', 'Uneducated Rate',
    'White Population Rate', 'Black Population Rate',
    'Hispanic Population Rate', 'Asian Population Rate',
    'City', 'Lat', 'Lng', 'Housing_units',
]]

In [None]:
#saving csv file for machine learning
#usa_2017_redu.to_csv('output_census/usa_2017_ml.csv', index=False)

## 2017 California census data

In [35]:
# Load the combined 2017 California census extract from the output folder
path_ca_2017 = os.path.join('output_census', 'census_ca_2017.csv')
ca_2017 = pd.read_csv(path_ca_2017)

In [36]:
#looking at the first rows of the ca_2017 dataframe
ca_2017.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,City,County,Lat,Lng,Housing_units,State
0,90001,58738.0,28.7,35660.0,11882.0,31.757976,4.974633,305500.0,1949.0,1695.0,...,32.367462,9.164425,89.841329,0.224727,Los Angeles,Los Angeles County,33.97,-118.25,13788.0,CA
1,90002,52856.0,27.7,34000.0,11807.0,33.666944,5.032541,272400.0,1951.0,1678.0,...,37.683517,21.569926,76.420842,0.565688,Los Angeles,Los Angeles County,33.95,-118.25,12598.0,CA
2,90003,70490.0,28.2,34397.0,11305.0,32.929494,5.440488,300700.0,1951.0,1688.0,...,29.767343,22.383317,76.766917,0.285147,Los Angeles,Los Angeles County,33.96,-118.27,17127.0,CA
3,90004,62733.0,35.1,46581.0,30590.0,19.104777,4.182806,918500.0,1943.0,3467.0,...,37.758437,4.055282,51.350964,25.074522,Los Angeles,Los Angeles County,34.08,-118.31,24278.0,CA
4,90005,39562.0,35.4,32461.0,21566.0,28.562762,5.156463,713400.0,1949.0,3067.0,...,20.33517,6.228199,49.519741,34.823821,Los Angeles,Los Angeles County,34.06,-118.31,16345.0,CA


In [37]:
#columns of 2017  dataframe
ca_2017.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language',
       'Spanish Language', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 'County',
       'Lat', 'Lng', 'Housing_units', 'State'],
      dtype='object')

In [38]:
#shape of the dataframe
ca_2017.shape

(1763, 30)

In [39]:
#keeping only rows whose house value, household income and monthly owner cost are all positive
#(rows where any of the three is NaN fail the comparison and are dropped as well);
#.copy() gives an independent frame so the later column updates do not write into
#a filtered slice of the raw data
ca_2017 = ca_2017.loc[
    (ca_2017['House Value'] > 0)
    & (ca_2017['Household Income'] > 0)
    & (ca_2017['Monthly Owner Cost'] > 0)
].copy()

In [40]:
#looking for null value
ca_2017.isna().sum()

Zipcode                        0
Population                     0
Median Age                     0
Household Income               0
Per Capita Income              0
Poverty Rate                   0
Unemployment Rate              0
House Value                    0
House Construction Year        0
Monthly Owner Cost             0
Monthly Rent                   0
Public Transport Rate          0
Personal Transport Rate        0
Commute Time Public          901
Commute Time Car             901
High School Rate               0
College Rate                   0
Uneducated Rate                0
English Language            1534
Spanish Language            1534
White Population Rate          0
Black Population Rate          0
Hispanic Population Rate       0
Asian Population Rate          0
City                           0
County                         0
Lat                            0
Lng                            0
Housing_units                  0
State                          0
dtype: int

In [None]:
#no rows are dropped here: Lat and Lng have no missing values (see the null counts above)


In [41]:
#replacing missing values with 0 in the two commute-time columns only; all other
#columns (including the all-null English/Spanish Language columns) are left untouched.
#The filled frame is reassigned rather than written back column-by-column into a
#previously filtered frame.
#NOTE(review): after this step a filled 0 cannot be told apart from a genuine 0 - confirm that is acceptable downstream
ca_2017 = ca_2017.fillna({'Commute Time Public': 0, 'Commute Time Car': 0})

In [42]:
#shape of the clean dataframe
ca_2017.shape

(1534, 30)

In [43]:
#saving csv file for plotting
#ca_2017.to_csv('output_census/ca_2017_fg.csv', index=False)

## 2019 US census data

In [None]:
# Load the combined 2019 census extract (all US zip codes) from the output folder
path_comb_2019 = os.path.join('output_census', 'census_comb_2019.csv')
usa_2019 = pd.read_csv(path_comb_2019)

In [None]:
#looking at the first rows of the usa_2019 dataframe
usa_2019.head()

In [None]:
#columns of the 2019 US dataframe
usa_2019.columns

In [None]:
#shape of the dataframe
usa_2019.shape

In [None]:
#keeping only rows whose house value, household income and monthly owner cost are all positive
#(rows where any of the three is NaN fail the comparison and are dropped as well);
#.copy() gives an independent frame so later cells can modify it without
#pandas chained-assignment (SettingWithCopy) ambiguity
usa_2019 = usa_2019.loc[
    (usa_2019['House Value'] > 0)
    & (usa_2019['Household Income'] > 0)
    & (usa_2019['Monthly Owner Cost'] > 0)
].copy()

In [None]:
#looking for null value
usa_2019.isna().sum()

In [None]:
#dropping rows that are missing any of: Lat, Lng, Housing_units, County or the two transport rates
#(reassigning the result instead of inplace=True keeps the step explicit)
usa_2019 = usa_2019.dropna(subset=['Lat', 'Lng', 'Housing_units', 'County',
                                   'Public Transport Rate', 'Personal Transport Rate'])

In [None]:
#shape of the clean dataframe
usa_2019.shape

In [None]:
usa_2019.info()

In [None]:
#dropping the two commute-time columns; they are not part of the reduced column set selected below
usa_2019 = usa_2019.drop(columns=['Commute Time Car', 'Commute Time Public'])

In [None]:
usa_2019.shape

In [None]:
#reduced 2019 frame: only the columns kept for the machine-learning export
usa_2019_redu = usa_2019.loc[:, [
    'Population', 'Median Age', 'Household Income', 'Per Capita Income',
    'Poverty Rate', 'Unemployment Rate', 'House Value', 'House Construction Year',
    'Monthly Owner Cost', 'Monthly Rent',
    'Public Transport Rate', 'Personal Transport Rate',
    'High School Rate', 'College Rate', 'Uneducated Rate',
    'White Population Rate', 'Black Population Rate',
    'Hispanic Population Rate', 'Asian Population Rate',
    'City', 'Lat', 'Lng', 'Housing_units',
]]

In [None]:
#saving csv file for machine learning
#usa_2019_redu.to_csv('output_census/usa_2019_ml.csv', index=False)

## 2019 California census data

In [44]:
# Load the combined 2019 California census extract from the output folder
path_ca_2019 = os.path.join('output_census', 'census_ca_2019.csv')
ca_2019 = pd.read_csv(path_ca_2019)

In [45]:
#looking at the first rows of the ca_2019 dataframe
ca_2019.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,City,County,Lat,Lng,Housing_units,State
0,90001,59832.0,29.1,43360.0,13727.0,25.49639,3.929335,359000.0,1949,1853.0,...,41.88227,8.921647,90.134042,0.315884,Los Angeles,Los Angeles County,33.97,-118.25,13788.0,CA
1,90002,53302.0,28.0,37285.0,13284.0,31.188323,4.11054,345900.0,1954,1813.0,...,42.973997,18.912986,78.886346,1.02998,Los Angeles,Los Angeles County,33.95,-118.25,12598.0,CA
2,90003,73730.0,28.3,40598.0,13441.0,29.959311,4.002441,362800.0,1954,1892.0,...,33.116777,20.676794,78.348027,0.29703,Los Angeles,Los Angeles County,33.96,-118.27,17127.0,CA
3,90004,60541.0,35.2,49675.0,32090.0,17.784642,2.685783,1063200.0,1946,3489.0,...,35.676649,3.61408,50.289886,25.465387,Los Angeles,Los Angeles County,34.08,-118.31,24278.0,CA
4,90005,39732.0,35.6,38491.0,24267.0,25.999195,3.28954,777100.0,1950,3149.0,...,23.336354,5.763616,49.481526,34.221786,Los Angeles,Los Angeles County,34.06,-118.31,16345.0,CA


In [46]:
#columns of 2019  dataframe
ca_2019.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language',
       'Spanish Language', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 'County',
       'Lat', 'Lng', 'Housing_units', 'State'],
      dtype='object')

In [47]:
#shape of the dataframe
ca_2019.shape

(1763, 30)

In [48]:
#keeping only rows whose house value, household income and monthly owner cost are all positive
#(rows where any of the three is NaN fail the comparison and are dropped as well);
#.copy() gives an independent frame so the later column updates do not write into
#a filtered slice of the raw data
ca_2019 = ca_2019.loc[
    (ca_2019['House Value'] > 0)
    & (ca_2019['Household Income'] > 0)
    & (ca_2019['Monthly Owner Cost'] > 0)
].copy()

In [49]:
#looking for null value
ca_2019.isna().sum()

Zipcode                        0
Population                     0
Median Age                     0
Household Income               0
Per Capita Income              0
Poverty Rate                   0
Unemployment Rate              0
House Value                    0
House Construction Year        0
Monthly Owner Cost             0
Monthly Rent                   0
Public Transport Rate          0
Personal Transport Rate        0
Commute Time Public          879
Commute Time Car             879
High School Rate               0
College Rate                   0
Uneducated Rate                0
English Language            1524
Spanish Language            1524
White Population Rate          0
Black Population Rate          0
Hispanic Population Rate       0
Asian Population Rate          0
City                           0
County                         0
Lat                            0
Lng                            0
Housing_units                  0
State                          0
dtype: int

In [None]:
#no rows are dropped here: Lat and Lng have no missing values (see the null counts above)


In [50]:
#replacing missing values with 0 in the two commute-time columns only; all other
#columns (including the all-null English/Spanish Language columns) are left untouched.
#The filled frame is reassigned rather than written back column-by-column into a
#previously filtered frame.
#NOTE(review): after this step a filled 0 cannot be told apart from a genuine 0 - confirm that is acceptable downstream
ca_2019 = ca_2019.fillna({'Commute Time Public': 0, 'Commute Time Car': 0})

In [51]:
#shape of the clean dataframe
ca_2019.shape

(1524, 30)

In [52]:
#saving csv file for plotting
#ca_2019.to_csv('output_census/ca_2019_fg.csv', index=False)

In [None]:
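#saving csv file for machine learning
#NOTE(review): `ca_19` is not defined anywhere in this notebook, so uncommenting the line below raises NameError - point it at the intended dataframe first
#ca_19.to_csv('output_census/ca_2019_ml.csv', index=False)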