In [1]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib import rcParams
import scipy.stats as sts
import os
from collections import Counter
import requests
import json

# Google API Key
from api_keys import gkey

In [2]:
#applying matplotlib's 'ggplot' style sheet to the figures drawn in this notebook
plt.style.use('ggplot')

In [3]:
#Set Fonts/Plot Style
rcParams['figure.figsize'] = [15.0, 10.0]
rcParams['figure.dpi'] = 80
rcParams['savefig.dpi'] = 100
rcParams['font.size'] = 20
rcParams['legend.fontsize'] = 'large'
rcParams['figure.titlesize'] = 'medium'
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']
rcParams['xtick.direction'] = 'in'
rcParams['ytick.direction'] = 'in'
rcParams['xtick.top'] = True
rcParams['ytick.right'] = True
rcParams['xtick.major.size'] = 10
rcParams['ytick.major.size'] = 10
rcParams['xtick.minor.size'] = 5
rcParams['ytick.minor.size'] = 5
rcParams['xtick.minor.visible'] = True
rcParams['ytick.minor.visible'] = True

## Looking for 2012 US census data

In [None]:
# 2012 census data for all US zipcodes
#reading 2012 combined census data
path_comb_2012=os.path.join('output_census', 'census_comb_2012.csv')
usa_2012=pd.read_csv(path_comb_2012)

In [None]:
#looking for usa_2012 dataframe
usa_2012.head()

In [None]:
#columns of 2012  dataframe
usa_2012.columns

In [None]:
#shape of the dataframe
usa_2012.shape

In [None]:
#keeping only the rows where house value, household income and monthly
#owner cost are all greater than 0
#.copy() makes the result an independent dataframe, so the in-place dropna
#in a later cell does not raise SettingWithCopyWarning
usa_2012=(usa_2012[(usa_2012['House Value']>0)&
                (usa_2012['Household Income']>0)&
                (usa_2012['Monthly Owner Cost']>0)]).copy()

In [None]:
#looking for null value
usa_2012.isna().sum()

In [None]:
#removing the rows with a missing value in lat, lng, public transport rate,
#personal transport rate or per capita income (usa_2012 is modified in place)
usa_2012.dropna(subset=['Lat', 'Lng', 
                        'Public Transport Rate',
                        'Personal Transport Rate','Per Capita Income'], 
                         inplace=True)

In [None]:
#shape of the clean dataframe
usa_2012.shape

In [None]:
#info of the dataframe
usa_2012.info()

In [None]:
#describing the dataframe
usa_2012.describe()

## Looking for 2012 California census data

In [None]:
#reading 2012 combined california census data
path_ca_2012=os.path.join('output_census', 'census_ca_2012.csv')
ca_2012=pd.read_csv(path_ca_2012)

In [None]:
#looking for ca_2012 dataframe
ca_2012.head()

In [None]:
#columns of 2012  dataframe
ca_2012.columns

In [None]:
#shape of the dataframe
ca_2012.shape

In [None]:
#keeping only the rows where house value, household income and monthly
#owner cost are all greater than 0
#.copy() makes the result an independent dataframe, so the in-place dropna
#and the column assignments in later cells do not raise SettingWithCopyWarning
ca_2012=(ca_2012[(ca_2012['House Value']>0)&
                (ca_2012['Household Income']>0)&
                (ca_2012['Monthly Owner Cost']>0)]).copy()

In [None]:
#looking for null value
ca_2012.isna().sum()

In [None]:
#removing the rows with a missing public or personal transport rate
#(ca_2012 is modified in place)
ca_2012.dropna(subset=['Public Transport Rate',
                        'Personal Transport Rate'], inplace=True)

In [None]:
#shape of the clean dataframe
ca_2012.shape

In [None]:
#converting minutes to hour
#note: each column is divided by 60 and stored back under the same name,
#so running this cell a second time divides the values by 60 again

ca_2012['Commute Time Public']=ca_2012['Commute Time Public']/60
ca_2012['Commute Time Car']=ca_2012['Commute Time Car']/60

In [None]:
#converting to int
ca_2012['Population']=ca_2012['Population'].astype(int)

In [None]:
#info of the dataframe
ca_2012.info()

In [None]:
#describing the dataframe
ca_2012.describe()

In [None]:
#selecting only important columns
ca_12=ca_2012[['City','Population', 'House Value','Household Income',
       'Poverty Rate', 'Unemployment Rate','Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Uneducated Rate' ]]

In [None]:
#only selecting rows with +ve owner cost and rent
ca_12=(ca_12[(ca_12['Monthly Owner Cost']>0)&
             (ca_12['Monthly Rent']>0)])

In [None]:
ca_12.shape

In [None]:
#rounding the value columns to 2 decimal places
#round() keeps the columns numeric, which the groupby means and corr() in
#later cells need; formatting with '{:.2f}'.format would turn them into strings
#'Commute Time Public' and 'Commute Time Car' are not listed because they
#were not selected into ca_12, so indexing them would raise KeyError
ca_12=ca_12.round(dict.fromkeys(['House Value','Household Income',
                                 'Poverty Rate','Unemployment Rate',
                                 'Monthly Owner Cost','Monthly Rent',
                                 'Public Transport Rate','Uneducated Rate'],2))

In [None]:
#one row per city: population is summed, every other column is averaged
ca_12_city=(ca_12
            .groupby('City')
            .agg({'Population':'sum',
                  **dict.fromkeys(['House Value','Household Income',
                                   'Poverty Rate','Unemployment Rate',
                                   'Monthly Owner Cost','Monthly Rent',
                                   'Public Transport Rate',
                                   'Uneducated Rate'],'mean')})
            .reset_index())

In [None]:
#sorting the cities in descending order by house value; the remaining
#columns in the list only break ties, in the order given
ca_12_city=ca_12_city.sort_values(by=['House Value',
                                'Monthly Owner Cost','Household Income',
                                'Poverty Rate','Unemployment Rate',
                                'Public Transport Rate','Monthly Rent', 
                                'Uneducated Rate' ], ascending=False)

In [None]:
#printing the new sorted datasets
ca_12_city.head()

In [None]:
#city with higher house value
ca_12_city.nlargest(10, 'House Value')

In [None]:
#City with smallest poverty rate
ca_12_city.nsmallest(10, 'Poverty Rate')

In [None]:
#city with smallest unemployment rate
ca_12_city.nsmallest(10, 'Unemployment Rate')

In [None]:
#city with larger commute time
#left disabled: 'Commute Time Car' is not one of the columns aggregated into ca_12_city
#ca_12_city.nsmallest(10, 'Commute Time Car')

In [None]:
#city with smallest monthly owner cost
ca_12_city.nsmallest(10, 'Monthly Owner Cost')

In [None]:
#city with largest household income
ca_12_city.nlargest(10, 'Household Income')

## Plotting 2012 california data

In [None]:
#plotting all correlation using seaborn heatmap
#only the numeric columns are correlated: 'City' is text, and DataFrame.corr()
#raises on non-numeric columns in pandas 2.0 and later
#(seaborn is already imported as sns in the first cell)
corr_12=ca_12.select_dtypes(include='number').corr()
fig, ax=plt.subplots(figsize=(15,15))
sns.heatmap(corr_12,vmin=-1, vmax=1, ax=ax, cmap='BrBG')#annot=True,
ax.set_title('Correlation of 2012 California census columns')
plt.show()

In [None]:
#scatter of lat against lng: marker size follows population/100,
#marker colour follows house value
fig, ax=plt.subplots(figsize=(15,15))
ca_2012.plot.scatter(x='Lng', y='Lat', ax=ax, alpha=0.4,
                     s=ca_2012['Population']/100, label='Population',
                     c='House Value', cmap=plt.get_cmap('rainbow'),
                     colorbar=True)

plt.show()

In [None]:
#scatter of lat against lng: marker size follows population/100,
#marker colour follows household income
fig, ax=plt.subplots(figsize=(15,15))
ca_2012.plot.scatter(x='Lng', y='Lat', ax=ax, alpha=0.3,
                     s=ca_2012['Population']/100, label='Population',
                     c='Household Income', cmap=plt.get_cmap('hsv'),
                     colorbar=True)

plt.show()

## Looking for 2014 US census data

In [None]:
# 2014 census data for all US zipcodes
#reading 2014 combined census data
path_comb_2014=os.path.join('output_census', 'census_comb_2014.csv')
usa_2014=pd.read_csv(path_comb_2014)

In [None]:
#looking for usa_2014 dataframe
usa_2014.head()

In [None]:
#columns of 2014  dataframe
usa_2014.columns

In [None]:
#shape of the dataframe
usa_2014.shape

In [None]:
#keeping only the rows with a house value greater than 0
#.copy() makes the result an independent dataframe, so the in-place dropna
#in a later cell does not raise SettingWithCopyWarning
usa_2014=usa_2014[usa_2014['House Value']>0].copy()

In [None]:
#looking for null value
usa_2014.isna().sum()

In [None]:
#removing the rows with a missing value in lat, lng, public transport rate,
#personal transport rate, housing units or county (usa_2014 is modified in place)
usa_2014.dropna(subset=['Lat', 'Lng', 
                        'Public Transport Rate',
                        'Personal Transport Rate',
                        'Housing_units','County'], inplace=True)

In [None]:
#shape of the clean dataframe
usa_2014.shape

In [None]:
#info of the dataframe
usa_2014.info()

In [None]:
#describing the dataframe
usa_2014.describe()

## Looking for 2014 California census data

In [None]:
#reading 2014 combined california census data
path_ca_2014=os.path.join('output_census', 'census_ca_2014.csv')
ca_2014=pd.read_csv(path_ca_2014)

In [None]:
#looking for ca_2014 dataframe
ca_2014.head()

In [None]:
#columns of 2014  dataframe
ca_2014.columns

In [None]:
#shape of the dataframe
ca_2014.shape

In [None]:
#removing the rows with house value less than 1
ca_2014=ca_2014[ca_2014['House Value']>0]

In [None]:
#looking for null value
ca_2014.isna().sum()

In [None]:
#shape of the clean dataframe
ca_2014.shape

In [None]:
#info of the dataframe
ca_2014.info()

In [None]:
#describing the dataframe
ca_2014.describe()

## Looking for 2015 US census data

In [None]:
# 2015 census data for all US zipcodes
#reading 2015 combined census data
path_comb_2015=os.path.join('output_census', 'census_comb_2015.csv')
usa_2015=pd.read_csv(path_comb_2015)

In [None]:
#looking for usa_2015 dataframe
usa_2015.head()

In [None]:
#columns of 2015  dataframe
usa_2015.columns

In [None]:
#shape of the dataframe
usa_2015.shape

In [None]:
#keeping only the rows with a house value greater than 0
#.copy() makes the result an independent dataframe, so the in-place dropna
#in a later cell does not raise SettingWithCopyWarning
usa_2015=usa_2015[usa_2015['House Value']>0].copy()

In [None]:
#looking for null value
usa_2015.isna().sum()

In [None]:
#removing the rows with a missing value in any of the listed columns:
#location, county, income, housing cost and transport rate
#(usa_2015 is modified in place)
usa_2015.dropna(subset=['Lat', 'Lng', 'Per Capita Income','County',
                        'Monthly Owner Cost','Monthly Rent',
                        'Public Transport Rate','Household Income',
                        'Personal Transport Rate'], 
                         inplace=True)

In [None]:
#shape of the clean dataframe
usa_2015.shape

In [None]:
#info of the dataframe
usa_2015.info()

In [None]:
#describing the dataframe
usa_2015.describe()

## Looking for 2015 California census data

In [None]:
#reading 2015 combined california census data
path_ca_2015=os.path.join('output_census', 'census_ca_2015.csv')
ca_2015=pd.read_csv(path_ca_2015)

In [None]:
#looking for ca_2015 dataframe
ca_2015.head()

In [None]:
#columns of 2015  dataframe
ca_2015.columns

In [None]:
#shape of the dataframe
ca_2015.shape

In [None]:
#keeping only the rows with a house value greater than 0
#.copy() makes the result an independent dataframe, so the in-place dropna
#in a later cell does not raise SettingWithCopyWarning
ca_2015=ca_2015[ca_2015['House Value']>0].copy()

In [None]:
#looking for null value
ca_2015.isna().sum()

In [None]:
#removing the rows with a missing household income, monthly rent, monthly
#owner cost or transport rate (lat/lng are not checked here; ca_2015 is
#modified in place)
ca_2015.dropna(subset=['Household Income', 'Monthly Rent', 
                        'Monthly Owner Cost','Personal Transport Rate',
                      'Public Transport Rate'],inplace=True)

In [None]:
#shape of the clean dataframe
ca_2015.shape

In [None]:
#info of the dataframe
ca_2015.info()

In [None]:
#describing the dataframe
ca_2015.describe()

## Looking for 2017 US census data

In [21]:
# 2017 census data for all US zipcodes
#reading 2017 combined census data
path_comb_2017=os.path.join('output_census', 'census_comb_2017.csv')
usa_2017=pd.read_csv(path_comb_2017)

In [22]:
#looking for usa_2017 dataframe
usa_2017.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,City,County,Lat,Lng,Housing_units,State
0,601,17599.0,38.9,11757.0,7041.0,64.105915,13.943974,82500.0,1981.0,748.0,...,77.765782,0.681857,99.624979,0.0,Adjuntas,Adjuntas Municipio,,,,PR
1,602,39209.0,40.9,16190.0,8978.0,52.100283,6.473004,87300.0,1979.0,846.0,...,66.854549,2.785075,93.692775,0.0,Aguada,Aguada Municipio,18.36,-67.18,18073.0,PR
2,603,50135.0,40.4,16645.0,10897.0,50.216416,7.156677,122300.0,1977.0,867.0,...,71.225691,3.95931,97.46684,1.111,Aguadilla,Aguadilla Municipio,18.45,-67.11,25653.0,PR
3,606,6304.0,42.8,13387.0,5960.0,64.911168,3.236041,92700.0,1979.0,538.0,...,48.302665,2.538071,99.809645,0.0,Maricao,Maricao Municipio,18.2,-66.9,2877.0,PR
4,610,27590.0,41.4,18741.0,9266.0,45.498369,5.342515,90300.0,1979.0,733.0,...,61.754259,3.062704,97.317869,0.0,Anasco,Aasco Municipio,18.28,-67.13,12618.0,PR


In [23]:
#columns of 2017  dataframe
usa_2017.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language',
       'Spanish Language', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 'County',
       'Lat', 'Lng', 'Housing_units', 'State'],
      dtype='object')

In [24]:
#shape of the dataframe
usa_2017.shape

(33120, 30)

In [25]:
#keeping only the rows where house value, household income and monthly
#owner cost are all greater than 0
#.copy() makes the result an independent dataframe, so the in-place dropna
#and drop in later cells do not raise SettingWithCopyWarning
usa_2017=(usa_2017[(usa_2017['House Value']>0)&
                (usa_2017['Household Income']>0)&
                (usa_2017['Monthly Owner Cost']>0)]).copy()

In [27]:
#looking for null value
usa_2017.isna().sum()

Zipcode                         0
Population                      0
Median Age                      0
Household Income                0
Per Capita Income               0
Poverty Rate                    0
Unemployment Rate               0
House Value                     0
House Construction Year         0
Monthly Owner Cost              0
Monthly Rent                    0
Public Transport Rate           0
Personal Transport Rate         0
Commute Time Public         20602
Commute Time Car            20602
High School Rate                0
College Rate                    0
Uneducated Rate                 0
English Language            29110
Spanish Language            29110
White Population Rate           0
Black Population Rate           0
Hispanic Population Rate        0
Asian Population Rate           0
City                            0
County                          1
Lat                            16
Lng                            16
Housing_units                  16
State         

In [28]:
#removing the rows with a missing lat, lng, housing units or county
#(usa_2017 is modified in place)
usa_2017.dropna(subset=['Lat', 'Lng','Housing_units','County'], inplace=True)

In [29]:
#shape of the clean dataframe
usa_2017.shape

(29093, 30)

In [30]:
#info of the dataframe
usa_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29093 entries, 1 to 33119
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Zipcode                   29093 non-null  int64  
 1   Population                29093 non-null  float64
 2   Median Age                29093 non-null  float64
 3   Household Income          29093 non-null  float64
 4   Per Capita Income         29093 non-null  float64
 5   Poverty Rate              29093 non-null  float64
 6   Unemployment Rate         29093 non-null  float64
 7   House Value               29093 non-null  float64
 8   House Construction Year   29093 non-null  float64
 9   Monthly Owner Cost        29093 non-null  float64
 10  Monthly Rent              29093 non-null  float64
 11  Public Transport Rate     29093 non-null  float64
 12  Personal Transport Rate   29093 non-null  float64
 13  Commute Time Public       8502 non-null   float64
 14  Commut

In [31]:
#describing the dataframe
usa_2017.describe()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,Uneducated Rate,English Language,Spanish Language,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,Lat,Lng,Housing_units
count,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0,...,29093.0,0.0,0.0,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0,29093.0
mean,49133.22734,11063.88987,42.144512,57222.865878,29194.840099,13.905609,3.000631,187936.3,-318838.4,1364.261437,...,0.719213,,,83.446066,7.798114,9.510061,2.253109,38.856601,-90.622085,4549.981301
std,27367.633916,15010.032395,7.66211,23747.660188,12563.431705,9.327151,2.045864,167848.9,14621190.0,577.959805,...,0.92401,,,20.344314,15.635153,16.335293,5.524311,5.257513,14.798852,5755.615274
min,602.0,20.0,15.4,5902.0,3534.0,0.0,0.0,13900.0,-666666700.0,331.0,...,0.0,,,0.0,0.0,0.0,0.0,17.96,-171.69,16.0
25%,27013.0,1149.0,37.1,41932.0,21830.0,7.172161,1.764706,93300.0,1963.0,987.0,...,0.096012,,,77.47858,0.0,1.025783,0.0,35.4,-96.71,585.0
50%,49412.0,3871.0,41.8,52188.0,26535.0,11.917423,2.713026,139000.0,1975.0,1188.0,...,0.472888,,,91.905444,1.198963,3.387892,0.44069,39.67,-88.0,1797.0
75%,71044.0,16038.0,46.5,66181.0,32940.0,18.479779,3.837719,216400.0,1984.0,1543.0,...,0.97116,,,97.059294,6.948303,9.551657,1.916236,42.2,-80.14,6748.0
max,99929.0,119204.0,83.1,250001.0,298129.0,81.612713,43.396226,2000001.0,2012.0,4001.0,...,25.46729,,,100.0,100.0,100.0,74.248386,71.0,-65.28,47617.0


In [32]:
usa_2017.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language',
       'Spanish Language', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 'County',
       'Lat', 'Lng', 'Housing_units', 'State'],
      dtype='object')

In [33]:
#dropping the commute time and language columns, which the null count above
#shows are missing for most rows
#reassigning with errors='ignore' (instead of inplace=True) lets this cell be
#re-run after the columns are already gone without raising KeyError
usa_2017=usa_2017.drop(columns=['Commute Time Car','Commute Time Public',
                                'English Language','Spanish Language'],
                       errors='ignore')

In [34]:
usa_2017.shape

(29093, 26)

In [35]:
#selecting only required columns
#rows with a non-positive 'House Construction Year' are left out: describe()
#above shows a minimum of about -666,666,700 in that column, a placeholder
#rather than a real year, which would otherwise be written to the ml csv
#.copy() keeps usa_2017_redu independent of usa_2017
usa_2017_redu=usa_2017.loc[usa_2017['House Construction Year']>0,
       ['Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
        'High School Rate',
       'College Rate', 'Uneducated Rate', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 
       'Lat', 'Lng', 'Housing_units']].copy()

In [36]:
#saving csv file for machine learning
usa_2017_redu.to_csv('output_census/usa_2017_ml.csv', index=False)

## Looking for 2017 California census data

In [None]:
#reading 2017 combined california census data
path_ca_2017=os.path.join('output_census', 'census_ca_2017.csv')
ca_2017=pd.read_csv(path_ca_2017)

In [None]:
#looking for ca_2017 dataframe
ca_2017.head()

In [None]:
#columns of 2017  dataframe
ca_2017.columns

In [None]:
#shape of the dataframe
ca_2017.shape

In [None]:
#removing the rows with house value less than 1
ca_2017=ca_2017[ca_2017['House Value']>0]

In [None]:
#looking for null value
ca_2017.isna().sum()

In [None]:
#TODO(review): no rows are dropped for missing values in ca_2017 -- this cell has no code, unlike the matching cells for ca_2012 and ca_2015


In [None]:
#shape of the clean dataframe
ca_2017.shape

In [None]:
#info of the dataframe
ca_2017.info()

In [None]:
#describing the dataframe
ca_2017.describe()

## Looking for 2019 US census data

In [4]:
# 2019 census data for all US zipcodes
#reading 2019 combined census data
path_comb_2019=os.path.join('output_census', 'census_comb_2019.csv')
usa_2019=pd.read_csv(path_comb_2019)

In [5]:
#looking for usa_2019 dataframe
usa_2019.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,City,County,Lat,Lng,Housing_units,State
0,601,17113.0,41.9,14361.0,7493.0,61.660726,11.500029,83900.0,1981,771.0,...,73.721732,1.063519,99.643546,0.011687,Adjuntas,Adjuntas Municipio,,,,PR
1,602,37751.0,42.9,16807.0,9694.0,49.410612,3.76149,85300.0,1978,877.0,...,85.240126,2.405234,92.469074,0.0,Aguada,Aguada Municipio,18.36,-67.18,18073.0,PR
2,603,47081.0,42.1,16049.0,11259.0,50.319662,6.790425,118400.0,1980,832.0,...,79.350481,3.215735,97.368365,0.229392,Aguadilla,Aguadilla Municipio,18.45,-67.11,25653.0,PR
3,606,6392.0,44.3,12119.0,6093.0,65.472466,2.64393,80800.0,1978,526.0,...,45.525657,1.439299,99.155194,0.0,Maricao,Maricao Municipio,18.2,-66.9,2877.0,PR
4,610,26686.0,42.7,19898.0,10572.0,45.731844,3.855954,87600.0,1978,751.0,...,82.597617,1.727498,95.061081,0.0,Anasco,Aasco Municipio,18.28,-67.13,12618.0,PR


In [6]:
#columns of 2019  dataframe
usa_2019.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language',
       'Spanish Language', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 'County',
       'Lat', 'Lng', 'Housing_units', 'State'],
      dtype='object')

In [7]:
#shape of the dataframe
usa_2019.shape

(33120, 30)

In [8]:
#keeping only the rows where house value, household income and monthly
#owner cost are all greater than 0
#.copy() makes the result an independent dataframe, so the in-place dropna
#and drop in later cells do not raise SettingWithCopyWarning
usa_2019=(usa_2019[(usa_2019['House Value']>0)&
                (usa_2019['Household Income']>0)&
                (usa_2019['Monthly Owner Cost']>0)]).copy()

In [9]:
#looking for null value
usa_2019.isna().sum()

Zipcode                         0
Population                      0
Median Age                      0
Household Income                0
Per Capita Income               0
Poverty Rate                    0
Unemployment Rate               0
House Value                     0
House Construction Year         0
Monthly Owner Cost              0
Monthly Rent                    0
Public Transport Rate           1
Personal Transport Rate         1
Commute Time Public         20708
Commute Time Car            20708
High School Rate                0
College Rate                    0
Uneducated Rate                 0
English Language            28941
Spanish Language            28941
White Population Rate           0
Black Population Rate           0
Hispanic Population Rate        0
Asian Population Rate           0
City                            0
County                          1
Lat                            16
Lng                            16
Housing_units                  16
State         

In [10]:
#removing the rows with a missing lat, lng, housing units, county or
#public/personal transport rate (usa_2019 is modified in place)
usa_2019.dropna(subset=['Lat', 'Lng','Housing_units','County', 
                        'Public Transport Rate','Personal Transport Rate'], 
                         inplace=True)

In [11]:
#shape of the clean dataframe
usa_2019.shape

(28923, 30)

In [12]:
#info of the dataframe
usa_2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28923 entries, 1 to 33119
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Zipcode                   28923 non-null  int64  
 1   Population                28923 non-null  float64
 2   Median Age                28923 non-null  float64
 3   Household Income          28923 non-null  float64
 4   Per Capita Income         28923 non-null  float64
 5   Poverty Rate              28923 non-null  float64
 6   Unemployment Rate         28923 non-null  float64
 7   House Value               28923 non-null  float64
 8   House Construction Year   28923 non-null  int64  
 9   Monthly Owner Cost        28923 non-null  float64
 10  Monthly Rent              28923 non-null  float64
 11  Public Transport Rate     28923 non-null  float64
 12  Personal Transport Rate   28923 non-null  float64
 13  Commute Time Public       8228 non-null   float64
 14  Commut

In [13]:
#describing the dataframe
usa_2019.describe()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,Uneducated Rate,English Language,Spanish Language,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate,Lat,Lng,Housing_units
count,28923.0,28923.0,28923.0,28923.0,28923.0,28923.0,28923.0,28923.0,28923.0,28923.0,...,28923.0,0.0,0.0,28923.0,28923.0,28923.0,28923.0,28923.0,28923.0,28923.0
mean,49121.160184,11250.464336,42.459513,61886.646302,31680.641116,13.042679,2.457554,206118.5,-159467.7,1428.424022,...,0.721531,,,83.124381,7.852341,9.805983,2.343961,38.866237,-90.628141,4575.325554
std,27357.506521,15247.867882,7.784226,25958.786527,13621.93222,8.928869,1.83056,187346.2,10370320.0,612.976365,...,0.943708,,,20.379255,15.557849,16.466417,5.679577,5.264032,14.817801,5762.985962
min,602.0,24.0,14.5,6493.0,3054.0,0.0,0.0,9999.0,-666666700.0,99.0,...,0.0,,,0.0,0.0,0.0,0.0,17.96,-173.0,0.0
25%,27029.0,1154.0,37.3,45470.5,23686.5,6.620662,1.354366,101600.0,1964.0,1030.0,...,0.097671,,,76.915201,0.0,1.114428,0.0,35.4,-96.74,596.0
50%,49339.0,3961.0,41.9,56268.0,28748.0,11.154656,2.187713,151700.0,1976.0,1235.0,...,0.463939,,,91.508983,1.28853,3.629829,0.46523,39.68,-88.0,1824.0
75%,71017.0,16251.0,46.9,71543.0,35665.5,17.339537,3.161629,238000.0,1984.0,1621.0,...,0.976494,,,96.848344,7.140009,10.089356,2.027271,42.2,-80.13,6797.5
max,99929.0,128294.0,91.5,250001.0,187336.0,85.142857,30.11583,2000001.0,2014.0,4001.0,...,26.595745,,,100.0,100.0,100.0,75.288276,71.0,-65.28,47617.0


In [14]:
usa_2019.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
       'Commute Time Public', 'Commute Time Car', 'High School Rate',
       'College Rate', 'Uneducated Rate', 'English Language',
       'Spanish Language', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 'County',
       'Lat', 'Lng', 'Housing_units', 'State'],
      dtype='object')

In [16]:
#dropping the two commute time columns, which the null count above shows are
#missing for most rows
#reassigning with errors='ignore' (instead of inplace=True) lets this cell be
#re-run after the columns are already gone without raising KeyError
usa_2019=usa_2019.drop(columns=['Commute Time Car','Commute Time Public'],
                       errors='ignore')

In [17]:
usa_2019.shape

(28923, 28)

In [18]:
#selecting only required columns
#rows with a non-positive 'House Construction Year' are left out: describe()
#above shows a minimum of about -666,666,700 in that column, a placeholder
#rather than a real year, which would otherwise be written to the ml csv
#.copy() keeps usa_2019_redu independent of usa_2019
usa_2019_redu=usa_2019.loc[usa_2019['House Construction Year']>0,
       ['Population', 'Median Age', 'Household Income',
       'Per Capita Income', 'Poverty Rate', 'Unemployment Rate', 'House Value',
       'House Construction Year', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate',
        'High School Rate',
       'College Rate', 'Uneducated Rate', 'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate', 'City', 
       'Lat', 'Lng', 'Housing_units']].copy()

In [19]:
#saving csv file for machine learning
usa_2019_redu.to_csv('output_census/usa_2019_ml.csv', index=False)

## Looking for 2019 California census data

In [None]:
#reading 2019 combined california census data
path_ca_2019=os.path.join('output_census', 'census_ca_2019.csv')
ca_2019=pd.read_csv(path_ca_2019)

In [None]:
#looking for ca_2019 dataframe
ca_2019.head()

In [None]:
#columns of 2019  dataframe
ca_2019.columns

In [None]:
#shape of the dataframe
ca_2019.shape

In [None]:
#keeping only the rows where house value, household income and monthly
#owner cost are all above 0
ca_2019=ca_2019.loc[lambda df: df['House Value'].gt(0)
                    &df['Household Income'].gt(0)
                    &df['Monthly Owner Cost'].gt(0)]

In [None]:
#looking for null value
ca_2019.isna().sum()

In [None]:
#TODO(review): no rows are dropped for missing values in ca_2019 -- this cell has no code, unlike the matching cells for ca_2012 and ca_2015


In [None]:
#shape of the clean dataframe
ca_2019.shape

In [None]:
#info of the dataframe
ca_2019.info()

In [None]:
#describing the dataframe
ca_2019.describe()

In [None]:
#selecting only important columns
ca_19=ca_2019[['City','Population', 'House Value','Household Income',
       'Poverty Rate', 'Unemployment Rate','Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate','College Rate','White Population Rate',
       'Uneducated Rate' ]]

In [None]:
#only selecting rows with +ve owner cost and rent
ca_19=(ca_19[(ca_19['Monthly Owner Cost']>0)&
             (ca_19['Monthly Rent']>0)])

In [None]:
#saving csv file for machine learning
ca_19.to_csv('output_census/ca_2019_ml.csv', index=False)

In [None]:
#rounding the value columns to 2 decimal places
#round() keeps the columns numeric, which the groupby means and corr() in
#later cells need; formatting with '{:.2f}'.format would turn them into strings
#'Commute Time Public' and 'Commute Time Car' are not listed because they
#were not selected into ca_19, so indexing them would raise KeyError
ca_19=ca_19.round(dict.fromkeys(['House Value','Household Income',
                                 'Poverty Rate','Unemployment Rate',
                                 'Monthly Owner Cost','Monthly Rent',
                                 'Public Transport Rate','Uneducated Rate'],2))

In [None]:
#one row per city: population is summed, every other column is averaged
ca_19_city=(ca_19
            .groupby('City')
            .agg({'Population':'sum',
                  **dict.fromkeys(['House Value','Household Income',
                                   'Poverty Rate','Unemployment Rate',
                                   'Monthly Owner Cost','Monthly Rent',
                                   'Public Transport Rate',
                                   'Personal Transport Rate',
                                   'College Rate',
                                   'White Population Rate',
                                   'Uneducated Rate'],'mean')})
            .reset_index())

In [None]:
#sorting the cities in descending order by house value; the remaining
#columns in the list only break ties, in the order given
ca_19_city=ca_19_city.sort_values(by=['House Value',
                                'Monthly Owner Cost','Household Income',
                                'Poverty Rate','Unemployment Rate',
                                'Public Transport Rate','Monthly Rent', 
                                'Uneducated Rate' ], ascending=False)

In [None]:
ca_19_city.shape

In [None]:
#printing the new sorted datasets
ca_19_city.head()

In [None]:
#city with higher house value
ca_19_city.nlargest(10, 'House Value')

In [None]:
#City with smallest poverty rate
ca_19_city.nsmallest(10, 'Poverty Rate')

In [None]:
#city with smallest unemployment rate
ca_19_city.nsmallest(10, 'Unemployment Rate')

In [None]:
#cities with the smallest mean commute time by car
#'Commute Time Car' is not one of the columns of ca_19_city, so it is
#averaged per city straight from ca_2019 (mean() skips missing values)
#NOTE(review): assumes ca_2019 has a 'Commute Time Car' column like the
#usa_2019 file -- confirm; use nlargest instead if the longest commutes are wanted
ca_2019.groupby('City')['Commute Time Car'].mean().nsmallest(10).reset_index()

In [None]:
#city with smallest monthly owner cost
ca_19_city.nsmallest(10, 'Monthly Owner Cost')

In [None]:
#city with largest household income
ca_19_city.nlargest(10, 'Household Income')

## Ploting 2019 California Data

In [None]:
#plotting all correlation using seaborn heatmap
#only the numeric columns are correlated: 'City' is text, and DataFrame.corr()
#raises on non-numeric columns in pandas 2.0 and later
#(seaborn is already imported as sns in the first cell)
corr_19=ca_19.select_dtypes(include='number').corr()
fig, ax=plt.subplots(figsize=(15,15))
sns.heatmap(corr_19,vmin=-1, vmax=1, ax=ax, cmap='BrBG')#annot=True,
ax.set_title('Correlation of 2019 California census columns')
plt.show()

In [None]:
#correlation matrix of the numeric columns, shown as a table shaded with the
#coolwarm colormap ('City' is text and is excluded so corr() does not raise
#on pandas 2.0 and later)
corr_19=ca_19.select_dtypes(include='number').corr()
corr_19.style.background_gradient(cmap='coolwarm')

In [None]:
#flattening the absolute correlations into one series indexed by column pair
c1 = corr_19.abs().unstack()
#showing positions 12 to 39 of the pairs ranked from strongest to weakest
#presumably the first 12 entries are each column's correlation with itself
#(always 1.0) -- confirm that 12 matches the number of numeric columns
#the matrix is symmetric, so every pair appears twice: (a, b) and (b, a)
c1.sort_values(ascending = False)[12:40]

In [None]:
#scatter of lat against lng: marker size follows population/100,
#marker colour follows house value
fig, ax=plt.subplots(figsize=(15,15))
ca_2019.plot.scatter(x='Lng', y='Lat', ax=ax, alpha=0.2,
                     s=ca_2019['Population']/100, label='Population',
                     c='House Value', cmap=plt.get_cmap('jet'),
                     colorbar=True)

plt.show()