In [1]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib import rcParams
import scipy.stats as sts
import os
from collections import Counter
import requests
import json

# API Keys
from api_keys import gkey

## Looking for the house price data in us cities

In [2]:
#importing housing sale data (downloaded from kaggle)
price_path=os.path.join('Resources', 'Sale_Prices_City.csv')
housing_price=pd.read_csv(price_path)

In [3]:
#looking for housing_price data
housing_price.head()

Unnamed: 0.1,Unnamed: 0,RegionID,RegionName,StateName,SizeRank,2008-03,2008-04,2008-05,2008-06,2008-07,...,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01,2020-02,2020-03
0,0,6181,New York,New York,1,,,,,,...,563200.0,570500.0,572800.0,569900.0,560800.0,571500.0,575100.0,571700.0,568300.0,573600.0
1,1,12447,Los Angeles,California,2,507600.0,489600.0,463000.0,453100.0,438100.0,...,706800.0,711800.0,717300.0,714100.0,711900.0,718400.0,727100.0,738200.0,760200.0,
2,2,39051,Houston,Texas,3,138400.0,135500.0,132200.0,131000.0,133400.0,...,209700.0,207400.0,207600.0,207000.0,211400.0,211500.0,217700.0,219200.0,223800.0,
3,3,17426,Chicago,Illinois,4,325100.0,314800.0,286900.0,274600.0,268500.0,...,271500.0,266500.0,264900.0,265000.0,264100.0,264300.0,270000.0,281400.0,302900.0,309200.0
4,4,6915,San Antonio,Texas,5,130900.0,131300.0,131200.0,131500.0,131600.0,...,197100.0,198700.0,200200.0,200800.0,203400.0,203800.0,205400.0,205400.0,208300.0,


In [4]:
#columns of housing price dataframe
housing_price.columns


Index(['Unnamed: 0', 'RegionID', 'RegionName', 'StateName', 'SizeRank',
       '2008-03', '2008-04', '2008-05', '2008-06', '2008-07',
       ...
       '2019-06', '2019-07', '2019-08', '2019-09', '2019-10', '2019-11',
       '2019-12', '2020-01', '2020-02', '2020-03'],
      dtype='object', length=150)

In [5]:
#housing price is for each month, we need to sum and average to ger price per year
#to get the average housing data 
for i in range(2009, 2020):
    housing_price[f'{i}']=((housing_price[[f'{i}-01',f'{i}-02',f'{i}-03',f'{i}-01',
                                      f'{i}-05',f'{i}-06',f'{i}-07',f'{i}-08',
                                     f'{i}-09',f'{i}-10',f'{i}-11',f'{i}-12']].sum(axis=1))/12)

In [7]:
#only selecting the columns that are needed for our project
housing=(housing_price[['RegionName', 'StateName', 'SizeRank','2015',
                       '2016','2017','2018','2019']])

In [8]:
housing.head()

Unnamed: 0,RegionName,StateName,SizeRank,2015,2016,2017,2018,2019
0,New York,New York,1,517833.333333,534275.0,542425.0,555416.666667,565016.666667
1,Los Angeles,California,2,491658.333333,530025.0,567958.333333,637291.666667,704691.666667
2,Houston,Texas,3,171058.333333,181208.333333,190675.0,196850.0,208958.333333
3,Chicago,Illinois,4,236341.666667,218975.0,228258.333333,245083.333333,265641.666667
4,San Antonio,Texas,5,164600.0,171308.333333,177666.666667,188108.333333,197250.0


In [9]:
#shape of housing dataset
print(f'No of rowsare {housing.shape[0]} and number of columns are {housing.shape[1]}')

No of rowsare 3728 and number of columns are 8


In [10]:
#info of the dataframe
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3728 entries, 0 to 3727
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   RegionName  3728 non-null   object 
 1   StateName   3728 non-null   object 
 2   SizeRank    3728 non-null   int64  
 3   2015        3728 non-null   float64
 4   2016        3728 non-null   float64
 5   2017        3728 non-null   float64
 6   2018        3728 non-null   float64
 7   2019        3728 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 233.1+ KB


In [11]:
#total number of unique city
housing.value_counts('RegionName')

RegionName
Springfield       9
Franklin          7
Monroe            7
Farmington        6
Salem             6
                 ..
Hanford           1
Hannibal          1
Happy Valley      1
Harbor Springs    1
Zionsville        1
Length: 3158, dtype: int64

In [12]:
#totoal unique city
housing['RegionName'].nunique()

3158

In [13]:
#there are around 600 city with duplicate value
housing=housing.drop_duplicates(subset=['RegionName'])
#housing dataframe shape
housing.shape

(3158, 8)

In [15]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3158 entries, 0 to 3727
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   RegionName  3158 non-null   object 
 1   StateName   3158 non-null   object 
 2   SizeRank    3158 non-null   int64  
 3   2015        3158 non-null   float64
 4   2016        3158 non-null   float64
 5   2017        3158 non-null   float64
 6   2018        3158 non-null   float64
 7   2019        3158 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 222.0+ KB


In [16]:
#saving the clean housing data to output folder
housing.to_csv('output_data/housing_price.csv', index=False)

## Looking for the crime data in us cities

In [17]:
#importing crime data (downloaded from kaggle)
crime_path=os.path.join('Resources', 'crime_data_w_population_and_crime_rate.csv')
crime_rate=pd.read_csv(crime_path)

In [18]:
#looking for crime data
crime_rate.head()

Unnamed: 0,county_name,crime_rate_per_100000,index,EDITION,PART,IDNO,CPOPARST,CPOPCRIM,AG_ARRST,AG_OFF,...,RAPE,ROBBERY,AGASSLT,BURGLRY,LARCENY,MVTHEFT,ARSON,population,FIPS_ST,FIPS_CTY
0,"St. Louis city, MO",1791.995377,1,1,4,1612,318667,318667,15,15,...,200,1778,3609,4995,13791,3543,464,318416,29,510
1,"Crittenden County, AR",1754.914968,2,1,4,130,50717,50717,4,4,...,38,165,662,1482,1753,189,28,49746,5,35
2,"Alexander County, IL",1664.700485,3,1,4,604,8040,8040,2,2,...,2,5,119,82,184,12,2,7629,17,3
3,"Kenedy County, TX",1456.31068,4,1,4,2681,444,444,1,1,...,3,1,2,5,4,4,0,412,48,261
4,"De Soto Parish, LA",1447.40243,5,1,4,1137,26971,26971,3,3,...,4,17,368,149,494,60,0,27083,22,31


In [19]:
crime_rate.columns

Index(['county_name', 'crime_rate_per_100000', 'index', 'EDITION', 'PART',
       'IDNO', 'CPOPARST', 'CPOPCRIM', 'AG_ARRST', 'AG_OFF', 'COVIND', 'INDEX',
       'MODINDX', 'MURDER', 'RAPE', 'ROBBERY', 'AGASSLT', 'BURGLRY', 'LARCENY',
       'MVTHEFT', 'ARSON', 'population', 'FIPS_ST', 'FIPS_CTY'],
      dtype='object')

In [20]:
#shape of crime dataset
print(f'No of rowsare {crime_rate.shape[0]} and number of columns are {crime_rate.shape[1]}')

No of rowsare 3136 and number of columns are 24


In [21]:
#looking for unique county and city name
crime_rate['county_name'].nunique()

3136

In [22]:
#looking for only certains columns
crime=(crime_rate[['county_name','population', 'crime_rate_per_100000',
                   'MURDER', 'RAPE', 'ROBBERY', 'AGASSLT', 'BURGLRY', 'LARCENY',
                   'MVTHEFT', 'ARSON']])

In [23]:
crime.head()

Unnamed: 0,county_name,population,crime_rate_per_100000,MURDER,RAPE,ROBBERY,AGASSLT,BURGLRY,LARCENY,MVTHEFT,ARSON
0,"St. Louis city, MO",318416,1791.995377,119,200,1778,3609,4995,13791,3543,464
1,"Crittenden County, AR",49746,1754.914968,8,38,165,662,1482,1753,189,28
2,"Alexander County, IL",7629,1664.700485,1,2,5,119,82,184,12,2
3,"Kenedy County, TX",412,1456.31068,0,3,1,2,5,4,4,0
4,"De Soto Parish, LA",27083,1447.40243,3,4,17,368,149,494,60,0


In [24]:
#saving the crime_rate data to output folder
crime.to_csv('output_data/crime_rate.csv', index=False)

## Looking for school datasets in US cities

In [25]:
#importing private school data (downloaded from kaggle)
school_path=os.path.join('Resources', 'Private_Schools.csv')
private_school=pd.read_csv(school_path)

In [26]:
#looking for private school datasets
private_school.head()

Unnamed: 0,X,Y,FID,OBJECTID,NCESID,NAME,ADDRESS,CITY,STATE,ZIP,...,SOURCEDATE,VAL_METHOD,VAL_DATE,WEBSITE,LEVEL_,ENROLLMENT,START_GRAD,END_GRADE,FT_TEACHER,SHELTER_ID
0,-8136078.0,5055506.0,1,1201,A0507274,A CHILD'S GARDEN,20 IVY BROOK RD,SHELTON,CT,6484,...,2010/01/29 00:00:00,IMAGERY,2010/02/25 00:00:00,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,1,6,2,3,2,NOT AVAILABLE
1,-8147300.0,5037067.0,2,1202,A0500844,BRIDGEPORT INTERNATIONAL ACADEMY,285 LAFAYETTE ST STE 200,BRIDGEPORT,CT,6604,...,2010/01/29 00:00:00,IMAGERY/OTHER,2010/02/23 00:00:00,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,2,60,14,17,8,NOT AVAILABLE
2,-8197042.0,5018050.0,3,1203,A0108054,CARMEL ACADMEY,270 LAKE AVE,GREENWICH,CT,6830,...,2010/01/29 00:00:00,IMAGERY,2010/03/05 00:00:00,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,1,225,3,13,53,NOT AVAILABLE
3,-8171504.0,5077291.0,4,1204,A0507283,COUNTRY KIDS CLUB,94 OLD STATE RD,BROOKFIELD,CT,6804,...,2010/01/29 00:00:00,GEOCODE,2010/02/25 00:00:00,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,1,10,2,3,1,NOT AVAILABLE
4,-8197596.0,5017131.0,5,1205,01612613,EAGLE HILL SCHOOL,45 GLENVILLE RD,GREENWICH,CT,6831,...,2010/01/29 00:00:00,IMAGERY/OTHER,2010/02/26 00:00:00,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,3,253,4,15,65,NOT AVAILABLE


In [27]:
private_school.columns

Index(['X', 'Y', 'FID', 'OBJECTID', 'NCESID', 'NAME', 'ADDRESS', 'CITY',
       'STATE', 'ZIP', 'ZIP4', 'TELEPHONE', 'TYPE', 'STATUS', 'POPULATION',
       'COUNTY', 'COUNTYFIPS', 'COUNTRY', 'LATITUDE', 'LONGITUDE',
       'NAICS_CODE', 'NAICS_DESC', 'SOURCE', 'SOURCEDATE', 'VAL_METHOD',
       'VAL_DATE', 'WEBSITE', 'LEVEL_', 'ENROLLMENT', 'START_GRAD',
       'END_GRADE', 'FT_TEACHER', 'SHELTER_ID'],
      dtype='object')

In [28]:
private_school['COUNTRY'].value_counts()

USA    31064
Name: COUNTRY, dtype: int64

In [29]:
#cleaning datasets 
school=private_school[['NAME', 'ADDRESS', 'CITY',
                    'STATE', 'ZIP',  'TYPE',  'POPULATION',
                    'COUNTY',  'COUNTRY', 'LATITUDE', 'LONGITUDE', 'LEVEL_']]

In [30]:
#shape of school dataset
print(f'No of rowsare {school.shape[0]} and number of columns are {school.shape[1]}')

No of rowsare 31064 and number of columns are 12


In [31]:
school.head()

Unnamed: 0,NAME,ADDRESS,CITY,STATE,ZIP,TYPE,POPULATION,COUNTY,COUNTRY,LATITUDE,LONGITUDE,LEVEL_
0,A CHILD'S GARDEN,20 IVY BROOK RD,SHELTON,CT,6484,7,8,FAIRFIELD,USA,41.29199,-73.087632,1
1,BRIDGEPORT INTERNATIONAL ACADEMY,285 LAFAYETTE ST STE 200,BRIDGEPORT,CT,6604,3,68,FAIRFIELD,USA,41.167416,-73.188438,2
2,CARMEL ACADMEY,270 LAKE AVE,GREENWICH,CT,6830,3,278,FAIRFIELD,USA,41.03869,-73.635283,1
3,COUNTRY KIDS CLUB,94 OLD STATE RD,BROOKFIELD,CT,6804,7,11,FAIRFIELD,USA,41.438861,-73.405868,1
4,EAGLE HILL SCHOOL,45 GLENVILLE RD,GREENWICH,CT,6831,4,318,FAIRFIELD,USA,41.032465,-73.640254,3


In [32]:
school.columns

Index(['NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP', 'TYPE', 'POPULATION',
       'COUNTY', 'COUNTRY', 'LATITUDE', 'LONGITUDE', 'LEVEL_'],
      dtype='object')

In [33]:
school_clean=school.groupby('CITY').agg({'NAME':'count',
                           'STATE': lambda x : x.unique(),                    
                           'COUNTY': lambda x : x.unique(),
                           'ZIP': lambda x : x.unique()}).reset_index()

In [34]:
#number of unique city
school_clean['CITY'].nunique()

6163

In [35]:
#saving the crime_rate data to output folder
school_clean.to_csv('output_data/school.csv', index=False)

## Looking with airports datasets in US cities

In [36]:
#importing airports data (downloaded from kaggle)
airport_path=os.path.join('Resources', 'airports.csv')
airport=pd.read_csv(airport_path)

In [37]:
airport.columns

Index(['IATA', 'AIRPORT', 'CITY', 'STATE', 'COUNTRY', 'LATITUDE', 'LONGITUDE'], dtype='object')

In [38]:
airport['CITY'].value_counts()

Portland        2
Charleston      2
Jacksonville    2
New York        2
Columbia        2
               ..
Bozeman         1
Burlington      1
Baton Rouge     1
Butte           1
Yuma            1
Name: CITY, Length: 324, dtype: int64

In [39]:
airport_clean=airport.groupby(['CITY', 'STATE']).agg({'AIRPORT':'count'}).reset_index()

airport_clean

Unnamed: 0,CITY,STATE,AIRPORT
0,Aberdeen,SD,1
1,Abilene,TX,1
2,Adak,AK,1
3,Agana,GU,1
4,Aguadilla,PR,1
...,...,...,...
329,Windsor Locks,CT,1
330,Worcester,MA,1
331,Wrangell,AK,1
332,Yakutat,AK,1


In [40]:
#saving the airport data to output folder
airport_clean.to_csv('output_data/airports.csv', index=False)

## Airports data from whole world

In [41]:
#importing airports and other data for world(downloaded from kaggle)
airport_path_exd=os.path.join('Resources', 'airports-extended.csv')
airports_exd=pd.read_csv(airport_path_exd)

In [47]:
airports_exd.head()

Unnamed: 0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.081689834590001,145.391998291,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
0,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
1,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
2,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
3,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports
4,6,Wewak International Airport,Wewak,Papua New Guinea,WWK,AYWK,-3.58383,143.669006,19,10,U,Pacific/Port_Moresby,airport,OurAirports


In [42]:
#only geeting the data for usa
airports_usa=airports_exd[airports_exd['Papua New Guinea']=='United States']

#only selecting certain columns
airports_usa=airports_usa[['Goroka Airport', 'Goroka', 'Papua New Guinea', 
       '-6.081689834590001', '145.391998291', 
       'Pacific/Port_Moresby', 'airport',]]

In [43]:
#changing the name of the columns as header is missing
airports_usa.rename(columns={'Goroka Airport': "Name", 
                             'Goroka': 'City',
                             'Papua New Guinea' :'Country',
                            '-6.081689834590001' : 'Lat',
                             '145.391998291' : 'Lng',
                               'Pacific/Port_Moresby': "Others",
                             'airport':"type"}, inplace=True)

In [44]:
#only airport types
airports=airports_usa[airports_usa['type']=='airport']
airports.shape

(1674, 7)

In [45]:
airports['City'].value_counts()

New York         8
Columbus         7
Houston          6
Greenville       5
Jacksonville     5
                ..
Lynchburg        1
Lewistown        1
Klamath Falls    1
Lebanon          1
Gerlach          1
Name: City, Length: 1399, dtype: int64

In [46]:
airports_clean=airports.groupby('City').agg({'Name':'count'}).reset_index()
airports_clean.columns=['City', 'Count']
airports_clean

Unnamed: 0,City,Count
0,Aberdeen,2
1,Abilene,2
2,Ada,1
3,Adak Island,1
4,Addison,1
...,...,...
1394,Zachar Bay,1
1395,Zanesville,1
1396,Zelienople,1
1397,Zephyrhills,1


## Looking for the hospital data in US Cities

In [48]:
#importing hospitals data (downloaded from kaggle)
hospital_path=os.path.join('Resources', 'Hospitals.csv')
hospitals_data=pd.read_csv(hospital_path)

In [49]:
hospitals_data.shape

(7570, 34)

In [50]:
hospitals_data.columns

Index(['X', 'Y', 'OBJECTID', 'ID', 'NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP',
       'ZIP4', 'TELEPHONE', 'TYPE', 'STATUS', 'POPULATION', 'COUNTY',
       'COUNTYFIPS', 'COUNTRY', 'LATITUDE', 'LONGITUDE', 'NAICS_CODE',
       'NAICS_DESC', 'SOURCE', 'SOURCEDATE', 'VAL_METHOD', 'VAL_DATE',
       'WEBSITE', 'STATE_ID', 'ALT_NAME', 'ST_FIPS', 'OWNER', 'TTL_STAFF',
       'BEDS', 'TRAUMA', 'HELIPAD'],
      dtype='object')

In [51]:
#selecting only important columns
hospitals=hospitals_data[['NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP',
       'ZIP4', 'TELEPHONE', 'TYPE', 'STATUS', 'POPULATION', 'COUNTY',
       'COUNTYFIPS', 'COUNTRY', 'LATITUDE', 'LONGITUDE']]

In [56]:
hospitals_clean=hospitals.groupby(['CITY','STATE']).agg({'NAME':'count',                  
                                               'COUNTY': lambda x : x.unique()}).reset_index()

hospitals_clean=hospitals_clean.sort_values(by='NAME',ascending=False)
hospitals_clean

Unnamed: 0,CITY,STATE,NAME,COUNTY
1775,HOUSTON,TX,68,"[HARRIS, MEDINA]"
3410,SAN ANTONIO,TX,45,BEXAR
683,CHICAGO,IL,41,COOK
924,DALLAS,TX,38,DALLAS
3043,PHILADELPHIA,PA,35,"[PHILADELPHIA, PHILADELPHI]"
...,...,...,...,...
1612,HANOVER,PA,1,YORK
1613,HARBOR BEACH,MI,1,HURON
1614,HARBOR CITY,CA,1,LOS ANGELES
1615,HARDEEVILLE,SC,1,JASPER


In [57]:
hospitals_clean.nlargest(10, "NAME")

Unnamed: 0,CITY,STATE,NAME,COUNTY
1775,HOUSTON,TX,68,"[HARRIS, MEDINA]"
3410,SAN ANTONIO,TX,45,BEXAR
683,CHICAGO,IL,41,COOK
924,DALLAS,TX,38,DALLAS
3043,PHILADELPHIA,PA,35,"[PHILADELPHIA, PHILADELPHI]"
2227,LOS ANGELES,CA,34,LOS ANGELES
2084,LAS VEGAS,NV,29,CLARK
3049,PHOENIX,AZ,28,MARICOPA
183,AUSTIN,TX,28,"[TRAVIS, WILLIAMSON]"
2723,NEW YORK,NY,27,NEW YORK


In [53]:
#saving the hospitals data to output folder
hospitals_clean.to_csv('output_data/hospitals.csv', index=False)