In [3]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib import rcParams
import scipy.stats as sts
import os
from collections import Counter
import requests
import json

# API Keys
#from api_keys import gkey

## Looking for the house price data in us cities

In [4]:
#importing housing sale data (downloaded from kaggle)
price_path=os.path.join('Resources', 'Sale_Prices_City.csv')
housing_price=pd.read_csv(price_path)

In [5]:
#looking for housing_price data
housing_price.head()

Unnamed: 0.1,Unnamed: 0,RegionID,RegionName,StateName,SizeRank,2008-03,2008-04,2008-05,2008-06,2008-07,...,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01,2020-02,2020-03
0,0,6181,New York,New York,1,,,,,,...,563200.0,570500.0,572800.0,569900.0,560800.0,571500.0,575100.0,571700.0,568300.0,573600.0
1,1,12447,Los Angeles,California,2,507600.0,489600.0,463000.0,453100.0,438100.0,...,706800.0,711800.0,717300.0,714100.0,711900.0,718400.0,727100.0,738200.0,760200.0,
2,2,39051,Houston,Texas,3,138400.0,135500.0,132200.0,131000.0,133400.0,...,209700.0,207400.0,207600.0,207000.0,211400.0,211500.0,217700.0,219200.0,223800.0,
3,3,17426,Chicago,Illinois,4,325100.0,314800.0,286900.0,274600.0,268500.0,...,271500.0,266500.0,264900.0,265000.0,264100.0,264300.0,270000.0,281400.0,302900.0,309200.0
4,4,6915,San Antonio,Texas,5,130900.0,131300.0,131200.0,131500.0,131600.0,...,197100.0,198700.0,200200.0,200800.0,203400.0,203800.0,205400.0,205400.0,208300.0,


In [6]:
#columns of housing price dataframe
housing_price.columns


Index(['Unnamed: 0', 'RegionID', 'RegionName', 'StateName', 'SizeRank',
       '2008-03', '2008-04', '2008-05', '2008-06', '2008-07',
       ...
       '2019-06', '2019-07', '2019-08', '2019-09', '2019-10', '2019-11',
       '2019-12', '2020-01', '2020-02', '2020-03'],
      dtype='object', length=150)

In [7]:
#housing price is for each month, we need to sum and average to ger price per year
#to get the average housing data 
for i in range(2009, 2020):
    housing_price[f'{i}']=((housing_price[[f'{i}-01',f'{i}-02',f'{i}-03',f'{i}-01',
                                      f'{i}-05',f'{i}-06',f'{i}-07',f'{i}-08',
                                     f'{i}-09',f'{i}-10',f'{i}-11',f'{i}-12']].sum(axis=1))/12)

In [8]:
#only selecting the columns that are needed for our project
housing=(housing_price[['RegionName', 'StateName', 'SizeRank','2015',
                       '2016','2017','2018','2019']])

In [9]:
housing.head()

Unnamed: 0,RegionName,StateName,SizeRank,2015,2016,2017,2018,2019
0,New York,New York,1,517833.333333,534275.0,542425.0,555416.666667,565016.666667
1,Los Angeles,California,2,491658.333333,530025.0,567958.333333,637291.666667,704691.666667
2,Houston,Texas,3,171058.333333,181208.333333,190675.0,196850.0,208958.333333
3,Chicago,Illinois,4,236341.666667,218975.0,228258.333333,245083.333333,265641.666667
4,San Antonio,Texas,5,164600.0,171308.333333,177666.666667,188108.333333,197250.0


In [10]:
#shape of housing dataset
print(f'No of rows are {housing.shape[0]} and number of columns are {housing.shape[1]}')

No of rows are 3728 and number of columns are 8


In [11]:
#info of the dataframe
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3728 entries, 0 to 3727
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   RegionName  3728 non-null   object 
 1   StateName   3728 non-null   object 
 2   SizeRank    3728 non-null   int64  
 3   2015        3728 non-null   float64
 4   2016        3728 non-null   float64
 5   2017        3728 non-null   float64
 6   2018        3728 non-null   float64
 7   2019        3728 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 233.1+ KB


In [12]:
#total number of unique city
housing.value_counts('RegionName')

RegionName
Springfield    9
Monroe         7
Franklin       7
Washington     6
Auburn         6
              ..
Ortonville     1
Orting         1
Orrville       1
Oroville       1
Aberdeen       1
Length: 3158, dtype: int64

In [13]:
#totoal unique city
housing['RegionName'].nunique()

3158

In [30]:
#there are around 600 city with duplicate value
housing=housing.drop_duplicates(subset=['RegionName'])
#housing dataframe shape
housing.shape

(3158, 8)

In [15]:
#total unique city after droping duplicates
housing['RegionName'].nunique()

3158

In [16]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3158 entries, 0 to 3727
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   RegionName  3158 non-null   object 
 1   StateName   3158 non-null   object 
 2   SizeRank    3158 non-null   int64  
 3   2015        3158 non-null   float64
 4   2016        3158 non-null   float64
 5   2017        3158 non-null   float64
 6   2018        3158 non-null   float64
 7   2019        3158 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 222.0+ KB


In [31]:
#saving the clean housing data to output folder
housing.to_csv('output_data/housing_price.csv', index=False)

## Looking for the crime data in us cities

In [None]:
#importing crime data (downloaded from kaggle)
crime_path=os.path.join('Resources', 'crime_data_w_population_and_crime_rate.csv')
crime_rate=pd.read_csv(crime_path)

In [None]:
#looking for crime data
crime_rate.head()

In [None]:
crime_rate.columns

In [None]:
#shape of crime dataset
print(f'No of rowsare {crime_rate.shape[0]} and number of columns are {crime_rate.shape[1]}')

In [None]:
#looking for unique county and city name
crime_rate['county_name'].nunique()

In [None]:
#looking for only certains columns
crime=(crime_rate[['county_name','population', 'crime_rate_per_100000',
                   'MURDER', 'RAPE', 'ROBBERY', 'AGASSLT', 'BURGLRY', 'LARCENY',
                   'MVTHEFT', 'ARSON']])

In [None]:
crime.head()

In [None]:
#saving the crime_rate data to output folder
crime.to_csv('output_data/crime_rate.csv', index=False)

## Looking for school datasets in US cities

In [None]:
#importing private school data (downloaded from kaggle)
school_path=os.path.join('Resources', 'Private_Schools.csv')
private_school=pd.read_csv(school_path)

In [None]:
#looking for private school datasets
private_school.head()

In [None]:
private_school.columns

In [None]:
private_school['COUNTRY'].value_counts()

In [None]:
#cleaning datasets 
school=private_school[['NAME', 'ADDRESS', 'CITY',
                    'STATE', 'ZIP',  'TYPE',  'POPULATION',
                    'COUNTY',  'COUNTRY', 'LATITUDE', 'LONGITUDE', 'LEVEL_']]

In [None]:
#shape of school dataset
print(f'No of rowsare {school.shape[0]} and number of columns are {school.shape[1]}')

In [None]:
school.head()

In [None]:
school.columns

In [None]:
school_clean=school.groupby('CITY').agg({'NAME':'count',
                           'STATE': lambda x : x.unique(),                    
                           'COUNTY': lambda x : x.unique(),
                           'ZIP': lambda x : x.unique()}).reset_index()

In [None]:
#number of unique city
school_clean['CITY'].nunique()

In [None]:
#saving the crime_rate data to output folder
school_clean.to_csv('output_data/school.csv', index=False)

## Looking with airports datasets in US cities

In [None]:
#importing airports data (downloaded from kaggle)
airport_path=os.path.join('Resources', 'airports.csv')
airport=pd.read_csv(airport_path)

In [None]:
airport.columns

In [None]:
airport['CITY'].value_counts()

In [None]:
airport_clean=airport.groupby(['CITY', 'STATE']).agg({'AIRPORT':'count'}).reset_index()

airport_clean

In [None]:
#saving the airport data to output folder
airport_clean.to_csv('output_data/airports.csv', index=False)

## Airports data from whole world

In [None]:
#importing airports and other data for world(downloaded from kaggle)
airport_path_exd=os.path.join('Resources', 'airports-extended.csv')
airports_exd=pd.read_csv(airport_path_exd)

In [None]:
airports_exd.head()

In [None]:
#only geeting the data for usa
airports_usa=airports_exd[airports_exd['Papua New Guinea']=='United States']

#only selecting certain columns
airports_usa=airports_usa[['Goroka Airport', 'Goroka', 'Papua New Guinea', 
       '-6.081689834590001', '145.391998291', 
       'Pacific/Port_Moresby', 'airport',]]

In [None]:
#changing the name of the columns as header is missing
airports_usa.rename(columns={'Goroka Airport': "Name", 
                             'Goroka': 'City',
                             'Papua New Guinea' :'Country',
                            '-6.081689834590001' : 'Lat',
                             '145.391998291' : 'Lng',
                               'Pacific/Port_Moresby': "Others",
                             'airport':"type"}, inplace=True)

In [None]:
#only airport types
airports=airports_usa[airports_usa['type']=='airport']
airports.shape

In [None]:
airports['City'].value_counts()

In [None]:
airports_clean=airports.groupby('City').agg({'Name':'count'}).reset_index()
airports_clean.columns=['City', 'Count']
airports_clean

## Looking for the hospital data in US Cities

In [None]:
#importing hospitals data (downloaded from kaggle)
hospital_path=os.path.join('Resources', 'Hospitals.csv')
hospitals_data=pd.read_csv(hospital_path)

In [None]:
hospitals_data.shape

In [None]:
hospitals_data.columns

In [None]:
#selecting only important columns
hospitals=hospitals_data[['NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP',
       'ZIP4', 'TELEPHONE', 'TYPE', 'STATUS', 'POPULATION', 'COUNTY',
       'COUNTYFIPS', 'COUNTRY', 'LATITUDE', 'LONGITUDE']]

In [None]:
hospitals_clean=hospitals.groupby(['CITY','STATE']).agg({'NAME':'count',                  
                                               'COUNTY': lambda x : x.unique()}).reset_index()

hospitals_clean