## Purpose
- Joining tables for testing segment 1 deliverables
- ML outside of SQL
- Testing SQL loading/pulling data from

In [1]:
# Dependencies
import os
import pandas as pd
import numpy as np

In [2]:
# Load Files
path1 = os.path.join("..","Resources", "yelp_test1.csv")
yelp_test_df=pd.read_csv(path1)

path2 = os.path.join("..","Resources", "zillow_housing_prelim_clean.csv")
zillow_df=pd.read_csv(path2)

In [3]:
yelp_test_df.head()

Unnamed: 0,business_id,city,state,postal_code,stars,review_count,Diversity
0,mpf3x-BjTdTEA3yCZrAYPw,Affton,MO,63123,3.0,15,3
1,MTSW4McQd7CbVtyjqoe9mw,Philadelphia,PA,19107,4.0,80,1
2,CF33F8-E6oudUQ46HnavjQ,Ashland City,TN,37015,2.0,6,1
3,bBDDEgkFA1Otx9Lfe7BZUQ,Nashville,TN,37207,1.5,10,1
4,eEOYSgkmpB90uNA7lDOMRA,Tampa Bay,FL,33602,4.0,10,3


In [4]:
yelp_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34988 entries, 0 to 34987
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   34988 non-null  object 
 1   city          34988 non-null  object 
 2   state         34988 non-null  object 
 3   postal_code   34984 non-null  object 
 4   stars         34988 non-null  float64
 5   review_count  34988 non-null  int64  
 6   Diversity     34988 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 1.9+ MB


In [5]:
zillow_df.head()

Unnamed: 0.1,Unnamed: 0,postal_code,State,City,CountyName,2016-01-31,2017,2018,2019,2020,2021
0,0,10025,NY,New York,New York County,1007253.0,1019755,1046267,1068521,1035675,1114145
1,1,60657,IL,Chicago,Cook County,451789.0,478119,484504,478017,483617,508407
2,2,10023,NY,New York,New York County,1356052.0,1409487,1488460,1400072,1453596,1450186
3,3,77494,TX,Katy,Harris County,340359.0,332746,336492,337284,343088,404243
4,4,60614,IL,Chicago,Cook County,592368.0,621614,628324,620016,620261,648360


In [6]:
zillow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27696 entries, 0 to 27695
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   27696 non-null  int64  
 1   postal_code  27696 non-null  int64  
 2   State        27696 non-null  object 
 3   City         27696 non-null  object 
 4   CountyName   27696 non-null  object 
 5   2016-01-31   27696 non-null  float64
 6   2017         27696 non-null  int64  
 7   2018         27696 non-null  int64  
 8   2019         27696 non-null  int64  
 9   2020         27696 non-null  int64  
 10  2021         27696 non-null  int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 2.3+ MB


In [9]:
# Need to change data type of postal_code in one of the data sets to merge (currently different data types)
zillow_df["postal_code"]=zillow_df["postal_code"].astype(str)

In [10]:
zillow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27696 entries, 0 to 27695
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   27696 non-null  int64  
 1   postal_code  27696 non-null  object 
 2   State        27696 non-null  object 
 3   City         27696 non-null  object 
 4   CountyName   27696 non-null  object 
 5   2016-01-31   27696 non-null  float64
 6   2017         27696 non-null  int64  
 7   2018         27696 non-null  int64  
 8   2019         27696 non-null  int64  
 9   2020         27696 non-null  int64  
 10  2021         27696 non-null  int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 2.3+ MB


In [11]:
# Merge the 2 data sets on postal code
test_merge_df = pd.merge(yelp_test_df, zillow_df,how="inner",left_on="postal_code", right_on="postal_code", right_index=False)

In [12]:
test_merge_df.head()

Unnamed: 0.1,business_id,city,state,postal_code,stars,review_count,Diversity,Unnamed: 0,State,City,CountyName,2016-01-31,2017,2018,2019,2020,2021
0,mpf3x-BjTdTEA3yCZrAYPw,Affton,MO,63123,3.0,15,3,367,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744
1,lB6jIOb1UBATmdfot4KJew,Saint Louis,MO,63123,4.0,126,2,367,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744
2,fBO0Cb-tbx5fvaWsp4sKtw,Saint Louis,MO,63123,3.5,83,1,367,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744
3,cXEx8dhmoFZcsJjXS2lLuQ,Saint Louis,MO,63123,3.5,51,4,367,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744
4,-D_L4GHXJQvy2wv0RkrupA,St. Louis,MO,63123,2.5,21,2,367,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744


In [13]:
# Drop "Unnamed: 0" column (left over index from a previous export??)
test_merge_df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [14]:
test_merge_df.head()

Unnamed: 0,business_id,city,state,postal_code,stars,review_count,Diversity,State,City,CountyName,2016-01-31,2017,2018,2019,2020,2021
0,mpf3x-BjTdTEA3yCZrAYPw,Affton,MO,63123,3.0,15,3,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744
1,lB6jIOb1UBATmdfot4KJew,Saint Louis,MO,63123,4.0,126,2,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744
2,fBO0Cb-tbx5fvaWsp4sKtw,Saint Louis,MO,63123,3.5,83,1,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744
3,cXEx8dhmoFZcsJjXS2lLuQ,Saint Louis,MO,63123,3.5,51,4,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744
4,-D_L4GHXJQvy2wv0RkrupA,St. Louis,MO,63123,2.5,21,2,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744


In [15]:
test_merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30454 entries, 0 to 30453
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   30454 non-null  object 
 1   city          30454 non-null  object 
 2   state         30454 non-null  object 
 3   postal_code   30454 non-null  object 
 4   stars         30454 non-null  float64
 5   review_count  30454 non-null  int64  
 6   Diversity     30454 non-null  int64  
 7   State         30454 non-null  object 
 8   City          30454 non-null  object 
 9   CountyName    30454 non-null  object 
 10  2016-01-31    30454 non-null  float64
 11  2017          30454 non-null  int64  
 12  2018          30454 non-null  int64  
 13  2019          30454 non-null  int64  
 14  2020          30454 non-null  int64  
 15  2021          30454 non-null  int64  
dtypes: float64(2), int64(7), object(7)
memory usage: 3.9+ MB


In [16]:
# Export to csv
test_merge_df.to_csv("../Resources/test_yelp_zillow.csv")