# Project 2 - Team 5 

## Objective

Obtain resources from Multiple Listing Service and State of Texas websites to extract, transform, and load csv files to conduct an informative analysis in the future to help our clients to find the perfect house with the perfect school.

## Type DataBase
•	SQL Postgres

## HAR Data

### Extraction

In [1]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from secrets import username, password
from googlemaps import Client as GoogleMaps
from secrets import API_KEY
from geopy.geocoders import Nominatim

In [2]:
# Store csv into df
csv_file= "../Resources/small_alt_HAR.csv"  #Change to smaller file for latlong coding
raw_har_df= pd.read_csv(csv_file)
raw_har_df

Unnamed: 0,mls,street_number,street_name,unit,city,zip,county,subdivision,home_type,year_built,...,style,list_price,market_area,area,dom,cdom,list_date,school_district,elementary,high_school
0,90227486,2403,Austin Street,,Houston,77004,Harris,Austin Skyline,Single-Family,2015,...,Contemporary/Modern,445000,16,Midtown - Houston,150,150,9/9/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
1,76269817,3123,Austin Street,,Houston,77004,Harris,Montione Vls/Elgin Pt Rep,Single-Family,2014,...,Mediterranean,489000,16,Midtown - Houston,30,30,1/7/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
2,44013547,3108,Crawford Street,,Houston,77004,Harris,Midtown,Single-Family,0,...,Contemporary/Modern,429000,16,Midtown - Houston,135,135,9/24/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
3,78420188,3110,Crawford Street,,Houston,77004,Harris,Midtown,Single-Family,0,...,Contemporary/Modern,439000,16,Midtown - Houston,135,135,9/24/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
4,32279965,1804,Dennis Street,,Houston,77004,Harris,Weekley at Chenvert,Single-Family,2014,...,"Contemporary/Modern, Traditional",404000,16,Midtown - Houston,36,36,1/1/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
5,12892280,1411,Elgin Street,,Houston,77004,Harris,Montione Vls/Elgin Pt Rep,Single-Family,2014,...,Traditional,429000,16,Midtown - Houston,5,45,2/1/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
6,47688089,1527,FRANCIS Street,,Houston,77004,Harris,FRANCIS STATION,Single-Family,2000,...,Traditional,320000,16,Midtown - Houston,20,20,11/16/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
7,14702643,1611,Holman Street,,Houston,77004,Harris,Holman Outlot 38,Single-Family,2013,...,"Mediterranean, Traditional",1075000,16,Midtown - Houston,59,423,12/9/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
8,23353569,1713,Holman Street,,Houston,77004,Harris,Midtown,Single-Family,2019,...,Contemporary/Modern,609900,16,Midtown - Houston,66,485,12/2/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
9,95756492,1315,Isabella Street,,Houston,77004,Harris,Midtown Elevated,Single-Family,2019,...,Traditional,629900,16,Midtown - Houston,12,962,1/25/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL


### Transformation

In [3]:
# replace elementary with el to match TEA table
raw_har1_df= raw_har_df.replace({"elementary" :"ELEMENTARY"}, "EL", regex = True)
raw_har1_df

Unnamed: 0,mls,street_number,street_name,unit,city,zip,county,subdivision,home_type,year_built,...,style,list_price,market_area,area,dom,cdom,list_date,school_district,elementary,high_school
0,90227486,2403,Austin Street,,Houston,77004,Harris,Austin Skyline,Single-Family,2015,...,Contemporary/Modern,445000,16,Midtown - Houston,150,150,9/9/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
1,76269817,3123,Austin Street,,Houston,77004,Harris,Montione Vls/Elgin Pt Rep,Single-Family,2014,...,Mediterranean,489000,16,Midtown - Houston,30,30,1/7/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
2,44013547,3108,Crawford Street,,Houston,77004,Harris,Midtown,Single-Family,0,...,Contemporary/Modern,429000,16,Midtown - Houston,135,135,9/24/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
3,78420188,3110,Crawford Street,,Houston,77004,Harris,Midtown,Single-Family,0,...,Contemporary/Modern,439000,16,Midtown - Houston,135,135,9/24/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
4,32279965,1804,Dennis Street,,Houston,77004,Harris,Weekley at Chenvert,Single-Family,2014,...,"Contemporary/Modern, Traditional",404000,16,Midtown - Houston,36,36,1/1/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
5,12892280,1411,Elgin Street,,Houston,77004,Harris,Montione Vls/Elgin Pt Rep,Single-Family,2014,...,Traditional,429000,16,Midtown - Houston,5,45,2/1/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
6,47688089,1527,FRANCIS Street,,Houston,77004,Harris,FRANCIS STATION,Single-Family,2000,...,Traditional,320000,16,Midtown - Houston,20,20,11/16/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
7,14702643,1611,Holman Street,,Houston,77004,Harris,Holman Outlot 38,Single-Family,2013,...,"Mediterranean, Traditional",1075000,16,Midtown - Houston,59,423,12/9/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
8,23353569,1713,Holman Street,,Houston,77004,Harris,Midtown,Single-Family,2019,...,Contemporary/Modern,609900,16,Midtown - Houston,66,485,12/2/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
9,95756492,1315,Isabella Street,,Houston,77004,Harris,Midtown Elevated,Single-Family,2019,...,Traditional,629900,16,Midtown - Houston,12,962,1/25/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL


In [4]:
#Replace NaN with ""
raw_har1_df.unit = raw_har1_df.unit.fillna('')
raw_har1_df

Unnamed: 0,mls,street_number,street_name,unit,city,zip,county,subdivision,home_type,year_built,...,style,list_price,market_area,area,dom,cdom,list_date,school_district,elementary,high_school
0,90227486,2403,Austin Street,,Houston,77004,Harris,Austin Skyline,Single-Family,2015,...,Contemporary/Modern,445000,16,Midtown - Houston,150,150,9/9/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
1,76269817,3123,Austin Street,,Houston,77004,Harris,Montione Vls/Elgin Pt Rep,Single-Family,2014,...,Mediterranean,489000,16,Midtown - Houston,30,30,1/7/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
2,44013547,3108,Crawford Street,,Houston,77004,Harris,Midtown,Single-Family,0,...,Contemporary/Modern,429000,16,Midtown - Houston,135,135,9/24/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
3,78420188,3110,Crawford Street,,Houston,77004,Harris,Midtown,Single-Family,0,...,Contemporary/Modern,439000,16,Midtown - Houston,135,135,9/24/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
4,32279965,1804,Dennis Street,,Houston,77004,Harris,Weekley at Chenvert,Single-Family,2014,...,"Contemporary/Modern, Traditional",404000,16,Midtown - Houston,36,36,1/1/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
5,12892280,1411,Elgin Street,,Houston,77004,Harris,Montione Vls/Elgin Pt Rep,Single-Family,2014,...,Traditional,429000,16,Midtown - Houston,5,45,2/1/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
6,47688089,1527,FRANCIS Street,,Houston,77004,Harris,FRANCIS STATION,Single-Family,2000,...,Traditional,320000,16,Midtown - Houston,20,20,11/16/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
7,14702643,1611,Holman Street,,Houston,77004,Harris,Holman Outlot 38,Single-Family,2013,...,"Mediterranean, Traditional",1075000,16,Midtown - Houston,59,423,12/9/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
8,23353569,1713,Holman Street,,Houston,77004,Harris,Midtown,Single-Family,2019,...,Contemporary/Modern,609900,16,Midtown - Houston,66,485,12/2/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL
9,95756492,1315,Isabella Street,,Houston,77004,Harris,Midtown Elevated,Single-Family,2019,...,Traditional,629900,16,Midtown - Houston,12,962,1/25/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR HIGH SCHOOL


In [5]:
# replace high school with h s to match TEA table
customer_data_df = raw_har1_df.replace({"high_school" :"HIGH SCHOOL"}, "H S", regex = True)
customer_data_df

Unnamed: 0,mls,street_number,street_name,unit,city,zip,county,subdivision,home_type,year_built,...,style,list_price,market_area,area,dom,cdom,list_date,school_district,elementary,high_school
0,90227486,2403,Austin Street,,Houston,77004,Harris,Austin Skyline,Single-Family,2015,...,Contemporary/Modern,445000,16,Midtown - Houston,150,150,9/9/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
1,76269817,3123,Austin Street,,Houston,77004,Harris,Montione Vls/Elgin Pt Rep,Single-Family,2014,...,Mediterranean,489000,16,Midtown - Houston,30,30,1/7/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
2,44013547,3108,Crawford Street,,Houston,77004,Harris,Midtown,Single-Family,0,...,Contemporary/Modern,429000,16,Midtown - Houston,135,135,9/24/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
3,78420188,3110,Crawford Street,,Houston,77004,Harris,Midtown,Single-Family,0,...,Contemporary/Modern,439000,16,Midtown - Houston,135,135,9/24/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
4,32279965,1804,Dennis Street,,Houston,77004,Harris,Weekley at Chenvert,Single-Family,2014,...,"Contemporary/Modern, Traditional",404000,16,Midtown - Houston,36,36,1/1/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
5,12892280,1411,Elgin Street,,Houston,77004,Harris,Montione Vls/Elgin Pt Rep,Single-Family,2014,...,Traditional,429000,16,Midtown - Houston,5,45,2/1/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
6,47688089,1527,FRANCIS Street,,Houston,77004,Harris,FRANCIS STATION,Single-Family,2000,...,Traditional,320000,16,Midtown - Houston,20,20,11/16/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
7,14702643,1611,Holman Street,,Houston,77004,Harris,Holman Outlot 38,Single-Family,2013,...,"Mediterranean, Traditional",1075000,16,Midtown - Houston,59,423,12/9/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
8,23353569,1713,Holman Street,,Houston,77004,Harris,Midtown,Single-Family,2019,...,Contemporary/Modern,609900,16,Midtown - Houston,66,485,12/2/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
9,95756492,1315,Isabella Street,,Houston,77004,Harris,Midtown Elevated,Single-Family,2019,...,Traditional,629900,16,Midtown - Houston,12,962,1/25/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S


In [6]:
# Total info
customer_data_df.shape

(29, 29)

In [7]:
# Columns list
customer_data_df.columns

Index(['mls', 'street_number', 'street_name', 'unit', 'city', 'zip', 'county',
       'subdivision', 'home_type', 'year_built', 'bedrooms', 'full_baths',
       'half_baths', 'total_baths', 'room_count', 'fireplaces', 'stories',
       'pool_private', 'garages', 'style', 'list_price', 'market_area', 'area',
       'dom', 'cdom', 'list_date', 'school_district', 'elementary',
       'high_school'],
      dtype='object')

In [8]:
# Navigate a single column
customer_data_df['city']

0     Houston
1     Houston
2     Houston
3     Houston
4     Houston
5     Houston
6     Houston
7     Houston
8     Houston
9     Houston
10    Houston
11    Houston
12    Houston
13    Houston
14    Houston
15    Houston
16    Houston
17    Houston
18    Houston
19    Houston
20    Houston
21    Houston
22    Houston
23    Houston
24    Houston
25    Houston
26    Houston
27    Houston
28    Houston
Name: city, dtype: object

In [9]:
# Replace null values with "0"
# customer_data_df['fireplaces']= customer_data_df['fireplaces'].fillna(0)

In [10]:
# Check replacement
# customer_data_df['fireplaces']

In [11]:
# Replace null values with "0"
# customer_data_df['garages']= customer_data_df['garages'].fillna(0)

In [12]:
# Checking unique values
# customer_data_df.nunique()

In [13]:
# Checking unique values
customer_data_df.nunique().count()

29

In [14]:
# isna/isnull
# customer_data_df.isna().sum()

In [15]:
# Data types
# customer_data_df.dtypes

In [16]:
# Create new df
HAR_df= customer_data_df
HAR_df

Unnamed: 0,mls,street_number,street_name,unit,city,zip,county,subdivision,home_type,year_built,...,style,list_price,market_area,area,dom,cdom,list_date,school_district,elementary,high_school
0,90227486,2403,Austin Street,,Houston,77004,Harris,Austin Skyline,Single-Family,2015,...,Contemporary/Modern,445000,16,Midtown - Houston,150,150,9/9/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
1,76269817,3123,Austin Street,,Houston,77004,Harris,Montione Vls/Elgin Pt Rep,Single-Family,2014,...,Mediterranean,489000,16,Midtown - Houston,30,30,1/7/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
2,44013547,3108,Crawford Street,,Houston,77004,Harris,Midtown,Single-Family,0,...,Contemporary/Modern,429000,16,Midtown - Houston,135,135,9/24/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
3,78420188,3110,Crawford Street,,Houston,77004,Harris,Midtown,Single-Family,0,...,Contemporary/Modern,439000,16,Midtown - Houston,135,135,9/24/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
4,32279965,1804,Dennis Street,,Houston,77004,Harris,Weekley at Chenvert,Single-Family,2014,...,"Contemporary/Modern, Traditional",404000,16,Midtown - Houston,36,36,1/1/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
5,12892280,1411,Elgin Street,,Houston,77004,Harris,Montione Vls/Elgin Pt Rep,Single-Family,2014,...,Traditional,429000,16,Midtown - Houston,5,45,2/1/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
6,47688089,1527,FRANCIS Street,,Houston,77004,Harris,FRANCIS STATION,Single-Family,2000,...,Traditional,320000,16,Midtown - Houston,20,20,11/16/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
7,14702643,1611,Holman Street,,Houston,77004,Harris,Holman Outlot 38,Single-Family,2013,...,"Mediterranean, Traditional",1075000,16,Midtown - Houston,59,423,12/9/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
8,23353569,1713,Holman Street,,Houston,77004,Harris,Midtown,Single-Family,2019,...,Contemporary/Modern,609900,16,Midtown - Houston,66,485,12/2/2020,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S
9,95756492,1315,Isabella Street,,Houston,77004,Harris,Midtown Elevated,Single-Family,2019,...,Traditional,629900,16,Midtown - Houston,12,962,1/25/2021,HOUSTON ISD,GREGORY-LINCOLN ED CTR,LAMAR H S


### Retrieving Latitude and Longitude 

In [17]:
# Creating addresses df
orig_addresses_df = HAR_df[['street_number','street_name','city','zip']]
orig_addresses_df.insert(3,'state','Texas')

orig_addresses_df

Unnamed: 0,street_number,street_name,city,state,zip
0,2403,Austin Street,Houston,Texas,77004
1,3123,Austin Street,Houston,Texas,77004
2,3108,Crawford Street,Houston,Texas,77004
3,3110,Crawford Street,Houston,Texas,77004
4,1804,Dennis Street,Houston,Texas,77004
5,1411,Elgin Street,Houston,Texas,77004
6,1527,FRANCIS Street,Houston,Texas,77004
7,1611,Holman Street,Houston,Texas,77004
8,1713,Holman Street,Houston,Texas,77004
9,1315,Isabella Street,Houston,Texas,77004


In [18]:
orig_addresses_df['full_address'] = orig_addresses_df['street_number'].map(str)+' '+orig_addresses_df['street_name'].map(str)+', '+orig_addresses_df['city'].map(str)+', '+orig_addresses_df['state'].map(str)+', '+orig_addresses_df['zip'].map(str)
orig_addresses_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orig_addresses_df['full_address'] = orig_addresses_df['street_number'].map(str)+' '+orig_addresses_df['street_name'].map(str)+', '+orig_addresses_df['city'].map(str)+', '+orig_addresses_df['state'].map(str)+', '+orig_addresses_df['zip'].map(str)


Unnamed: 0,street_number,street_name,city,state,zip,full_address
0,2403,Austin Street,Houston,Texas,77004,"2403 Austin Street, Houston, Texas, 77004"
1,3123,Austin Street,Houston,Texas,77004,"3123 Austin Street, Houston, Texas, 77004"
2,3108,Crawford Street,Houston,Texas,77004,"3108 Crawford Street, Houston, Texas, 77004"
3,3110,Crawford Street,Houston,Texas,77004,"3110 Crawford Street, Houston, Texas, 77004"
4,1804,Dennis Street,Houston,Texas,77004,"1804 Dennis Street, Houston, Texas, 77004"
5,1411,Elgin Street,Houston,Texas,77004,"1411 Elgin Street, Houston, Texas, 77004"
6,1527,FRANCIS Street,Houston,Texas,77004,"1527 FRANCIS Street, Houston, Texas, 77004"
7,1611,Holman Street,Houston,Texas,77004,"1611 Holman Street, Houston, Texas, 77004"
8,1713,Holman Street,Houston,Texas,77004,"1713 Holman Street, Houston, Texas, 77004"
9,1315,Isabella Street,Houston,Texas,77004,"1315 Isabella Street, Houston, Texas, 77004"


In [19]:
addresses_df = orig_addresses_df.drop(['street_number','street_name','city','state','zip'], axis =1)
addresses_df

Unnamed: 0,full_address
0,"2403 Austin Street, Houston, Texas, 77004"
1,"3123 Austin Street, Houston, Texas, 77004"
2,"3108 Crawford Street, Houston, Texas, 77004"
3,"3110 Crawford Street, Houston, Texas, 77004"
4,"1804 Dennis Street, Houston, Texas, 77004"
5,"1411 Elgin Street, Houston, Texas, 77004"
6,"1527 FRANCIS Street, Houston, Texas, 77004"
7,"1611 Holman Street, Houston, Texas, 77004"
8,"1713 Holman Street, Houston, Texas, 77004"
9,"1315 Isabella Street, Houston, Texas, 77004"


In [21]:
#https://towardsdatascience.com/how-to-generate-lat-and-long-coordinates-from-an-address-column-using-pandas-and-googlemaps-api-d66b2720248d
#conda install -c conda-forge gmaps
#conda install -c conda-forge googlemaps
# from googlemaps import Client as GoogleMaps
# from secrets import API_KEY

# gmaps = GoogleMaps('AIzaSyCPvDQ8q0r9qDifnmkcAxxINp8yOUJHSKc')

# #addresses = addresses_df['full_address']
# addresses_df


In [None]:
import time  #to add delay in case of large DFs, python function

addresses_df['long'] = ""
addresses_df['lat'] = ""

#for x in range(len(addresses_df)):
#    try:
#        time.sleep(.5) #to add delay in case of large DFs
#        geocode_result = gmaps.geocode(addresses_df['full_address'][x])
#        addresses_df['lat'][x] = geocode_result[0]['geometry']['location'] ['lat']
#        addresses_df['long'][x] = geocode_result[0]['geometry']['location']['lng']
#    except IndexError:
#        print("Address was wrong...")
#    except Exception as e:
#        print("Unexpected error occurred.", e )
        
#addresses_df.head()
#addresses.to_csv('address_coords.csv')

In [22]:
#https://geopy.readthedocs.io/en/latest/#specifying-parameters-once
    
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="team5")

#geocode = lambda query: geolocator.geocode("%s, Houston, TX" % query)

location = geolocator.geocode("1804 Dennis Street, Houston, Texas, 77004")

print(location.address)
print((location.latitude, location.longitude))



ModuleNotFoundError: No module named 'geopy'

In [None]:
from functools import partial
from geopy.geocoders import Nominatim

#2618 Texas St, Houston, Texas, 77003
geocode = partial(geolocator.geocode, language="en")
print(geocode("2709 Rosalie Avenue, Houston, Texas, 77004"))
print(geocode("2709 Rosalie Avenue, Houston"))

In [None]:
import pandas as pd
df = pd.DataFrame({'name': ['paris', 'berlin', 'london']})

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="team5")

from geopy.extra.rate_limiter import RateLimiter
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
df['location'] = df['name'].apply(geocode)

df['point'] = df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
df

In [None]:
import pandas as pd

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="team5")

from geopy.extra.rate_limiter import RateLimiter
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

addresses_df['location'] = addresses_df['full_address'].apply(geocode)

addresses_df['point'] = addresses_df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
addresses_df

In [None]:
import pandas as pd

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="team5")

from geopy.extra.rate_limiter import RateLimiter
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

#location = geolocator.geocode("1804 Dennis Street, Houston, Texas, 77004")

#print(location.address)
#print((location.latitude, location.longitude))

addresses_df['location'] = addresses_df['full_address'].apply(geocode)

addresses_df['point'] = addresses_df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
addresses_df

In [None]:
# Writing to clean csv
HAR_df.to_csv("../Resources/alt_clean_har.csv", index = False)

In [None]:
# creating elementary school potion of junction table
raw_mls_el_df = HAR_df[['mls','elementary']]
mls_el_df= raw_mls_el_df.rename(columns= {'elementary': 'school'})
mls_el_df

In [None]:
# creating high school potion of junction table
raw_mls_hs_df = HAR_df[['mls', 'high_school']]
mls_hs_df= raw_mls_hs_df.rename(columns= {'high_school': 'school'})
mls_hs_df

In [None]:
# concating HAR potion of junction table
frames = [mls_el_df,mls_hs_df]
all_mls_schools_df = pd.concat(frames)
all_mls_schools_df.to_csv('../Resources/all_mls_schools.csv', index = False)
all_mls_schools_df

## TEA Data

### Extraction

In [None]:
# Store csv into df
csv_file= "../Resources/alt_school_rating_list_tea.csv"
raw_school_rating_df= pd.read_csv(csv_file)
raw_school_rating_df.head()

### Transformation

In [None]:
# removing columns not needed
raw_school_rating1_df = raw_school_rating_df.drop(['ESC\r\nRegion', 'Region', 'SBOE\r\nDistrict','Campus 2020 Rating',
                                                  'Campus 2018 Rating', 'Campus 2017 Rating','Campus 2016 Rating',
                                                  'Campus 2015 Rating', 'Campus 2014 Rating','Campus 2013 Rating',
                                                  'Campus 2012 Rating','Campus 2011 Rating'], axis =1)
raw_school_rating1_df

In [None]:
# Renaming columns - int is for intermediate (in process) table
int_school_rating_df= raw_school_rating1_df.rename(columns= {'Campus\r\nNumber': 'campus_number',
                                                            'Campus' : 'school',
                                                            'District' : 'district',
                                                          '# of Consecutive Years Campus is Academically Unacceptable' : 'yrs_unacceptable',
                                                          'Campus 2019 Rating' : 'rating'})
int_school_rating_df

In [None]:
# Adding in new column and associated value
int_school_rating_df['year']= 2019
int_school_rating_df

In [None]:
# Dropping rows that contain notes, i.e. start with *
int_school_ratinga_df = int_school_rating_df.drop(int_school_rating_df.loc[int_school_rating_df.campus_number.str.startswith("*")].index)
int_school_ratinga_df

In [None]:
# Dropping rows that contain middle school to match HAR data
int_school_rating1_df = int_school_ratinga_df.drop(int_school_ratinga_df.loc[int_school_ratinga_df.school.str.contains("MIDDLE")].index)
int_school_rating1_df

In [None]:
# checking for NANs
int_school_rating1_df.isna().sum()

In [None]:
# filling NaNs
int_school_rating1_df = int_school_rating1_df.fillna("Not Rated")

In [None]:
# checking for NANs
int_school_rating1_df.isna().sum()

In [None]:
# looking for unique items in column
int_school_rating1_df.rating.unique()

In [None]:
# replacing to get like wording 
int_school_rating2_df = int_school_rating1_df.replace({"rating" :"Not Rated: Minimum Size rules"},"Not Rated", regex = True)
int_school_rating3_df = int_school_rating2_df.replace({"rating" :"Not rated: Data Integrity Issue"},"Not Rated", regex = True)
int_school_rating3_df.head()

In [None]:
# verifying like wording
int_school_rating3_df.rating.unique()

In [None]:
# building dataframe for customer target area
int_school_rating4_df = int_school_rating3_df.loc[(int_school_rating_df['district'] == "HOUSTON ISD")|
                                               (int_school_rating_df['district'] == "KATY ISD") |
                                               (int_school_rating_df['district'] == "SHELDON ISD") |
                                               (int_school_rating_df['district'] == "SPRING BRANCH ISD") |
                                               (int_school_rating_df['district'] == "GALENA PARK ISD") |
                                               (int_school_rating_df['district'] == "CYPRESS-FAIRBANKS ISD") 
                                               ]
int_school_rating4_df

In [None]:
# adding high schools that are on edge of target area (outside of ISD) but still gets kids because of zoning
new_df = int_school_rating3_df.loc[(int_school_rating3_df['campus_number'] =='101911002')]
new1_df = int_school_rating3_df.loc[(int_school_rating3_df['campus_number'] =='101902003')]
new2_df = int_school_rating3_df.loc[(int_school_rating3_df['campus_number'] =='101919001')]                               

In [None]:
# adding schools to df
int_school_rating5_df = int_school_rating4_df.append(new_df, ignore_index=True)
int_school_rating6_df = int_school_rating5_df.append(new1_df, ignore_index=True)
int_school_rating7_df = int_school_rating6_df.append(new2_df, ignore_index=True)
int_school_rating7_df

In [None]:
# removing elementary school that is not in the target area ISD - it was causing conflict with primary key
int_school_rating8_df = int_school_rating7_df[int_school_rating7_df.campus_number != '101907152']
int_school_rating8_df

In [None]:
# Creating duplicate df to prevent corruption of original df
school_rating_df= int_school_rating8_df
school_rating_df

In [None]:
# Writing to clean csv
school_rating_df.to_csv('../Resources/alt_clean_ratings.csv', index = False)

In [None]:
raw_cnum_school_df = school_rating_df[['campus_number','school']]
raw_cnum_school_df .to_csv("../Resources/raw_cnum_school.csv", index = False)
raw_cnum_school_df

In [None]:
raw_junction = pd.merge_ordered(all_mls_schools_df, raw_cnum_school_df, fill_method='ffill', left_by='school')
raw_junction.to_csv("../Resources/raw_junction.csv", index = False)
raw_junction

In [None]:
# triple mls check - if dup school names in target area - will cause 3 mls nubers to show
trip_school_rating = raw_junction.pivot_table(index= ['mls'], aggfunc= 'size')
trip_school_rating.sort_values(ascending=False)

In [None]:
sorted_raw_junction = raw_junction.sort_values(by=['mls'])
sorted_raw_junction

In [None]:
raw_har2_df = sorted_raw_junction.drop(['school'], axis =1)
raw_har2_df 

In [None]:
# # Writing to clean csv
raw_har2_df .to_csv("../Resources/raw_har2.csv", index = False)

## Junction Table

In [None]:
# Store csv into df
csv_file= "../Resources/raw_HAR2.csv"
junction_df= pd.read_csv(csv_file)
junction_df

# Load to SQL

In [None]:
# Connect to local database
rds_connection_string = f"{username}:{password}@localhost:5432/etl_team5"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
# Use pandas to load transformed df into table school_rating_df
school_rating_df.to_sql(name='school_rating', con=engine, if_exists='replace', index=False)

In [None]:
# Use pandas to load transformed df into table mls
HAR_df.to_sql(name='mls', con=engine, if_exists='replace', index=False)

In [None]:
# Use pandas to load transformed df into table junction_table
junction_df.to_sql(name='junction_table', con=engine, if_exists='replace', index=False)

## SQL Query of tables 

In [None]:
# Confirm data has been added by querying the school_rating table
pd.read_sql_query('select * from school_rating', con=engine).head()

In [None]:
# Confirm data has been added by querying the mls table
pd.read_sql_query('select * from mls', con=engine).head()

In [None]:
# Confirm data has been added by querying the junction_table table
pd.read_sql_query('select * from junction_table', con=engine).head()

In [None]:
# Check for tables
engine.table_names()