## Compile All Data Into A Single CSV

In this Jupyter Notebook, we compile the results from all of our datasets in Parts 1–5 into a single CSV. (TODO: clean up)

In [1]:
# Dependencies
import os
import sqlite3
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import requests
from census import Census
from sqlalchemy import create_engine
from us import states

pymysql.install_as_MySQLdb()

# Census API Key
# The key is read from the CENSUS_API_KEY environment variable so the secret is
# never stored in (or shared along with) the notebook. A missing variable raises
# KeyError immediately instead of failing later inside an API call.
# NOTE(review): `c` is not referenced by any other cell visible in this notebook.
c = Census(os.environ["CENSUS_API_KEY"], year=2015)

### Load All Databases

In [2]:
# Open the local SQLite database and print the name of every table it contains,
# in alphabetical order.
conn_lite = sqlite3.connect("Opportunity_Map.db")
cur = conn_lite.cursor()
res = cur.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
# Each result row is a 1-tuple holding the table name.
for (table_name,) in res:
    print(table_name)

City_Census
FullDistances
Impressions
Minimum_Distances
Rules_Added
Zip_Census


In [3]:
# Load the three source tables used by the merge below into DataFrames,
# one query per table, all through the SQLite connection opened above.
table_queries = (
    "select * from City_Census",
    "select * from Minimum_Distances",
    "select * from Impressions",
)
city_census, min_distances, impressions = (
    pd.read_sql(query, conn_lite) for query in table_queries
)

In [4]:
# Preview the first rows of the Minimum_Distances table
min_distances.head()

Unnamed: 0,CityState,University,Distance,Closest,Asian Population,Black Population,City,City Lat,City Lng,Distance_Text,...,Population,Poverty,State,Time,Time_Text,Uni Lat,Uni Lng,Uni CityState,White Population,Physical_Campus
0,"AARONSBURG, PA",PENN,140.339859,1,0.0,0.0,AARONSBURG,40.895701,-77.392432,165 mi,...,1058.0,202.0,PA,10568.0,2 hours 56 mins,40.4406,-79.9959,"PITTSBURGH, PA",1058.0,False
1,"ABBEVILLE, AL",GTECH,157.537626,1,0.0,2274.0,ABBEVILLE,31.595148,-85.208852,184 mi,...,6470.0,1363.0,AL,10990.0,3 hours 3 mins,33.775618,-84.396285,"ATLANTA, GA",4064.0,False
2,"ABBEVILLE, GA",GTECH,138.382298,1,41.0,1903.0,ABBEVILLE,31.976256,-83.339665,158 mi,...,4719.0,689.0,GA,9364.0,2 hours 36 mins,33.775618,-84.396285,"ATLANTA, GA",2676.0,False
3,"ABBEVILLE, LA",UT,189.538012,1,1059.0,6457.0,ABBEVILLE,29.894612,-92.193173,231 mi,...,26078.0,5164.0,LA,13180.0,3 hours 40 mins,29.719949,-95.342233,"HOUSTON, TX",17932.0,False
4,"ABBEVILLE, MS",VAND,187.693841,1,19.0,1502.0,ABBEVILLE,34.492325,-89.443056,254 mi,...,2974.0,428.0,MS,14485.0,4 hours 1 min,36.144703,-86.802655,"NASHVILLE, TN",1432.0,False


In [5]:
# Preview the first rows of the City_Census table
city_census.head()

Unnamed: 0,CityState,city,state,Population,White Population,Black Population,Native American Population,Asian Population,Hispanic Population,Education None,...,Employment Female Computer Engineering,Median Age,Median Male Age,Median Female Age,Household Income,Income Per Capita,Median Gross Rent,Median Home Value,lat,lng
0,"AARONSBURG, PA",AARONSBURG,PA,1058.0,1058.0,0.0,0.0,0.0,0.0,23.0,...,4.0,41.5,43.8,38.9,53000.0,21407.0,642.0,170100.0,40.895701,-77.392432
1,"ABBEVILLE, AL",ABBEVILLE,AL,6470.0,4064.0,2274.0,0.0,0.0,96.0,66.0,...,7.0,50.1,45.3,52.4,33944.0,20104.0,516.0,78100.0,31.595148,-85.208852
2,"ABBEVILLE, GA",ABBEVILLE,GA,4719.0,2676.0,1903.0,0.0,41.0,89.0,55.0,...,0.0,39.3,38.2,45.8,29200.0,10071.0,435.0,58400.0,31.976256,-83.339665
3,"ABBEVILLE, LA",ABBEVILLE,LA,26078.0,17932.0,6457.0,160.0,1059.0,710.0,308.0,...,71.0,35.9,34.9,37.4,42909.0,21520.0,613.0,95800.0,29.894612,-92.193173
4,"ABBEVILLE, MS",ABBEVILLE,MS,2974.0,1432.0,1502.0,0.0,19.0,16.0,20.0,...,2.0,37.6,29.9,46.0,61563.0,26266.0,490.0,70200.0,34.492325,-89.443056


### Merge All Data

In [6]:
# Join the census rows to the distance rows on the shared "CityState" key.
# The inner join keeps only cities present in both tables; columns that exist
# in both inputs come out suffixed `_x` (city_census) and `_y` (min_distances).
merged_data = city_census.merge(min_distances, on=["CityState"], how="inner")

In [7]:
# List the merged column names (duplicated columns carry `_x` / `_y` suffixes)
merged_data.columns

Index(['CityState', 'city', 'state', 'Population_x', 'White Population_x',
       'Black Population_x', 'Native American Population_x',
       'Asian Population_x', 'Hispanic Population_x', 'Education None_x',
       'Education High School_x', 'Education GED_x', 'Education Associates_x',
       'Education Bachelors_x', 'Education Masters_x',
       'Education Professional_x', 'Education Doctorate_x', 'Poverty_x',
       'Employment Labor Force_x', 'Employment Unemployed_x',
       'Employment Male Computer Engineering_x',
       'Employment Female Computer Engineering_x', 'Median Age_x',
       'Median Male Age_x', 'Median Female Age_x', 'Household Income_x',
       'Income Per Capita_x', 'Median Gross Rent_x', 'Median Home Value_x',
       'lat', 'lng', 'University', 'Distance', 'Closest', 'Asian Population_y',
       'Black Population_y', 'City', 'City Lat', 'City Lng', 'Distance_Text',
       'Education Associates_y', 'Education Bachelors_y',
       'Education Doctorate_y', 'Educati

In [8]:
# Attach the impressions columns by "CityState". The left join keeps every
# city already in `merged_data`, whether or not it has an impressions row.
# NOTE(review): this cell overwrites its own input, so running it twice merges
# `impressions` a second time — re-run the previous merge cell first.
merged_data = merged_data.merge(impressions, on="CityState", how="left")

In [9]:
# Re-check the column names after merging in `impressions`
merged_data.columns

Index(['CityState', 'city', 'state', 'Population_x', 'White Population_x',
       'Black Population_x', 'Native American Population_x',
       'Asian Population_x', 'Hispanic Population_x', 'Education None_x',
       'Education High School_x', 'Education GED_x', 'Education Associates_x',
       'Education Bachelors_x', 'Education Masters_x',
       'Education Professional_x', 'Education Doctorate_x', 'Poverty_x',
       'Employment Labor Force_x', 'Employment Unemployed_x',
       'Employment Male Computer Engineering_x',
       'Employment Female Computer Engineering_x', 'Median Age_x',
       'Median Male Age_x', 'Median Female Age_x', 'Household Income_x',
       'Income Per Capita_x', 'Median Gross Rent_x', 'Median Home Value_x',
       'lat', 'lng', 'University', 'Distance', 'Closest', 'Asian Population_y',
       'Black Population_y', 'City', 'City Lat', 'City Lng', 'Distance_Text',
       'Education Associates_y', 'Education Bachelors_y',
       'Education Doctorate_y', 'Educati

In [10]:
# Preview the merged rows; cities with no impressions match show missing
# values in the Impressions / CPC / CTR / Cost columns (left join above)
merged_data.head()

Unnamed: 0,CityState,city,state,Population_x,White Population_x,Black Population_x,Native American Population_x,Asian Population_x,Hispanic Population_x,Education None_x,...,Time_Text,Uni Lat,Uni Lng,Uni CityState,White Population_y,Physical_Campus,Impressions,CPC,CTR,Cost
0,"AARONSBURG, PA",AARONSBURG,PA,1058.0,1058.0,0.0,0.0,0.0,0.0,23.0,...,2 hours 56 mins,40.4406,-79.9959,"PITTSBURGH, PA",1058.0,False,,,,
1,"ABBEVILLE, AL",ABBEVILLE,AL,6470.0,4064.0,2274.0,0.0,0.0,96.0,66.0,...,3 hours 3 mins,33.775618,-84.396285,"ATLANTA, GA",4064.0,False,0.23,2.45,0.04,0.02
2,"ABBEVILLE, GA",ABBEVILLE,GA,4719.0,2676.0,1903.0,0.0,41.0,89.0,55.0,...,2 hours 36 mins,33.775618,-84.396285,"ATLANTA, GA",2676.0,False,,,,
3,"ABBEVILLE, LA",ABBEVILLE,LA,26078.0,17932.0,6457.0,160.0,1059.0,710.0,308.0,...,3 hours 40 mins,29.719949,-95.342233,"HOUSTON, TX",17932.0,False,,,,
4,"ABBEVILLE, MS",ABBEVILLE,MS,2974.0,1432.0,1502.0,0.0,19.0,16.0,20.0,...,4 hours 1 min,36.144703,-86.802655,"NASHVILLE, TN",1432.0,False,,,,


In [11]:
# Number of non-null values in each column of the merged frame
merged_data.count()

CityState                                   29119
city                                        29119
state                                       29119
Population_x                                29119
White Population_x                          29119
Black Population_x                          29119
Native American Population_x                29119
Asian Population_x                          29119
Hispanic Population_x                       29119
Education None_x                            29119
Education High School_x                     29119
Education GED_x                             29119
Education Associates_x                      29119
Education Bachelors_x                       29119
Education Masters_x                         29119
Education Professional_x                    29119
Education Doctorate_x                       29119
Poverty_x                                   29119
Employment Labor Force_x                    29119
Employment Unemployed_x                     29119


In [12]:
# Column names available for the selection in the next cell
merged_data.columns

Index(['CityState', 'city', 'state', 'Population_x', 'White Population_x',
       'Black Population_x', 'Native American Population_x',
       'Asian Population_x', 'Hispanic Population_x', 'Education None_x',
       'Education High School_x', 'Education GED_x', 'Education Associates_x',
       'Education Bachelors_x', 'Education Masters_x',
       'Education Professional_x', 'Education Doctorate_x', 'Poverty_x',
       'Employment Labor Force_x', 'Employment Unemployed_x',
       'Employment Male Computer Engineering_x',
       'Employment Female Computer Engineering_x', 'Median Age_x',
       'Median Male Age_x', 'Median Female Age_x', 'Household Income_x',
       'Income Per Capita_x', 'Median Gross Rent_x', 'Median Home Value_x',
       'lat', 'lng', 'University', 'Distance', 'Closest', 'Asian Population_y',
       'Black Population_y', 'City', 'City Lat', 'City Lng', 'Distance_Text',
       'Education Associates_y', 'Education Bachelors_y',
       'Education Doctorate_y', 'Educati

In [13]:
# Keep one copy of every duplicated census column (the `_x` copy, i.e. the one
# that came from city_census) together with the location, distance,
# university and impressions columns.
census_fields = [
    "Population", "White Population", "Black Population",
    "Native American Population", "Asian Population", "Hispanic Population",
    "Education None", "Education High School", "Education GED",
    "Education Associates", "Education Bachelors", "Education Masters",
    "Education Professional", "Education Doctorate", "Poverty",
    "Employment Labor Force", "Employment Unemployed",
    "Employment Male Computer Engineering",
    "Employment Female Computer Engineering",
    "Median Age", "Median Male Age", "Median Female Age",
    "Household Income", "Income Per Capita",
    "Median Gross Rent", "Median Home Value",
]
mid_columns = (
    ["CityState", "city", "state"]
    + [f"{field}_x" for field in census_fields]
    + [
        "lat", "lng", "University", "Uni CityState", "Distance", "Closest",
        "City", "City Lat", "City Lng", "Distance_Text",
        "State", "Time", "Time_Text", "Uni Lat", "Uni Lng",
        "Impressions", "CPC", "CTR", "Cost",
    ]
)
mid_data = merged_data.loc[:, mid_columns]

In [14]:
# Compare the two latitude columns side by side: `lat` comes from city_census
# and `City Lat` from min_distances. Only the first rows are shown so the
# notebook does not render the whole frame.
mid_data[["lat", "City Lat"]].head(10)

Unnamed: 0,lat,City Lat
0,40.895701,40.895701
1,31.595148,31.595148
2,31.976256,31.976256
3,29.894612,29.894612
4,34.492325,34.492325
5,34.182407,34.182407
6,45.229183,45.229183
7,44.967530,44.967530
8,31.881728,31.881728
9,39.895240,39.895240


In [15]:
# Everything in `mid_data` is kept, in the same order, except the four
# city_census columns that duplicate City / State / City Lat / City Lng.
final_data = mid_data.drop(columns=["city", "state", "lat", "lng"])

In [16]:
# Strip the merge suffix: every census column kept as "<name>_x" is renamed
# back to plain "<name>".
census_fields = [
    "Population", "White Population", "Black Population",
    "Native American Population", "Asian Population", "Hispanic Population",
    "Education None", "Education High School", "Education GED",
    "Education Associates", "Education Bachelors", "Education Masters",
    "Education Professional", "Education Doctorate", "Poverty",
    "Employment Labor Force", "Employment Unemployed",
    "Employment Male Computer Engineering",
    "Employment Female Computer Engineering",
    "Median Age", "Median Male Age", "Median Female Age",
    "Household Income", "Income Per Capita",
    "Median Gross Rent", "Median Home Value",
]
suffix_free_names = {f"{field}_x": field for field in census_fields}
final_data = final_data.rename(columns=suffix_free_names)

In [17]:
# Confirm the column names after the rename (no `_x` suffixes should remain)
final_data.columns

Index(['CityState', 'Population', 'White Population', 'Black Population',
       'Native American Population', 'Asian Population', 'Hispanic Population',
       'Education None', 'Education High School', 'Education GED',
       'Education Associates', 'Education Bachelors', 'Education Masters',
       'Education Professional', 'Education Doctorate', 'Poverty',
       'Employment Labor Force', 'Employment Unemployed',
       'Employment Male Computer Engineering',
       'Employment Female Computer Engineering', 'Median Age',
       'Median Male Age', 'Median Female Age', 'Household Income',
       'Income Per Capita', 'Median Gross Rent', 'Median Home Value',
       'University', 'Uni CityState', 'Distance', 'Closest', 'City',
       'City Lat', 'City Lng', 'Distance_Text', 'State', 'Time', 'Time_Text',
       'Uni Lat', 'Uni Lng', 'Impressions', 'CPC', 'CTR', 'Cost'],
      dtype='object')

In [18]:
# Reorder the columns: city/university/distance fields first, then the census
# fields, then the impressions fields. No column is added or removed.
distance_columns = [
    "CityState", "University", "Uni CityState", "Distance", "Closest",
    "City", "City Lat", "City Lng", "Distance_Text", "State",
    "Time", "Time_Text", "Uni Lat", "Uni Lng",
]
census_columns = [
    "Population", "White Population", "Black Population",
    "Native American Population", "Asian Population", "Hispanic Population",
    "Education None", "Education High School", "Education GED",
    "Education Associates", "Education Bachelors", "Education Masters",
    "Education Professional", "Education Doctorate", "Poverty",
    "Employment Labor Force", "Employment Unemployed",
    "Employment Male Computer Engineering",
    "Employment Female Computer Engineering",
    "Median Age", "Median Male Age", "Median Female Age",
    "Household Income", "Income Per Capita",
    "Median Gross Rent", "Median Home Value",
]
impression_columns = ["Impressions", "CPC", "CTR", "Cost"]
final_data = final_data[distance_columns + census_columns + impression_columns]

In [19]:
# Preview the compiled dataset; only the first rows are shown so the notebook
# does not render the whole frame.
final_data.head(10)

Unnamed: 0,CityState,University,Uni CityState,Distance,Closest,City,City Lat,City Lng,Distance_Text,State,...,Median Male Age,Median Female Age,Household Income,Income Per Capita,Median Gross Rent,Median Home Value,Impressions,CPC,CTR,Cost
0,"AARONSBURG, PA",PENN,"PITTSBURGH, PA",140.339859,1,AARONSBURG,40.895701,-77.392432,165 mi,PA,...,43.800000,38.900000,53000.000000,21407.000000,642.000000,170100.0,,,,
1,"ABBEVILLE, AL",GTECH,"ATLANTA, GA",157.537626,1,ABBEVILLE,31.595148,-85.208852,184 mi,AL,...,45.300000,52.400000,33944.000000,20104.000000,516.000000,78100.0,0.23,2.45,0.040,0.02
2,"ABBEVILLE, GA",GTECH,"ATLANTA, GA",138.382298,1,ABBEVILLE,31.976256,-83.339665,158 mi,GA,...,38.200000,45.800000,29200.000000,10071.000000,435.000000,58400.0,,,,
3,"ABBEVILLE, LA",UT,"HOUSTON, TX",189.538012,1,ABBEVILLE,29.894612,-92.193173,231 mi,LA,...,34.900000,37.400000,42909.000000,21520.000000,613.000000,95800.0,,,,
4,"ABBEVILLE, MS",VAND,"NASHVILLE, TN",187.693841,1,ABBEVILLE,34.492325,-89.443056,254 mi,MS,...,29.900000,46.000000,61563.000000,26266.000000,490.000000,70200.0,,,,
5,"ABBEVILLE, SC",GTECH,"ATLANTA, GA",116.574634,1,ABBEVILLE,34.182407,-82.425607,148 mi,SC,...,40.100000,44.900000,32423.000000,17810.000000,488.000000,94900.0,0.23,13.23,0.042,0.13
6,"ABBOT, ME",UNH,"DURHAM, NH",158.938804,1,ABBOT,45.229183,-69.596521,207 mi,ME,...,49.900000,49.900000,42292.000000,24085.000000,828.000000,130800.0,,,,
7,"ABBOTSFORD, WI",MINN,"MINNEAPOLIS, MN",144.634102,1,ABBOTSFORD,44.967530,-90.282268,152 mi,WI,...,37.500000,43.900000,41926.000000,26266.000000,585.000000,112500.0,,,,
8,"ABBOTT, TX",SMU,"DALLAS, TX",68.416138,1,ABBOTT,31.881728,-97.085410,80.3 mi,TX,...,40.200000,42.600000,56667.000000,24890.000000,800.000000,115300.0,,,,
9,"ABBOTTSTOWN, PA",PENN,"PHILADELPHIA, PA",94.939740,1,ABBOTTSTOWN,39.895240,-76.978971,117 mi,PA,...,42.600000,43.700000,62197.000000,25581.000000,967.000000,161900.0,,,,


### Export Data to Local DB

In [20]:
# Write the compiled dataset to CSV. The output folder is created first so the
# export does not fail on a fresh checkout where "Outputs" does not exist yet.
# Note: the DataFrame index is written as the first (unnamed) CSV column,
# unlike the SQL exports in this notebook, which pass index=False.
os.makedirs("Outputs", exist_ok=True)
final_data.to_csv("Outputs/06-Final_Data.csv")

In [21]:
# Store the compiled frame as the "Full_Data" table in the local SQLite
# database, replacing any table of that name; the index is not written.
final_data.to_sql(name="Full_Data", con=conn_lite, if_exists="replace", index=False)

  chunksize=chunksize, dtype=dtype)


In [22]:
# List the SQLite tables again to confirm that Full_Data now exists
cur = conn_lite.cursor()
res = cur.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
# Each result row is a 1-tuple holding the table name.
for (table_name,) in res:
    print(table_name)

City_Census
FullDistances
Full_Data
Impressions
Minimum_Distances
Rules_Added
Zip_Census


### Export to Cloud SQL

In [24]:
# Create Engine and Pass in MySQL Connection
engine = create_engine('mysql+mysqldb://trilogy:test@35.227.28.228/mapping_data?unix_socket=/cloudsql/sql-projects:us-east1:opportunity-db')
conn = engine.connect()

In [26]:
# Store the compiled frame as the "Full_Data" table through the MySQL
# connection `conn`, replacing any table of that name; the index is not written.
final_data.to_sql(name="Full_Data", con=conn, if_exists="replace", index=False)

In [None]:
# Read the Full_Data table back through `conn` to verify the upload
data = pd.read_sql(sql="SELECT * FROM Full_Data", con=conn)

In [None]:
# Preview the first rows read back through `conn`
data.head()

In [7]:
# Optional: uncomment to keep only the rows whose University is "GTECH"
#gtech_df = data[data["University"] == "GTECH"]

In [8]:
# Optional: uncomment to save the `gtech_df` subset to gtech_area.csv
#gtech_df.to_csv("gtech_area.csv")