## Implement Rules
---
In this Jupyter notebook, we implement additive rules on the state and university levels, and negative rules on the university level.

### Dependencies

In [1]:
import csv
import glob
import os
import sqlite3
import time
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import requests
from geopy.distance import vincenty
from sqlalchemy import create_engine

pymysql.install_as_MySQLdb()

### Read the Additive Rules Files

In [2]:
def _read_rules(path):
    """Parse a rules CSV into a ``{key: [values]}`` dictionary.

    The first row is treated as a header and skipped. In every other row the
    first cell is the key and the remaining non-empty cells are its values.
    Completely blank rows are ignored, and an empty file yields ``{}``.
    """
    rules = {}
    with open(path, newline="") as handle:
        reader = csv.reader(handle, delimiter=',')
        next(reader, None)  # skip the header without failing on an empty file
        for row in reader:
            if not row:
                continue
            rules[row[0]] = [cell for cell in row[1:] if cell]
    return rules


# State-level additive rules: state -> states whose cities are checked for it.
csvpath = 'Raw/border-rules.csv'
border_rules = _read_rules(csvpath)

# University-level additive rules: university -> extra states / CityStates.
csvpath2 = 'Raw/uni-rules.csv'
uni_rules = _read_rules(csvpath2)

In [3]:
# Inspect the parsed state-level border rules.
border_rules

{'AK': ['AK'],
 'AL': ['AL', 'FL', 'GA', 'MS', 'TN'],
 'AR': ['AR', 'LA', 'MS', 'MO', 'OK', 'TN', 'TX'],
 'AZ': ['AZ', 'CA', 'CO', 'NM', 'NV', 'UT'],
 'CA': ['CA'],
 'CO': ['CO', 'AZ', 'KS', 'NE', 'NM', 'OK', 'UT', 'WY', 'MT'],
 'CT': ['CT', 'MA', 'NY', 'RI'],
 'DC': ['DC', 'MD', 'VA'],
 'DE': ['DE', 'MD'],
 'FL': ['FL', 'AL', 'GA'],
 'GA': ['GA', 'AL', 'FL', 'NC', 'SC', 'TN'],
 'HI': ['HI'],
 'IA': ['IA', 'IL', 'MN', 'MO', 'NE', 'SD', 'WI'],
 'ID': ['ID', 'MT', 'NV', 'OR', 'UT', 'WA', 'WY'],
 'IL': ['IL', 'IN', 'IA', 'MI', 'KY', 'MO', 'WI'],
 'IN': ['IN', 'IL', 'KY', 'MI', 'OH'],
 'KS': ['KS', 'CO', 'MO', 'NE', 'OK'],
 'KY': ['KY', 'IL', 'IN', 'MO', 'OH', 'TN', 'VA', 'WV'],
 'LA': ['LA', 'AR', 'MS', 'TX'],
 'MA': ['MA', 'CT', 'NH', 'RI', 'VT'],
 'MD': ['MD', 'DE', 'VA', 'WV'],
 'ME': ['ME', 'NH'],
 'MI': ['MI', 'IL', 'IN', 'OH', 'WI'],
 'MN': ['MN', 'IA', 'ND', 'SD', 'WI', 'MT'],
 'MO': ['MO', 'AR', 'IL', 'IA', 'KS', 'KY', 'NE', 'OK', 'TN'],
 'MS': ['MS', 'AL', 'AR', 'LA', 'TN'],
 'MT

In [4]:
# Inspect the parsed university-level additive rules.
uni_rules

{'CU': [],
 'CWR': ['NY'],
 'DU': [],
 'GTECH': [],
 'GW': [],
 'HARV': [],
 'KU': ['IA'],
 'MIAMI': [],
 'MINN': [],
 'NU': [],
 'OSU': [],
 'PENN': [],
 'RUT': [],
 'SMU': [],
 'UA': [],
 'UCB': [],
 'UCD': [],
 'UCF': [],
 'UCI': [],
 'UCLA': [],
 'UCSD': [],
 'UMASS': [],
 'UMICH': [],
 'UNC': [],
 'UNCH': [],
 'UNH': [],
 'UP': [],
 'UR': [],
 'UT': [],
 'UTAH': [],
 'UTSA': [],
 'UW': [],
 'UWM': [],
 'VAND': [],
 'WASHU': []}

### Read the List of Physical Locations

In [5]:
# Load the physical campus records (university code, CityState, state and coordinates).
# Keep Raw/uni-rules.csv in sync with the universities listed in this workbook.
physical_locations = pd.read_excel("Raw/Physical_Campuses_geo.xlsx")
# Non-null count per column, as a quick completeness check.
physical_locations.count()

University             44
CityState              44
Physical_Campus        44
Current_Prospective    44
State                  44
Uni_lat                44
Uni_lng                44
dtype: int64

In [6]:
# Display the loaded campus records.
physical_locations

Unnamed: 0,University,CityState,Physical_Campus,Current_Prospective,State,Uni_lat,Uni_lng
0,UMASS,"AMHERST, MA",True,Current,MA,42.391157,-72.526712
1,GTECH,"ATLANTA, GA",True,Current,GA,33.775618,-84.396285
2,UT,"AUSTIN, TX",True,Current,TX,30.284918,-97.734057
3,UCB,"BERKELEY, CA",True,Current,CA,37.871899,-122.25854
4,HARV,"BOSTON, MA",True,Prospective,MA,42.3505,-71.105399
5,UNC,"CHAPEL HILL, NC",True,Current,NC,35.904912,-79.046913
6,UNCH,"CHARLOTTE, NC",True,Current,NC,35.307093,-80.735164
7,NU,"CHICAGO, IL",True,Current,IL,42.056459,-87.675267
8,CWR,"CLEVELAND, OH",True,Current,OH,41.504341,-81.608384
9,OSU,"COLUMBUS, OH",True,Prospective,OH,40.01419,-83.030914


### Read the Full List of Cities

In [7]:
# Open the local SQLite database and load the whole City_Census table.
# The rule checks later in the notebook read its city, state, lat, lng and CityState columns.
conn = sqlite3.connect("Opportunity_Map.db")
full_cities = pd.read_sql("select * from City_Census", conn)

### Run Additive Rules

In [8]:
# Apply the state-level (border) additive rules.
# For every campus, look up the states allowed by border_rules for the campus's
# own state, and pair the campus with every city in those states, recording the
# campus-to-city distance in miles.
#
# Rows are collected in a list and turned into a DataFrame once at the end;
# this replaces the cell-by-cell DataFrame.set_value calls (deprecated, then
# removed from pandas) and avoids rescanning every city row for every rule.
# NOTE(review): `vincenty` is no longer available in geopy >= 2.0; switch to
# `geodesic` when the environment is upgraded.
border_columns = ['City', 'State', 'City Lat', 'City Lng',
                  'University', 'Uni Lat', 'Uni Lng', 'Distance',
                  'Uni CityState', 'CityState']

border_records = []
for _, campus in physical_locations.iterrows():
    print("Checking " + campus["University"])
    uni_point = (campus["Uni_lat"], campus["Uni_lng"])

    # An unknown state raises KeyError on purpose: the rules file is incomplete.
    for state_code in border_rules[campus["State"]]:
        if state_code == "":
            continue

        cities_in_state = full_cities[full_cities["state"] == state_code]
        for city_name, state_name, city_lat, city_lng, city_state in zip(
                cities_in_state["city"], cities_in_state["state"],
                cities_in_state["lat"], cities_in_state["lng"],
                cities_in_state["CityState"]):
            border_records.append({
                "City": city_name,
                "State": state_name,
                "City Lat": city_lat,
                "City Lng": city_lng,
                "University": campus["University"],
                "Uni Lat": campus["Uni_lat"],
                "Uni Lng": campus["Uni_lng"],
                "Distance": vincenty(uni_point, (city_lat, city_lng)).miles,
                "Uni CityState": campus["CityState"],
                "CityState": city_state,
            })

# Passing `columns` fixes the column order and keeps the frame well-formed
# even when no city matched.
border_rules_check = pd.DataFrame(border_records, columns=border_columns)
border_rules_check.head()

Checking UMASS
Checking GTECH
Checking UT
Checking UCB
Checking HARV
Checking UNC
Checking UNCH
Checking NU
Checking CWR
Checking OSU
Checking SMU
Checking UCD
Checking DU
Checking UMICH
Checking UNH
Checking NU
Checking UT
Checking UCI
Checking RUT
Checking KU
Checking UCSD
Checking UCLA
Checking MIAMI
Checking MINN
Checking VAND
Checking CU
Checking UCF
Checking PENN
Checking UA
Checking PENN
Checking UP
Checking UR
Checking UCD
Checking WASHU
Checking UTAH
Checking UTSA
Checking UCB
Checking GTECH
Checking UW
Checking RUT
Checking UCF
Checking UA
Checking GW
Checking UWM


Unnamed: 0,City,State,City Lat,City Lng,University,Uni Lat,Uni Lng,Distance,Uni CityState,CityState
0,ABINGTON,MA,42.12,-70.9572,UMASS,42.3912,-72.5267,82.622,"AMHERST, MA","ABINGTON, MA"
1,ACTON,MA,42.484,-71.4385,UMASS,42.3912,-72.5267,56.0042,"AMHERST, MA","ACTON, MA"
2,ACUSHNET,MA,41.7182,-70.9012,UMASS,42.3912,-72.5267,95.645,"AMHERST, MA","ACUSHNET, MA"
3,ADAMS,MA,42.6238,-73.1167,UMASS,42.3912,-72.5267,34.144,"AMHERST, MA","ADAMS, MA"
4,AGAWAM,MA,42.0624,-72.6258,UMASS,42.3912,-72.5267,23.2549,"AMHERST, MA","AGAWAM, MA"


In [9]:
# Preview the first rows of the border-rule matches (same preview as the end of the previous cell).
border_rules_check.head()

Unnamed: 0,City,State,City Lat,City Lng,University,Uni Lat,Uni Lng,Distance,Uni CityState,CityState
0,ABINGTON,MA,42.12,-70.9572,UMASS,42.3912,-72.5267,82.622,"AMHERST, MA","ABINGTON, MA"
1,ACTON,MA,42.484,-71.4385,UMASS,42.3912,-72.5267,56.0042,"AMHERST, MA","ACTON, MA"
2,ACUSHNET,MA,41.7182,-70.9012,UMASS,42.3912,-72.5267,95.645,"AMHERST, MA","ACUSHNET, MA"
3,ADAMS,MA,42.6238,-73.1167,UMASS,42.3912,-72.5267,34.144,"AMHERST, MA","ADAMS, MA"
4,AGAWAM,MA,42.0624,-72.6258,UMASS,42.3912,-72.5267,23.2549,"AMHERST, MA","AGAWAM, MA"


In [10]:
# Apply the university-level additive rules.
# Each entry in uni_rules[university] is matched against both the city's state
# and its full "CityState" string, so a rule may name a whole state or a single
# city. Every match is flagged with Priority=True.
#
# Rows are collected in a list and turned into a DataFrame once at the end;
# this replaces the cell-by-cell DataFrame.set_value calls (deprecated, then
# removed from pandas) and avoids rescanning every city row for every rule.
# NOTE(review): `vincenty` is no longer available in geopy >= 2.0; switch to
# `geodesic` when the environment is upgraded.
uni_columns = ['City', 'State', 'City Lat', 'City Lng',
               'University', 'Uni Lat', 'Uni Lng', 'Distance', 'Priority',
               'Uni CityState', 'CityState']

uni_records = []
for _, campus in physical_locations.iterrows():
    print("Checking " + campus["University"])
    uni_point = (campus["Uni_lat"], campus["Uni_lng"])

    # An unknown university raises KeyError on purpose: the rules file is incomplete.
    for target in uni_rules[campus["University"]]:
        if target == "":
            continue

        matches = full_cities[(full_cities["state"] == target)
                              | (full_cities["CityState"] == target)]
        for city_name, state_name, city_lat, city_lng, city_state in zip(
                matches["city"], matches["state"],
                matches["lat"], matches["lng"],
                matches["CityState"]):
            uni_records.append({
                "City": city_name,
                "State": state_name,
                "City Lat": city_lat,
                "City Lng": city_lng,
                "University": campus["University"],
                "Uni Lat": campus["Uni_lat"],
                "Uni Lng": campus["Uni_lng"],
                "Distance": vincenty(uni_point, (city_lat, city_lng)).miles,
                "Priority": True,
                "Uni CityState": campus["CityState"],
                "CityState": city_state,
            })

# Passing `columns` fixes the column order and keeps the frame well-formed
# even when no university has an additive rule.
uni_rules_check = pd.DataFrame(uni_records, columns=uni_columns)
uni_rules_check.head()

Checking UMASS
Checking GTECH
Checking UT
Checking UCB
Checking HARV
Checking UNC
Checking UNCH
Checking NU
Checking CWR
Checking OSU
Checking SMU
Checking UCD
Checking DU
Checking UMICH
Checking UNH
Checking NU
Checking UT
Checking UCI
Checking RUT
Checking KU
Checking UCSD
Checking UCLA
Checking MIAMI
Checking MINN
Checking VAND
Checking CU
Checking UCF
Checking PENN
Checking UA
Checking PENN
Checking UP
Checking UR
Checking UCD
Checking WASHU
Checking UTAH
Checking UTSA
Checking UCB
Checking GTECH
Checking UW
Checking RUT
Checking UCF
Checking UA
Checking GW
Checking UWM


Unnamed: 0,City,State,City Lat,City Lng,University,Uni Lat,Uni Lng,Distance,Priority,Uni CityState,CityState
0,ACCORD,NY,41.819,-74.2361,CWR,41.5043,-81.6084,382.037,True,"CLEVELAND, OH","ACCORD, NY"
1,ACRA,NY,42.3179,-74.086,CWR,41.5043,-81.6084,391.709,True,"CLEVELAND, OH","ACRA, NY"
2,ADAMS CENTER,NY,43.8726,-76.0147,CWR,41.5043,-81.6084,328.309,True,"CLEVELAND, OH","ADAMS CENTER, NY"
3,ADAMS,NY,43.8074,-76.0504,CWR,41.5043,-81.6084,324.643,True,"CLEVELAND, OH","ADAMS, NY"
4,ADDISON,NY,42.1041,-77.293,CWR,41.5043,-81.6084,226.627,True,"CLEVELAND, OH","ADDISON, NY"


In [11]:
# Stack the state-level and university-level matches into one frame.
# join='outer' keeps the union of the columns, so rows coming from
# border_rules_check get NaN in the university-only "Priority" column.
frames = [border_rules_check, uni_rules_check]
additive_rules_df = pd.concat(frames, join='outer')

In [12]:
# Both source frames were numbered from 0, so the concatenated labels repeat.
# Replace them with a fresh 0..n-1 index; the old labels are kept as an "index" column.
# This mutates the frame in place, so run this cell only once per kernel session.
additive_rules_df.reset_index(inplace=True)

In [13]:
# Save the combined additive matches (before any negative rules are applied).
additive_rules_df.to_csv("full_additive_rules.csv")

In [14]:
# Preview the combined additive matches.
additive_rules_df.head()

Unnamed: 0,index,City,City Lat,City Lng,CityState,Distance,Priority,State,Uni CityState,Uni Lat,Uni Lng,University
0,0,ABINGTON,42.12,-70.9572,"ABINGTON, MA",82.622,,MA,"AMHERST, MA",42.3912,-72.5267,UMASS
1,1,ACTON,42.484,-71.4385,"ACTON, MA",56.0042,,MA,"AMHERST, MA",42.3912,-72.5267,UMASS
2,2,ACUSHNET,41.7182,-70.9012,"ACUSHNET, MA",95.645,,MA,"AMHERST, MA",42.3912,-72.5267,UMASS
3,3,ADAMS,42.6238,-73.1167,"ADAMS, MA",34.144,,MA,"AMHERST, MA",42.3912,-72.5267,UMASS
4,4,AGAWAM,42.0624,-72.6258,"AGAWAM, MA",23.2549,,MA,"AMHERST, MA",42.3912,-72.5267,UMASS


### Read the Negative Rules Files

In [15]:
# Load the negative (exclusion) rules: university -> states / CityStates that
# must be removed from that university's additive matches.
negative_rules = {}
csvpath = 'Raw/negative-rules.csv'

with open(csvpath, newline="") as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')
    next(csvreader, None)  # skip the header without failing on an empty file

    for row in csvreader:
        if not row:
            # csv.reader yields [] for a blank line; there is no key to read.
            continue
        # First cell is the university code; keep only the non-empty rule cells.
        negative_rules[row[0]] = [x for x in row[1:] if x]

In [16]:
# Inspect the parsed negative rules.
negative_rules

{'CU': [],
 'CWR': [],
 'DU': [],
 'GTECH': ['TN', 'FL'],
 'GW': [],
 'HARV': [],
 'KU': [],
 'MIAMI': [],
 'MINN': [],
 'NU': [],
 'OSU': [],
 'PENN': [],
 'RUT': [],
 'SMU': [],
 'UA': [],
 'UCB': [],
 'UCD': [],
 'UCF': [],
 'UCI': [],
 'UCLA': [],
 'UCSD': [],
 'UMASS': [],
 'UMICH': [],
 'UNC': [],
 'UNCH': [],
 'UNH': [],
 'UP': ['NJ'],
 'UR': [],
 'UT': [],
 'UTAH': [],
 'UTSA': [],
 'UW': [],
 'VAND': [],
 'WASHU': ['IL']}

In [17]:
# Remove every city/university pair excluded by a negative rule.
# A rule entry may name a state or a full "CityState", so both columns are checked.
#
# The rows to remove are marked in a positional boolean mask and filtered out
# in one step. Dropping rows one at a time inside iterrows() raised KeyError
# when a row matched two entries of the same rule (the second drop targeted an
# already-removed label) and cost one full-frame drop per match.
excluded = np.zeros(len(additive_rules_df), dtype=bool)
for uni_code, blocked in negative_rules.items():
    if not blocked:
        continue
    for_uni = (additive_rules_df["University"] == uni_code).values
    is_blocked = (additive_rules_df["State"].isin(blocked)
                  | additive_rules_df["CityState"].isin(blocked)).values
    excluded |= for_uni & is_blocked

additive_rules_df = additive_rules_df[~excluded]

In [18]:
# Spot check: pull the remaining rows matched to GTECH.
test = additive_rules_df[additive_rules_df["University"] == "GTECH"]

In [19]:
# Expect no rows: FL is listed in GTECH's negative rules.
test[test["State"] == "FL"]

Unnamed: 0,index,City,City Lat,City Lng,CityState,Distance,Priority,State,Uni CityState,Uni Lat,Uni Lng,University


In [20]:
# Keep only the data columns, which drops the helper "index" column added by reset_index.
additive_rules_df = additive_rules_df[['City', 'City Lat', 'City Lng', 'CityState', 'Distance',
       'Priority', 'State', 'Uni CityState', 'Uni Lat', 'Uni Lng',
       'University']]

In [21]:
# Save the matches that remain after the negative rules were applied.
additive_rules_df.to_csv("with_negative_rules.csv")

### Distance Check

In [22]:
# Keep only city/university pairs no farther apart than the cutoff.
# "Distance" holds miles (it is computed from the distance object's `.miles`).
MAX_DISTANCE_MILES = 200
additive_rules_df = additive_rules_df[additive_rules_df["Distance"] <= MAX_DISTANCE_MILES]

In [23]:
# Preview the distance-filtered matches.
additive_rules_df.head()

Unnamed: 0,City,City Lat,City Lng,CityState,Distance,Priority,State,Uni CityState,Uni Lat,Uni Lng,University
0,ABINGTON,42.12,-70.9572,"ABINGTON, MA",82.622,,MA,"AMHERST, MA",42.3912,-72.5267,UMASS
1,ACTON,42.484,-71.4385,"ACTON, MA",56.0042,,MA,"AMHERST, MA",42.3912,-72.5267,UMASS
2,ACUSHNET,41.7182,-70.9012,"ACUSHNET, MA",95.645,,MA,"AMHERST, MA",42.3912,-72.5267,UMASS
3,ADAMS,42.6238,-73.1167,"ADAMS, MA",34.144,,MA,"AMHERST, MA",42.3912,-72.5267,UMASS
4,AGAWAM,42.0624,-72.6258,"AGAWAM, MA",23.2549,,MA,"AMHERST, MA",42.3912,-72.5267,UMASS


### Export

In [24]:
# The CityStates that remain after the distance filter are within range for a Google distance check.

In [25]:
# final_rules is another name for the filtered frame (an alias, not a copy).
final_rules = additive_rules_df

# Save the distance-filtered rules for inspection.
final_rules.to_csv("testing_distance.csv")

In [26]:
# Write the final rules to the Rules_Added table through `conn`, replacing the table if it already exists.
final_rules.to_sql("Rules_Added", conn, if_exists="replace", index=False)

  chunksize=chunksize, dtype=dtype)


In [27]:
# List every table in the SQLite database, in alphabetical order.
cur = conn.cursor()
res = cur.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
# Each result row carries a single field: the table name.
for (table_name,) in res:
    print(table_name)

City_Census
Rules_Added
Zip_Census


In [28]:
# Spot check: show the final rule row(s) for a single city.
final_rules[final_rules["CityState"] == "SHINGLEHOUSE, PA"]

Unnamed: 0,City,City Lat,City Lng,CityState,Distance,Priority,State,Uni CityState,Uni Lat,Uni Lng,University
84444,SHINGLEHOUSE,41.9472,-78.1445,"SHINGLEHOUSE, PA",141.848,,PA,"PITTSBURGH, PA",40.4406,-79.9959,PENN


### Export to Cloud SQL

In [29]:
# Push the final rules to the Cloud SQL (MySQL) database.
#
# Credentials are read from the environment so they are never stored in the
# notebook:
#   CLOUDSQL_USER      (required) database user
#   CLOUDSQL_PASSWORD  (required) database password
#   CLOUDSQL_HOST      (optional) server address, defaults to the known instance
# A missing required variable raises KeyError before any connection is attempted.
db_user = quote_plus(os.environ["CLOUDSQL_USER"])
db_password = quote_plus(os.environ["CLOUDSQL_PASSWORD"])  # escape '@', ':' and '/' for the URL
db_host = os.environ.get("CLOUDSQL_HOST", "35.227.28.228")

engine = create_engine(
    "mysql+mysqldb://{user}:{password}@{host}/mapping_data"
    "?unix_socket=/cloudsql/sql-projects:us-east1:opportunity-db".format(
        user=db_user, password=db_password, host=db_host)
)
# NOTE: this rebinds `conn`, which until now referred to the local SQLite
# connection; rerun the SQLite connection cell before re-running earlier cells.
conn = engine.connect()

final_rules.to_sql("Rules_Added", conn, if_exists="replace", index=False)

OperationalError: (pymysql.err.OperationalError) (2003, "Can't connect to MySQL server on '35.227.28.228' (timed out)")