# SQL & ML

## Import Dependencies

In [1]:
!pip install psycopg2-binary



In [2]:
import pandas as pd
import re

from sqlalchemy import create_engine
import psycopg2

from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func

from config2 import db_password


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import tensorflow as tf

## Load data to create 3 Dataframes: Inspections, Violations and Community Health

In [3]:
# Read the three cleaned CSV files (inspections, violations, community health)
# from ./Resources/Clean, in that order, into pandas.
inspections_dataset, violations_dataset, community_health_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Resources/Clean/clean_inspections.csv",
        "./Resources/Clean/clean_violations.csv",
        "./Resources/Clean/clean_community_health.csv",
    )
)

In [7]:
# Create the Inspections DataFrame.
# `drop` returns a new frame and never modifies `inspections_dataset`, so
# re-running this cell always starts from the unmodified raw data.
# 'Unnamed: 0' is presumably the index column written out with the CSV — TODO confirm.
inspections_df = inspections_dataset.drop(columns=['Unnamed: 0'])
inspections_df.head()

Unnamed: 0,ACTIVITY_DATE,FACILITY_ID,FACILITY_NAME,PROGRAM_NAME,PROGRAM_STATUS,FACILITY_ADDRESS,FACILITY_CITY,FACILITY_STATE,FACILITY_ZIP,SCORE,GRADE,SERIAL_NUMBER,SEATS,LAT,LNG
0,2018-09-10,FA0242046,SERVERY- NICKELODEON,SERVERY- NICKELODEON,ACTIVE,203 W OLIVE AVE # C,BURBANK,CA,91502,96,A,DARRFUZBW,31-60,-118.314661,34.175253
1,2018-07-19,FA0252769,TOMS JR BURGERS,TOMS JR BURGERS,ACTIVE,1030 W MARTIN LUTHER KING JR BLVD STE 108,LOS ANGELES,CA,90037,98,A,DA0XQVMTN,0-30,-118.292543,34.010859
2,2018-08-15,FA0011237,DJ BIBINGKAHAN,DJ BIBINGKAHAN BAKESHOP,ACTIVE,1515 E AMAR RD,WEST COVINA,CA,91792,98,A,DAMPOJNY8,0-30,-117.913926,34.030964
3,2018-09-07,FA0252595,MEJICO GRILL AND TEQUILLA LOUNGE,MEJICO GRILL AND TEQUILLA LOUNGE,ACTIVE,29002 AGOURA RD,AGOURA HILLS,CA,91301,90,A,DAUEU4NGF,151 +,-118.756808,34.143452
4,2018-09-18,FA0158101,MCDONALD'S #10681,MCDONALD'S #10681,ACTIVE,5725 FLORENCE AVE,BELL GARDENS,CA,90201,91,A,DARQIUA45,61-150,-118.163665,33.967791


In [8]:
# Create the Violations DataFrame.
# `drop` returns a new frame and never modifies `violations_dataset`, so
# re-running this cell always starts from the unmodified raw data.
# 'Unnamed: 0' is presumably the index column written out with the CSV — TODO confirm.
violations_df = violations_dataset.drop(columns=['Unnamed: 0'])
violations_df.head()

Unnamed: 0,SERIAL_NUMER,VIOLATION_STATUS,VIOLATION_CODE,VIOLATION_DESCRIPTION,POINTS
0,DA000211Z,OUT OF COMPLIANCE,F006,# 06. Adequate handwashing facilities supplied...,2.0
1,DA000211Z,OUT OF COMPLIANCE,F044,"# 44. Floors, walls and ceilings: properly bui...",1.0
2,DA000211Z,OUT OF COMPLIANCE,F014,# 14. Food contact surfaces: clean and sanitized,2.0
3,DA000211Z,OUT OF COMPLIANCE,F029,"# 29. Toxic substances properly identified, st...",1.0
4,DA000211Z,OUT OF COMPLIANCE,F035,# 35. Equipment/Utensils - approved; installed...,1.0


In [9]:
# Create the Community Health DataFrame.
# `drop` returns a new frame and never modifies `community_health_dataset`, so
# re-running this cell always starts from the unmodified raw data.
# 'Unnamed: 0' is presumably the index column written out with the CSV — TODO confirm.
community_health_df = community_health_dataset.drop(columns=['Unnamed: 0'])
community_health_df.head()

Unnamed: 0,GEONAME,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,Prop_Asi,Prop_Ami,...,No_hless,Rte_crim,Rte_alco,Propt_envi,Prop_depr,Propt_HPI,Prop_fru,Prop_bev,Prop_hyp,Prop_marj
0,ALHAMBRA,86705,0.1831,0.6504,0.1665,0.0133,0.3435,0.0913,0.5498,0.0014,...,64,168,12,6,0.0523,43.3,0.2,0.2,0.2,0.1
1,ALTADENA,42525,0.2072,0.6221,0.1707,0.2374,0.2905,0.4129,0.0553,0.0019,...,58,162,6,62,0.1099,75.5,0.1,0.3,0.3,0.2
2,ARCADIA,56992,0.1794,0.6389,0.1817,0.0115,0.125,0.2304,0.6315,0.0014,...,12,146,19,27,0.0493,73.4,0.1,0.2,0.2,0.0
3,AZUSA,49479,0.2507,0.6538,0.0956,0.0292,0.6838,0.1938,0.0895,0.0024,...,55,354,14,48,0.0895,34.2,0.1,0.4,0.2,0.1
4,BALDWIN PARK,74438,0.2623,0.633,0.1047,0.0099,0.7934,0.0432,0.1514,0.0012,...,111,391,10,9,0.0544,22.3,0.1,0.4,0.3,0.1


# SQL

## Connect the DataFrames (loaded from the .csv files) to the SQL database

In [10]:
# Presumably the project root on the original author's machine.
# NOTE(review): absolute, machine-specific path; no visible cell references
# `file_dir` — confirm whether it is still needed or should be made relative.
file_dir = '/Users/julieal-huneidi/Desktop/Rats-in-the-Restaurants/'

In [11]:
# Connection string to the local PostgreSQL server (user `postgres`, port 5432,
# database `rats-in-the-restaurant`); the password comes from config2.
# The scheme must be "postgresql://": SQLAlchemy 1.4+ no longer accepts the
# legacy "postgres://" alias and raises NoSuchModuleError for it.
# Assumes db_password contains no URL-reserved characters (e.g. "@", "/") — TODO confirm.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/rats-in-the-restaurant"

In [12]:
# Create the SQLAlchemy database engine from the connection string.
# This `engine` is passed as `con=` to the `to_sql` calls that write the tables.
engine = create_engine(db_string)

In [13]:
# Save inspections_df to the `clean_inspections` SQL table.
# if_exists='replace' keeps the cell re-runnable: the default ('fail') raises
# ValueError once the table exists from an earlier run.
# Note: 'replace' drops and recreates the table on every run.
inspections_df.to_sql(name='clean_inspections', con=engine, if_exists='replace')

In [14]:
# Save violations_df to the `violations` SQL table.
# if_exists='replace' keeps the cell re-runnable: the default ('fail') raises
# ValueError once the table exists from an earlier run.
# Note: 'replace' drops and recreates the table on every run.
violations_df.to_sql(name='violations', con=engine, if_exists='replace')

In [16]:
# Save community_health_df to the `community_health` SQL table.
# if_exists='replace' keeps the cell re-runnable: the default ('fail') raises
# ValueError once the table exists from an earlier run.
# Note: 'replace' drops and recreates the table on every run.
community_health_df.to_sql(name='community_health', con=engine, if_exists='replace')

# Machine Learning

### A Support Vector Machine (SVM) was chosen because it can be used for both classification and regression problems. With a linear kernel it is a linear model, and with other kernels it can also solve non-linear problems. SVMs are less prone to overfitting because they try to maximize the margin (the distance between the decision boundary and the nearest data points), rather than encompass all data within a boundary.


## Get data from SQL database

In [None]:
# # Get data from SQL database for Machine Learning 
# vio_df = pd.read_sql_table("violations", con=engine, index_col=0)

In [None]:
# vio_df = pd.DataFrame(vio_df)
# vio_df.head()

## Preprocessing

In [None]:
# # Check the data type of each column
# vio_df.dtypes

In [None]:
# vio_df = vio_df.drop(columns = ["index"])
# vio_df

In [None]:
# vio_df.rename(columns = {'ACTIVITY DATE':'ACTIVITY_DATE', 'FACILITY ID':'FACILITY_ID', 
#                              'FACILITY NAME':'FACILITY_NAME', 'PROGRAM NAME': 'PROGRAM_NAME',
#                              'PROGRAM STATUS':'PROGRAM_STATUS', 'FACILITY ADDRESS': 'FACILITY_ADDRESS',
#                              'FACILITY CITY' : 'FACILITY_CITY', 'FACILITY STATE' : 'FACILITY_STATE',
#                              "FACILITY ZIP": "FACILITY_ZIP", "GRADE": "GRADE", "SERIAL NUMBER" : "SERIAL_NUMBER", 
#                              "SEATS" : "SEATS", "LAT" : "LAT", "LNG" : "LNG"}, inplace = True)
# inspect_df.head()

In [None]:
# vio_df = vio_df.drop(columns = ["SERIAL NUMBER"])
# vio_df

In [None]:
# # Create a OneHotEncoder instance
# enc = OneHotEncoder(sparse=False)

In [None]:
# # Fit and transform the OneHotEncoder using the categorical variable list
# encode_df = pd.DataFrame(enc.fit_transform(vio_df[vio_cat]))

In [None]:
# encode_df.columns = enc.get_feature_names(vio_cat)
# encode_df.head()

In [None]:
# Merge one-hot encoded features and drop the originals
# vio_df = vio_df.merge(encode_df,left_index=True, right_index=True)
# vio_df = vio_df.drop(vio_cat,1)
# vio_df.head()

In [None]:
# # Separate the POINTS target from the features data
# y = vio_df.POINTS.values
# X = vio_df.drop(columns=["POINTS"]).values

# # Split training/test datasets
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# # Create a StandardScaler instance
# scaler = StandardScaler()

# # Fit the StandardScaler
# X_scaler = scaler.fit(X_train)

# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

In [None]:
# # Create the SVM model
# svm = SVC(kernel='linear')

In [None]:
# # Train the model
# svm.fit(X_train, y_train)

## Random Forest Model

In [None]:
# # Create a random forest classifier.
# rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# # Fitting the model
# rf_model = rf_model.fit(X_train_scaled, y_train)

# # Evaluate the model
# y_pred = rf_model.predict(X_test_scaled)
# print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")