## Import Dependencies

In [1]:
# For data
import pandas as pd

# For plotting
import matplotlib.pyplot as plt

# For connection to database
from sqlalchemy import create_engine
import psycopg2
from config2 import db_password

# For machine learning model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

# Machine Learning

### Using SVM or Support Vector Machine it is a linear model for classification and regression problems. It can solve linear and non-linear problems. SVMs are less prone to overfitting because they are trying to maximize the distance, rather than encompass all data within a boundary.

## Get data from Postgresql database

In [2]:
# Connection string to local server.
db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/rats-in-the-restaurant"

In [3]:
# Create database engine.
engine = create_engine(db_string)

In [4]:
# Get data from Postgresql table for Machine Learning 
inspect_vio_df = pd.read_sql_table("inspect_vio", con=engine, index_col=0)

In [5]:
# Check data imported correctly from Postgresql into DataFrame
inspect_vio_df

In [6]:
# Get cleaned community_health.csv
clean_community_health_dataset = pd.read_csv("./Resources/Clean/clean_community_health.csv")

In [7]:
# Create community_health DataFrame and delete Unnamed column
cch_df = pd.DataFrame(clean_community_health_dataset)
cch_df

In [8]:
# Check the number of unique values in each column
cch_df.nunique()

In [9]:
# Merge inspect_vio and cch DataFrames
rats_df = pd.merge(inspect_vio_df, cch_df, left_on="FACILITY_CITY", right_on="GEONAME",
                  how="left", indicator=True)
rats_df

In [10]:
# Find the null values, if any.
for column in rats_df.columns:
    print(f"Column{column} has {rats_df[column].isnull().sum()} null values")

In [11]:
# Check the datatypes in each column
rats_df.dtypes

In [12]:
# Drop _merge column
rats_df = rats_df.drop(["_merge"],1)
rats_df

In [13]:
# Drop rows with null values
rats_df = rats_df.dropna()
rats_df

In [14]:
# Confirm rows with null values were deleted
for column in rats_df.columns:
    print(f"Column{column} has {rats_df[column].isnull().sum()} null values")

In [15]:
# Export rats DataFrame as csv
rats_df.to_csv("./Resources/Clean/rats.csv")

## Preprocessing for Machine Learning

#### After exporting DataFrame to csv, additional cleaning of the data was necessary in notebook cleaning and transformation and new csv file was created for the machine learning.  This new csv was imported to continue with machine learning.

In [16]:
# Get updated clean_new_rats.csv - the one with the "**" removed
clean_new_rats_dataset = pd.read_csv("./Resources/merged/clean_new_rats.csv")

In [17]:
# Create dataframe with clean_new_rats dataset csv
new_rats_df = pd.DataFrame(clean_new_rats_dataset)
new_rats_df

Unnamed: 0.1,Unnamed: 0,ACTIVITY_DATE,FACILITY_ID,FACILITY_NAME,PROGRAM_NAME,PROGRAM_STATUS,FACILITY_ADDRESS,FACILITY_CITY,FACILITY_STATE,FACILITY_ZIP,...,No_hless,Rte_crim,Rte_alco,Propt_envi,Prop_depr,Propt_HPI,Prop_fru,Prop_bev,Prop_hyp,Prop_marj
0,0,2020-02-04,FA0240932,THE GREAT ROOM CAFE,THE GREAT ROOM CAFE,ACTIVE,2810 ARTESIA BLVD,REDONDO BEACH,CA,90278,...,216.0,231.0,26.0,84,0.1129,93.0,0.2,0.2,0.2,0.1
1,1,2020-02-04,FA0240932,THE GREAT ROOM CAFE,THE GREAT ROOM CAFE,ACTIVE,2810 ARTESIA BLVD,REDONDO BEACH,CA,90278,...,216.0,231.0,26.0,84,0.1129,93.0,0.2,0.2,0.2,0.1
2,2,2020-02-04,FA0240932,THE GREAT ROOM CAFE,THE GREAT ROOM CAFE,ACTIVE,2810 ARTESIA BLVD,REDONDO BEACH,CA,90278,...,216.0,231.0,26.0,84,0.1129,93.0,0.2,0.2,0.2,0.1
3,3,2020-02-04,FA0240932,THE GREAT ROOM CAFE,THE GREAT ROOM CAFE,ACTIVE,2810 ARTESIA BLVD,REDONDO BEACH,CA,90278,...,216.0,231.0,26.0,84,0.1129,93.0,0.2,0.2,0.2,0.1
4,4,2020-02-04,FA0240932,THE GREAT ROOM CAFE,THE GREAT ROOM CAFE,ACTIVE,2810 ARTESIA BLVD,REDONDO BEACH,CA,90278,...,216.0,231.0,26.0,84,0.1129,93.0,0.2,0.2,0.2,0.1
5,5,2019-08-06,FA0251477,FISH DISH,FISH DISH,ACTIVE,5300 LANKERSHIM BLVD,BURBANK,CA,91601,...,167.0,201.0,22.0,5,0.0990,62.2,0.2,0.3,0.2,0.2
6,6,2019-08-06,FA0251477,FISH DISH,FISH DISH,ACTIVE,5300 LANKERSHIM BLVD,BURBANK,CA,91601,...,167.0,201.0,22.0,5,0.0990,62.2,0.2,0.3,0.2,0.2
7,7,2019-08-06,FA0251477,FISH DISH,FISH DISH,ACTIVE,5300 LANKERSHIM BLVD,BURBANK,CA,91601,...,167.0,201.0,22.0,5,0.0990,62.2,0.2,0.3,0.2,0.2
8,8,2019-08-06,FA0251477,FISH DISH,FISH DISH,ACTIVE,5300 LANKERSHIM BLVD,BURBANK,CA,91601,...,167.0,201.0,22.0,5,0.0990,62.2,0.2,0.3,0.2,0.2
9,9,2019-05-08,FA0033765,SONOMA WINE GARDEN,SONOMA WINE GARDEN,ACTIVE,395 SANTA MONICA PL,SANTA MONICA,CA,90401,...,712.0,509.0,35.0,72,0.1488,83.2,0.4,0.2,0.2,0.2


In [18]:
# Drop columns unnecessary for ML
rats_ML_df = new_rats_df.drop(["FACILITY_STATE", "GEONAME", "FACILITY_ADDRESS", "FACILITY_ID", 
                           "Unnamed: 0", "PROGRAM_NAME", "FACILITY_ZIP", "LAT", "LNG", "FACILITY_NAME", "GRADE",
                           "serial_number", "violation_description", "ACTIVITY_DATE"],1)
rats_ML_df

Unnamed: 0,PROGRAM_STATUS,FACILITY_CITY,SCORE,SEATS,violation_status,violation_code,points,Unnamed: 0.1,Pop_Tot,Prop_18y,...,No_hless,Rte_crim,Rte_alco,Propt_envi,Prop_depr,Propt_HPI,Prop_fru,Prop_bev,Prop_hyp,Prop_marj
0,ACTIVE,REDONDO BEACH,91,61-150,OUT OF COMPLIANCE,F007,4.0,66.0,69316.0,0.2056,...,216.0,231.0,26.0,84,0.1129,93.0,0.2,0.2,0.2,0.1
1,ACTIVE,REDONDO BEACH,91,61-150,OUT OF COMPLIANCE,F034,1.0,66.0,69316.0,0.2056,...,216.0,231.0,26.0,84,0.1129,93.0,0.2,0.2,0.2,0.1
2,ACTIVE,REDONDO BEACH,91,61-150,OUT OF COMPLIANCE,F040,1.0,66.0,69316.0,0.2056,...,216.0,231.0,26.0,84,0.1129,93.0,0.2,0.2,0.2,0.1
3,ACTIVE,REDONDO BEACH,91,61-150,OUT OF COMPLIANCE,F037,1.0,66.0,69316.0,0.2056,...,216.0,231.0,26.0,84,0.1129,93.0,0.2,0.2,0.2,0.1
4,ACTIVE,REDONDO BEACH,91,61-150,OUT OF COMPLIANCE,F006,2.0,66.0,69316.0,0.2056,...,216.0,231.0,26.0,84,0.1129,93.0,0.2,0.2,0.2,0.1
5,ACTIVE,BURBANK,92,0-30,OUT OF COMPLIANCE,F007,4.0,9.0,104692.0,0.1820,...,167.0,201.0,22.0,5,0.0990,62.2,0.2,0.3,0.2,0.2
6,ACTIVE,BURBANK,92,0-30,OUT OF COMPLIANCE,F055,2.0,9.0,104692.0,0.1820,...,167.0,201.0,22.0,5,0.0990,62.2,0.2,0.3,0.2,0.2
7,ACTIVE,BURBANK,92,0-30,OUT OF COMPLIANCE,F033,1.0,9.0,104692.0,0.1820,...,167.0,201.0,22.0,5,0.0990,62.2,0.2,0.3,0.2,0.2
8,ACTIVE,BURBANK,92,0-30,OUT OF COMPLIANCE,F044,1.0,9.0,104692.0,0.1820,...,167.0,201.0,22.0,5,0.0990,62.2,0.2,0.3,0.2,0.2
9,ACTIVE,SANTA MONICA,90,61-150,OUT OF COMPLIANCE,F033,1.0,73.0,93409.0,0.1429,...,712.0,509.0,35.0,72,0.1488,83.2,0.4,0.2,0.2,0.2


In [19]:
# Generate our categorical variable list
rats_cat = rats_ML_df.dtypes[rats_ML_df.dtypes == "object"].index.tolist()
print(rats_cat)

['PROGRAM_STATUS', 'FACILITY_CITY', 'SEATS', 'violation_status', 'violation_code']


In [20]:
# Check the number of unique values in each column
rats_ML_df[rats_cat].nunique()

PROGRAM_STATUS       2
FACILITY_CITY       65
SEATS                4
violation_status     2
violation_code      70
dtype: int64

In [21]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

In [22]:
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(rats_ML_df[rats_cat]))

In [23]:
encode_df.columns = enc.get_feature_names(rats_cat)
encode_df.head()

Unnamed: 0,PROGRAM_STATUS_ACTIVE,PROGRAM_STATUS_INACTIVE,FACILITY_CITY_ALHAMBRA,FACILITY_CITY_ALTADENA,FACILITY_CITY_ARCADIA,FACILITY_CITY_AZUSA,FACILITY_CITY_BALDWIN PARK,FACILITY_CITY_BELL,FACILITY_CITY_BELL GARDENS,FACILITY_CITY_BELLFLOWER,...,violation_code_MF41,violation_code_MF45,violation_code_W019,violation_code_W020,violation_code_W023,violation_code_W026,violation_code_W032,violation_code_W034,violation_code_W050,violation_code_W052
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Merge one-hot encoded features and drop the originals
rats_ML_df = rats_ML_df.merge(encode_df,left_index=True, right_index=True)
rats_ML_df = rats_ML_df.drop(rats_cat,1)
rats_ML_df

Unnamed: 0,SCORE,points,Unnamed: 0.1,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,...,violation_code_MF41,violation_code_MF45,violation_code_W019,violation_code_W020,violation_code_W023,violation_code_W026,violation_code_W032,violation_code_W034,violation_code_W050,violation_code_W052
0,91,4.0,66.0,69316.0,0.2056,0.6614,0.1330,0.0272,0.1567,0.6806,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,91,1.0,66.0,69316.0,0.2056,0.6614,0.1330,0.0272,0.1567,0.6806,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,91,1.0,66.0,69316.0,0.2056,0.6614,0.1330,0.0272,0.1567,0.6806,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,91,1.0,66.0,69316.0,0.2056,0.6614,0.1330,0.0272,0.1567,0.6806,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,91,2.0,66.0,69316.0,0.2056,0.6614,0.1330,0.0272,0.1567,0.6806,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,92,4.0,9.0,104692.0,0.1820,0.6683,0.1497,0.0249,0.2551,0.5918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,92,2.0,9.0,104692.0,0.1820,0.6683,0.1497,0.0249,0.2551,0.5918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,92,1.0,9.0,104692.0,0.1820,0.6683,0.1497,0.0249,0.2551,0.5918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,92,1.0,9.0,104692.0,0.1820,0.6683,0.1497,0.0249,0.2551,0.5918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,90,1.0,73.0,93409.0,0.1429,0.6918,0.1653,0.0374,0.1333,0.7266,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Remove SCORE target from features data
y = rats_ML_df.SCORE.values
X = rats_ML_df.drop(columns=["SCORE"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Standard Vector Machine (Classifier) Model

In [26]:
# Create the SVM model
svm = LinearSVC(max_iter=1000)

In [27]:
# Train the model
svm.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [28]:
# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SVM model accuracy: 0.079
