In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Create file path for datasets
file_path_fv = "./resources/food_violations_seattle.csv"

# Read csv as dataframe
fv_df = pd.read_csv(file_path_fv)

# view df
fv_df.head(5)

Unnamed: 0.1,Unnamed: 0,inspection_id,restaurant_id,inspection_period_start_date,inspection_period_end_date,inspection_penalty_score,inspection_average_prev_penalty_scores,inspection_prev_penalty_score,cuisines,zip_code,review_count,non_positive_review_count,average_review_rating,review_contents
0,0,DA2383404,ZriNDCVxkCdVEO-X2sjHFw,2012-02-06 00:00:00,2012-10-02 00:00:00,46,54.0,50,"['Japanese', 'Restaurants']",98116,1,0,5.0,We went here this past weekend with zero expec...
1,1,DA2304227,ZriNDCVxkCdVEO-X2sjHFw,2011-11-03 00:00:00,2012-02-06 00:00:00,82,86.0,76,"['Japanese', 'Restaurants']",98116,3,0,4.0,Lots of young families . Kind of sparse &#160;...
2,2,DA2278690,ZriNDCVxkCdVEO-X2sjHFw,2011-01-06 00:00:00,2011-11-03 00:00:00,24,67.0,41,"['Japanese', 'Restaurants']",98116,7,0,4.428571,This place is a gem! My husband and I didn't h...
3,3,DA2183562,ZriNDCVxkCdVEO-X2sjHFw,2010-08-26 00:00:00,2011-01-06 00:00:00,71,89.0,28,"['Japanese', 'Restaurants']",98116,3,1,3.333333,"Honestly, my expectations were not that high. ..."
4,4,DA2142912,ZriNDCVxkCdVEO-X2sjHFw,2010-02-25 00:00:00,2010-08-26 00:00:00,64,69.0,60,"['Japanese', 'Restaurants']",98116,4,2,3.25,"Yep, like it says.My friend and I were on the ..."


In [4]:
# Create file path for datasets
file_path_cuisine = "./resources/cuisines.csv"

# Read csv as dataframe
cuisine_df = pd.read_csv(file_path_cuisine)

# view df
cuisine_df.head(5)

Unnamed: 0,x,asian,ethnic
0,Japanese,1,1
1,Restaurants,0,0
2,Sandwiches,0,0
3,Vietnamese,1,1
4,Delis,0,0


In [5]:
# filter cuisines to display only asian + ethnicity variables 
new_cuisine_df = cuisine_df.query('asian == 1 or ethnic == 1')

In [6]:
# rename columns to match with main dataset
new_cuisine_df.rename(columns={'x': 'cuisines'}, inplace=True)
new_cuisine_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_cuisine_df.rename(columns={'x': 'cuisines'}, inplace=True)


Unnamed: 0,cuisines,asian,ethnic
0,Japanese,1,1
3,Vietnamese,1,1
5,Dim Sum,1,1
7,Cantonese,1,1
8,Chinese,1,1


In [12]:
# filter by inspection_penalty_score of 10 or more
new_fv_df = fv_df.query('inspection_penalty_score >= 10')
new_fv_df.head(25)

Unnamed: 0.1,Unnamed: 0,inspection_id,restaurant_id,inspection_period_start_date,inspection_period_end_date,inspection_penalty_score,inspection_average_prev_penalty_scores,inspection_prev_penalty_score,cuisines,zip_code,review_count,non_positive_review_count,average_review_rating,review_contents
0,0,DA2383404,ZriNDCVxkCdVEO-X2sjHFw,2012-02-06 00:00:00,2012-10-02 00:00:00,46,54.0,50,"['Japanese', 'Restaurants']",98116,1,0,5.0,We went here this past weekend with zero expec...
1,1,DA2304227,ZriNDCVxkCdVEO-X2sjHFw,2011-11-03 00:00:00,2012-02-06 00:00:00,82,86.0,76,"['Japanese', 'Restaurants']",98116,3,0,4.0,Lots of young families . Kind of sparse &#160;...
2,2,DA2278690,ZriNDCVxkCdVEO-X2sjHFw,2011-01-06 00:00:00,2011-11-03 00:00:00,24,67.0,41,"['Japanese', 'Restaurants']",98116,7,0,4.428571,This place is a gem! My husband and I didn't h...
3,3,DA2183562,ZriNDCVxkCdVEO-X2sjHFw,2010-08-26 00:00:00,2011-01-06 00:00:00,71,89.0,28,"['Japanese', 'Restaurants']",98116,3,1,3.333333,"Honestly, my expectations were not that high. ..."
4,4,DA2142912,ZriNDCVxkCdVEO-X2sjHFw,2010-02-25 00:00:00,2010-08-26 00:00:00,64,69.0,60,"['Japanese', 'Restaurants']",98116,4,2,3.25,"Yep, like it says.My friend and I were on the ..."
5,5,DA2081169,ZriNDCVxkCdVEO-X2sjHFw,2009-12-04 00:00:00,2010-02-25 00:00:00,53,69.0,50,"['Japanese', 'Restaurants']",98116,1,1,2.0,i recently ate here....next time you'll have t...
6,6,DA2070387,ZriNDCVxkCdVEO-X2sjHFw,2009-03-17 00:00:00,2009-12-04 00:00:00,43,47.0,33,"['Japanese', 'Restaurants']",98116,1,1,3.0,After my first 3 visits I boldly proclaimed th...
7,7,DA1961408,ZriNDCVxkCdVEO-X2sjHFw,2008-09-04 00:00:00,2009-03-17 00:00:00,83,94.0,58,"['Japanese', 'Restaurants']",98116,2,0,4.0,The best teriyaki I have had in Seattle!!! &#1...
8,8,DA1892158,ZriNDCVxkCdVEO-X2sjHFw,2008-02-04 00:00:00,2008-09-04 00:00:00,63,65.0,45,"['Japanese', 'Restaurants']",98116,1,1,3.0,"Clean joint, good selection of food, friendly ..."
15,15,DA2411831,jJp7BaSZ4eu0MfqkZjuJGw,2012-08-21 00:00:00,2013-02-05 00:00:00,42,51.0,45,"['Vietnamese', 'Restaurants']",98106,3,2,3.0,"I have been here twice, service is good, perso..."


In [8]:
# merge data frames from cuisines column to include only 1 values 


In [9]:
# keep important columns such as inspection penalty scores and cuisne 

In [10]:
# Generate a logistical regression 