In [1]:
import pandas as pd
from collections import Counter
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

###### Reading data into a pandas dataframe

In [2]:
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in pandas 1.0.
# read_csv with index_col=0 and parse_dates=True mirrors from_csv's defaults,
# so the first CSV column still becomes the index.
df = pd.read_csv(
    'C:/Users/Piyush/Google Drive/MScA/nlp/assignment_4/Food_Inspections.csv',
    index_col=0,
    parse_dates=True,
)

  """Entry point for launching an IPython kernel.


###### Dropping rows where we don't have any violations

In [3]:
# Keep only inspections that have violation text. Rebinding the name (instead of
# inplace=True) keeps the step a plain expression that returns the new frame.
df = df.dropna(subset=['Violations'])

###### Dropping columns which are not needed

In [4]:
# Discard the metadata columns that the text model does not use; only
# 'Results' and 'Violations' remain afterwards. `columns=` replaces the
# labels/axis=1 pair, and the result is rebound rather than mutated in place.
df = df.drop(columns=[
    'DBA Name', 'AKA Name', 'License #', 'Facility Type', 'Risk',
    'Address', 'City', 'State', 'Zip',
    'Inspection Date', 'Inspection Type',
    'Latitude', 'Longitude', 'Location',
])

###### Printing unique values for Results

In [5]:
# Distinct inspection outcomes, in order of first appearance.
pd.unique(df['Results'])

array(['Pass w/ Conditions', 'Fail', 'Pass', 'No Entry',
       'Out of Business', 'Not Ready'], dtype=object)

###### We are going to drop rows where Result is not "Pass", "Fail", or "Pass w/ Conditions"

In [6]:
# Keep only inspections with a definitive outcome; the other result values
# ('No Entry', 'Out of Business', 'Not Ready') are filtered out.
df = df[df['Results'].isin(['Pass w/ Conditions', 'Fail', 'Pass'])]

In [7]:
# Row/column count after keeping only Pass, Fail and Pass w/ Conditions inspections.
df.shape

(135102, 2)

###### Some restaurants have more than one violation. The violations are delimited by "|", so we use a regex to split each record's violations into a list.

In [8]:
# Raw violation text of the second row, as a sample of the input format.
df['Violations'].iloc[1]

'1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOWLEDGE, AND PERFORMS DUTIES - Comments: NO PERSON IN CHARGE PRESENT AT THE TIME OF INSPECTION. PRIORITY FOUNDATION VIOLATION. NO CITATION ISSUED. | 3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL EMPLOYEE; KNOWLEDGE, RESPONSIBILITIES AND REPORTING - Comments: NOTED NO EMPLOYEE HEALTH POLICY ON SITE AT THE TIME OF INSPECTION FOLLOWING THE NEW CODE AND GUIDELINES. INSTRUCTED TO HAVE VERIFICATION OF POLICY FOR EACH EMPLOYEE ONE ON SITE AT ALL TIMES. PRIORITY FOUNDATION #7-38-010. NO CITATION ISSUED TODAY | 5. PROCEDURES FOR RESPONDING TO VOMITING AND DIARRHEAL EVENTS - Comments: 2-501.11 MANAGEMENT UNABLE TO PROVIDE A PROCEDURE FOR CLEANING UP VOMITING AND DIARRHEA.INSTRUCTED TO HAVE ONE. PRIORITY FOUNDATION #7-38-005. NO CITATION ISSUED - | 50. HOT & COLD WATER AVAILABLE; ADEQUATE PRESSURE - Comments: NOTED NO HOT RUNNING WATER AT THE KITCHEN PREP AREA HAND WASH SINK AND 3 COMPARTMENT SINK AND EMPLOYEE BATHROOMS HAND WASH SINK AND ONE OF 3 HAND 

In [9]:
# Break each record into its individual violations at every run of '|' characters.
df['Violations_List'] = df['Violations'].map(re.compile("[|]+").split)

In [10]:
# Same sample row, now as a list with one entry per violation.
df['Violations_List'].iloc[1]

['1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOWLEDGE, AND PERFORMS DUTIES - Comments: NO PERSON IN CHARGE PRESENT AT THE TIME OF INSPECTION. PRIORITY FOUNDATION VIOLATION. NO CITATION ISSUED. ',
 ' 3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL EMPLOYEE; KNOWLEDGE, RESPONSIBILITIES AND REPORTING - Comments: NOTED NO EMPLOYEE HEALTH POLICY ON SITE AT THE TIME OF INSPECTION FOLLOWING THE NEW CODE AND GUIDELINES. INSTRUCTED TO HAVE VERIFICATION OF POLICY FOR EACH EMPLOYEE ONE ON SITE AT ALL TIMES. PRIORITY FOUNDATION #7-38-010. NO CITATION ISSUED TODAY ',
 ' 5. PROCEDURES FOR RESPONDING TO VOMITING AND DIARRHEAL EVENTS - Comments: 2-501.11 MANAGEMENT UNABLE TO PROVIDE A PROCEDURE FOR CLEANING UP VOMITING AND DIARRHEA.INSTRUCTED TO HAVE ONE. PRIORITY FOUNDATION #7-38-005. NO CITATION ISSUED - ',
 ' 50. HOT & COLD WATER AVAILABLE; ADEQUATE PRESSURE - Comments: NOTED NO HOT RUNNING WATER AT THE KITCHEN PREP AREA HAND WASH SINK AND 3 COMPARTMENT SINK AND EMPLOYEE BATHROOMS HAND WASH SINK AND O

###### Removing Health Code and keeping just the comments

In [11]:
def aggregate_comments(violations):
    """Strip the health-code header from each violation and merge the comments.

    Each violation string is split on the literal "- Comments:" marker. When the
    marker is present, the text right after its first occurrence is trimmed and
    upper-cased; violations without the marker contribute nothing.

    Parameters
    ----------
    violations : iterable of str
        Individual violation strings for one inspection.

    Returns
    -------
    str
        The extracted comments joined with "," (no space), or "" when no
        violation contains a comment.
    """
    split_entries = (re.split("- Comments:", entry) for entry in violations)
    # Joining zero items yields "" and joining one item yields that item,
    # so a single join covers every case.
    return ",".join(
        parts[1].strip().upper()
        for parts in split_entries
        if len(parts) > 1
    )

In [12]:
# One comment string per inspection, built from its list of violations.
df['Comments'] = df['Violations_List'].map(aggregate_comments)

In [13]:
# Aggregated comment text for the same sample row.
df['Comments'].iloc[1]

'NO PERSON IN CHARGE PRESENT AT THE TIME OF INSPECTION. PRIORITY FOUNDATION VIOLATION. NO CITATION ISSUED.,NOTED NO EMPLOYEE HEALTH POLICY ON SITE AT THE TIME OF INSPECTION FOLLOWING THE NEW CODE AND GUIDELINES. INSTRUCTED TO HAVE VERIFICATION OF POLICY FOR EACH EMPLOYEE ONE ON SITE AT ALL TIMES. PRIORITY FOUNDATION #7-38-010. NO CITATION ISSUED TODAY,2-501.11 MANAGEMENT UNABLE TO PROVIDE A PROCEDURE FOR CLEANING UP VOMITING AND DIARRHEA.INSTRUCTED TO HAVE ONE. PRIORITY FOUNDATION #7-38-005. NO CITATION ISSUED -,NOTED NO HOT RUNNING WATER AT THE KITCHEN PREP AREA HAND WASH SINK AND 3 COMPARTMENT SINK AND EMPLOYEE BATHROOMS HAND WASH SINK AND ONE OF 3 HAND WASH SINKS AT THE BOYS BATHROOM. TEMPERATURE OF WATER WAS AS FOLLOWS:-KITCHEN HAND WASH SINK 80.3F, 3 COMPARTMENT SINK WAS 85.2F, NO HOT OR COLD RUNNING WATER AT THE HAND WASH SINK OF THE BOYS BATHROOM (BAD SENSOR) AND EMPLOYEE BATHROOMS 81.3F AND 45F. MANAGER IMMEDIATELY CALLED FOR REPAIRS. PRIORITY VIOLATION #7-38-030(C)'

###### Encoding Results as integer class labels (0 = Pass w/ Conditions, 1 = Pass, 2 = Fail)

In [14]:
# Integer label for each of the three retained outcomes.
df['Results_Flag'] = df['Results'].map({
    'Pass w/ Conditions': 0,
    'Pass': 1,
    'Fail': 2,
})

In [15]:
# Preview the first five rows with the derived columns.
df.head(n=5)

Unnamed: 0_level_0,Results,Violations,Violations_List,Comments,Results_Flag
Inspection ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2282603,Pass w/ Conditions,25. CONSUMER ADVISORY PROVIDED FOR RAW/UNDERCO...,[25. CONSUMER ADVISORY PROVIDED FOR RAW/UNDERC...,ALL MENU ITEMS THAT PERTAIN TO THE CONSUMER AD...,0
2282582,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...","[1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNO...",NO PERSON IN CHARGE PRESENT AT THE TIME OF INS...,2
2282567,Pass,47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE...,[47. FOOD & NON-FOOD CONTACT SURFACES CLEANABL...,MUST CLEAN AND SANITIZE ALL CHOPPING BOARDS AN...,1
2282535,Pass w/ Conditions,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,[5. PROCEDURES FOR RESPONDING TO VOMITING AND ...,OBSERVED NO PROCEDURE FOR RESPONDING TO VOMITI...,0
2282536,Pass w/ Conditions,16. FOOD-CONTACT SURFACES: CLEANED & SANITIZED...,[16. FOOD-CONTACT SURFACES: CLEANED & SANITIZE...,FOUND LOW TEMPERATURE DISH WASHING MACHINE FIN...,0


###### Splitting the data into train and test

In [16]:
# Model inputs: the aggregated comment text is the predictor and the
# integer-encoded inspection result is the target.
X = df['Comments']
y = df['Results_Flag']
print(X.shape)
print(y.shape)

(135102,)
(135102,)


In [17]:
# Hold out 25% of the rows (train_test_split's default, spelled out here) for
# evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(101326,)
(33776,)
(101326,)
(33776,)


###### Creating models using different vectorizing techniques

In [18]:
# Collects one (description, feature count, NB accuracy, LR accuracy) tuple per experiment.
RESULTS = []

In [19]:
def model_selection(vectorizer, vectorizer_description):
    """Vectorize the comments and score two classifiers on the held-out split.

    The vectorizer is fitted on the notebook-level ``X_train`` and then applied
    to ``X_test``. A multinomial Naive Bayes model and a logistic regression
    model are each trained on the resulting training matrix with ``y_train``
    and scored against ``y_test``.

    Parameters
    ----------
    vectorizer
        Object exposing ``fit_transform`` and ``transform`` (e.g. a
        CountVectorizer or TfidfVectorizer instance).
    vectorizer_description : str
        Label returned unchanged as the first element of the result.

    Returns
    -------
    tuple
        ``(vectorizer_description, number of columns in the training matrix,
        Naive Bayes accuracy, logistic regression accuracy)``.
    """
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)

    # Naive Bayes first, logistic regression second: the order fixes the
    # positions of the two accuracies in the returned tuple.
    accuracies = []
    for classifier in (MultinomialNB(), LogisticRegression()):
        classifier.fit(train_matrix, y_train)
        predictions = classifier.predict(test_matrix)
        accuracies.append(accuracy_score(y_test, predictions))
    nb_accuracy, logit_accuracy = accuracies

    return (
        vectorizer_description,
        train_matrix.shape[1],
        nb_accuracy,
        logit_accuracy,
    )

In [20]:
# Experiment 1: raw term counts over single words.
RESULTS.append(
    model_selection(
        vectorizer=CountVectorizer(stop_words='english'),
        vectorizer_description="CountVectorizer with uni-grams",
    )
)

In [21]:
# Experiment 2: raw term counts over uni-, bi- and tri-grams.
RESULTS.append(
    model_selection(
        vectorizer=CountVectorizer(stop_words='english', ngram_range=(1, 3)),
        vectorizer_description="CountVectorizer with n-grams",
    )
)

In [22]:
# Experiment 3: TF-IDF weights over single words.
RESULTS.append(
    model_selection(
        vectorizer=TfidfVectorizer(stop_words='english'),
        vectorizer_description="TfidfVectorizer with uni-grams",
    )
)

In [23]:
# Experiment 4: TF-IDF weights over uni-, bi- and tri-grams.
RESULTS.append(
    model_selection(
        vectorizer=TfidfVectorizer(stop_words='english', ngram_range=(1, 3)),
        vectorizer_description="TfidfVectorizer with n-grams",
    )
)

In [24]:
# Tabulate every experiment side by side for comparison.
pd.DataFrame(
    RESULTS,
    columns=[
        'Vectorizer',
        'Feature Space',
        'Naive Bayes Accuracy',
        'Logistic Accuracy',
    ],
)

Unnamed: 0,Vectorizer,Feature Space,Naive Bayes Accuracy,Logistic Accuracy
0,CountVectorizer with uni-grams,37054,0.868664,0.936493
1,CountVectorizer with n-grams,2300859,0.879944,0.942829
2,TfidfVectorizer with uni-grams,37054,0.870944,0.932348
3,TfidfVectorizer with n-grams,2300859,0.831093,0.934924


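###### Adding n-grams (up to tri-grams) improves accuracy only slightly for most models (e.g. logistic regression with CountVectorizer goes from 0.936 to 0.943) and actually lowers Naive Bayes accuracy with TfidfVectorizer (0.871 to 0.831), while the feature space explodes from 37,054 to 2,300,859 features. The gain in model performance is negligible compared to the increase in feature space, so for now we should consider the simpler model (with uni-grams). With uni-grams, Tfidf and CountVectorizer perform about equally well; I would probably go with CountVectorizer as it is the simpler model.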