## Assignment #4 - Luke Schwenke

#### Create classification model, predicting the outcome of food safety inspection based on the inspectors’ comments

In [170]:
import pandas as pd
import requests
import re
import numpy as np
from pandarallel import pandarallel
from textblob import TextBlob
import sklearn
from textblob.sentiments import NaiveBayesAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics
from sklearn.pipeline import make_pipeline
import numpy as np
import eli5

In [98]:
import warnings
warnings.filterwarnings("ignore")

In [99]:
import multiprocessing
num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

Available CPUs: 8


In [100]:
pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False)

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [101]:
%%time

# Define the API endpoint and parameters
url = "https://data.cityofchicago.org/resource/cwig-ma7x.json"

# Fetch the total count of records
def get_total_count(url):
    params = {
        "$select": "count(*)"
    }
    response = requests.get(url, params=params)
    data = response.json()
    return int(data[0]['count'])

total_count = get_total_count(url)
print(f"Total number of records available: {total_count:,.0f}")

Total number of records available: 266,602
CPU times: user 30 ms, sys: 3.48 ms, total: 33.5 ms
Wall time: 784 ms


In [102]:
%%time

# Fetch data and load it into a pandas DataFrame
def fetch_data(url, params):
    response = requests.get(url, params=params)
    data = response.json()
    return pd.DataFrame(data)

# Set the limit parameter equal to the number of available records
params = {
    "$limit": total_count
}

# Fetch the data and load it into a DataFrame
df = fetch_data(url, params)

print(f'Number of records retrieved: {df.shape[0]:,.0f}')

Number of records retrieved: 266,602
CPU times: user 5.3 s, sys: 554 ms, total: 5.85 s
Wall time: 25.8 s


In [103]:
df.head(1)

Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,address,city,state,zip,...,results,violations,latitude,longitude,location,:@computed_region_awaf_s7ux,:@computed_region_6mkv_f3dw,:@computed_region_vrxf_vc4k,:@computed_region_bdys_3d7i,:@computed_region_43wa_7qmu
0,2588404,SAMI BOY 2,SAMI BOY 2,2951502,Grocery Store,Risk 2 (Medium),3730 W DIVISION ST,CHICAGO,IL,60651,...,Pass,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.90283211340331,-87.7202778317596,"{'type': 'Point', 'coordinates': [-87.72027783...",41,4299,24,454,46


### Extract free-form text comments from inspectors

In [104]:
#df_clean = df[df['results'] == 'Fail']

# Drop rows with NaN in the 'violations' column
df_clean = df.dropna(subset=['violations'])

In [342]:
comments_pattern = r'Comments: (.*?)(?:\||$)'
df_clean['text_comments'] = df_clean['violations'].apply(lambda x: re.findall(comments_pattern, x))

In [None]:
print("Here is a sample comment:", df_clean['text_comments'][0])

Here is a sample comment: ['OBSERVED NO VERIFIABLE EMPLOYEE HEALTH POLICY ON SITE AT TIME OF INSPECTION. LEFT TEMPLATE AND INSTRUCTED TO MAINTAIN COPIES OF VERIFIABLE HEALTH POLICY SIGNED BY ALL EMPLOYEES ON SITE AT ALL TIMES. PRIORITY FOUNDATION VIOLATION 7-38-010. NO CITATION ISSUED ', 'OBSERVED THE FRONT EXIT DOOR NOT COMPLETELY RODENT PROOFED AS REQUIRED. MUST COMPLETELY RODENT PROOF DOOR BY SEALING 1/4 GAP AT THE BOTTOM OF THE DOOR.  ', 'OBSERVED WASHROOM DOOR NOT SELF-CLOSING. INSTRUCTED MANAGER TO REPAIR SELF-CLOSING DEVICE ON WASHROOM DOOR.  ', 'OBSERVED SOILED MOP HEADS NOT PROPERLY STORED. INSTRUCTED MANAGER TO HANG MOP HEADS TO PREVENT INSECT BREEDING. ']


In [106]:
def list_to_string(text_list):
    return ' '.join(text_list)

# Apply the function to each element in the 'text_comments' column
df_clean['text_comments_single_string'] = df_clean['text_comments'].apply(list_to_string)

In [107]:
df_clean['text_comments_single_string'] = df_clean['text_comments_single_string'].apply(lambda x: ' '.join(re.findall(r'\w+', x.lower())))

In [108]:
print("Here is a CLEANED sample comment:", df_clean['text_comments_single_string'][0])

Here is a CLEANED sample comment: observed no verifiable employee health policy on site at time of inspection left template and instructed to maintain copies of verifiable health policy signed by all employees on site at all times priority foundation violation 7 38 010 no citation issued observed the front exit door not completely rodent proofed as required must completely rodent proof door by sealing 1 4 gap at the bottom of the door observed washroom door not self closing instructed manager to repair self closing device on washroom door observed soiled mop heads not properly stored instructed manager to hang mop heads to prevent insect breeding


### Alternatively, split into individual comments which will increase the number of rows / samples
### I will use this as the *X* variable

In [258]:
df_exploded = df_clean.explode('text_comments').reset_index()

In [261]:
df_exploded['text_comments'][0]

'OBSERVED NO VERIFIABLE EMPLOYEE HEALTH POLICY ON SITE AT TIME OF INSPECTION. LEFT TEMPLATE AND INSTRUCTED TO MAINTAIN COPIES OF VERIFIABLE HEALTH POLICY SIGNED BY ALL EMPLOYEES ON SITE AT ALL TIMES. PRIORITY FOUNDATION VIOLATION 7-38-010. NO CITATION ISSUED '

## Model Preparation for Multinomial Classification

In [118]:
print("The different possible inspection outcomes are as follows:\n", list(df_exploded.results.unique()))

The different possible inspection outcomes are as follows:
 ['Pass', 'Fail', 'Pass w/ Conditions', 'No Entry', 'Not Ready', 'Out of Business']


#### I will encode the labelled *y* variable as:

* 1 = Fail
* 2 = Pass
* 3 = Pass w/ Conditions
* 4 = No Entry
* 5 = Not Ready
* 6 = Out of Business

In [119]:
encoding = {
    'Fail': 1,
    'Pass': 2,
    'Pass w/ Conditions': 3,
    'No Entry': 4,
    'Not Ready': 5,
    'Out of Business': 6
}

df_exploded['results_labels'] = df_exploded['results'].map(encoding)

In [332]:
# define X and y
X = df_exploded['text_comments']
y = df_exploded['results_labels']

print(f"X Shape: {X.shape}")
print(f"y Shape: {y.shape}")

X Shape: (873757,)
y Shape: (873757,)


In [193]:
y.value_counts()

2    332892
1    321370
3    215259
4      3654
5       454
6       128
Name: results_labels, dtype: int64

#### *Note: the above results indicate the results are highly imbalanced*

In [333]:
# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123123)

print(f"Training records, X_train: {X_train.shape} y_train: {y_train.shape}")
print(f"Testing records, X_test: {X_test.shape} y_test: {y_test.shape}")

Training records, X_train: (655317,) y_train: (655317,)
Testing records, X_test: (218440,) y_test: (218440,)


In [334]:
# Reset the indices of X_train and y_train
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
# Remove NaN values from X_train
X_train = X_train.dropna()
# Remove corresponding rows from y_train based on the NaN removal from X_train
y_train = y_train[X_train.index]

# Reset the indices of X_test and y_test
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
# Remove NaN values from X_test
X_test = X_test.dropna()
# Remove corresponding rows from y_test based on the NaN removal from X_test
y_test = y_test[X_test.index]

In [335]:
# Verify shapes are equal for train and test sets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape, '\n')
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (655193,)
y_train shape: (655193,) 

X_test shape: (218392,)
y_test shape: (218392,)


## Model #1: Naive-Bayes

In [173]:
# Instantiate NB pipeline
pipe_nb = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', max_df=0.8, min_df=0.2, max_features=1000, ngram_range=(1,5)),
    MultinomialNB()
)

In [172]:
# Verify there are no NaN before model training
nan_count_X_train = X_train.isna().sum()
nan_count_y_train = y_train.isna().sum()
print("Number of NaN values in X_train:", nan_count_X_train)
print("Number of NaN values in y_train:", nan_count_y_train)

Number of NaN values in X_train: 0
Number of NaN values in y_train: 0


In [175]:
#%time nb.fit(countvectorizer.fit_transform(X_train), y_train)
%time pipe_nb.fit(X_train, y_train);

CPU times: user 1min 3s, sys: 1.68 s, total: 1min 5s
Wall time: 1min 6s


In [176]:
# make class predictions
%time y_pred = pipe_nb.predict(X_test)

CPU times: user 10 s, sys: 34.2 ms, total: 10.1 s
Wall time: 10.1 s


In [177]:
# calculate accuracy of class predictions
print(f"Test Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.1f}%")

Test Accuracy: 44.4%


In [178]:
# calculate precision and recall
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.43      0.38      0.40     80728
           2       0.47      0.66      0.55     82948
           3       0.38      0.22      0.28     53657
           4       0.00      0.00      0.00       910
           5       0.00      0.00      0.00       117
           6       0.00      0.00      0.00        32

    accuracy                           0.44    218392
   macro avg       0.21      0.21      0.20    218392
weighted avg       0.43      0.44      0.43    218392



In [179]:
# calculate the confusion matrix
print(metrics.confusion_matrix(y_test, y_pred))

[[30625 38043 12060     0     0     0]
 [21722 54591  6635     0     0     0]
 [18427 23532 11698     0     0     0]
 [  289   464   157     0     0     0]
 [   41    57    19     0     0     0]
 [   10    17     5     0     0     0]]


## Model #2: Multinomial Logistic Regression

In [216]:
pipe_logreg = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', max_df=0.8, min_df=0.2, max_features=1000, ngram_range=(1,5)),
    #TfidfVectorizer(lowercase=False, stop_words='english', max_df=0.8, min_df=0.2, max_features=1000, ngram_range=(1, 5)),
    LogisticRegression(max_iter=5000, multi_class='multinomial', class_weight='balanced', n_jobs=num_processors) #l1_ratio=0.5)
)

In [217]:
%time pipe_logreg.fit(X_train, y_train)

CPU times: user 1min 6s, sys: 2.55 s, total: 1min 8s
Wall time: 1min 19s


In [218]:
%time y_pred = pipe_logreg.predict(X_test)

CPU times: user 10 s, sys: 115 ms, total: 10.1 s
Wall time: 10.2 s


In [219]:
pd.Series(y_pred).value_counts()

6    55968
5    50281
2    49115
3    24851
1    24748
4    13429
dtype: int64

In [220]:
print(f"Test Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.1f}%") #46.2% originally

Test Accuracy: 20.9%


In [223]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.45      0.14      0.21     80728
           2       0.52      0.31      0.38     82948
           3       0.36      0.17      0.23     53657
           4       0.01      0.08      0.01       910
           5       0.00      0.36      0.00       117
           6       0.00      0.38      0.00        32

    accuracy                           0.21    218392
   macro avg       0.22      0.24      0.14    218392
weighted avg       0.45      0.21      0.28    218392



## Model #3: Support Vector

In [221]:
from sklearn.svm import SVC

#### The SGDClassifier in scikit-learn can be used for multinomial regression, but it typically works better for binary classification or linear models. For multinomial regression, especially when dealing with multiple classes and non-linear decision boundaries, using a Support Vector Machine (SVM) with a kernel like RBF (SVC with kernel='rbf') is often a better choice.

In [325]:
myCountVectorizer = CountVectorizer(lowercase=False, stop_words='english', max_df=0.8, min_df=0.2, max_features=1000, ngram_range=(1,10))
mySVC = SVC(max_iter=300, kernel='rbf')#, class_weight='balanced') 

pipe_svc = make_pipeline(
    myCountVectorizer,
    mySVC 
)

In [326]:
%time pipe_svc.fit(X_train, y_train)

CPU times: user 2min 35s, sys: 2min 28s, total: 5min 3s
Wall time: 5min 28s


In [327]:
%time y_pred = pipe_svc.predict(X_test)

CPU times: user 15.9 s, sys: 200 ms, total: 16.1 s
Wall time: 16.1 s


In [329]:
print(f"SVC RBF Kernel - Test Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.1f}%")
print(f"SVC RBF Kernel - Test F1-score: {metrics.f1_score(y_test, y_pred, average='weighted') * 100:.1f}%")

SVC RBF Kernel - Test Accuracy: 55.4%
SVC RBF Kernel - Test F1-score: 52.7%


### Try the best performing SVC RBF kernel with the TF-IDF vectorizer:

In [336]:
pipe_svc = make_pipeline(
    TfidfVectorizer(lowercase=False, stop_words='english', max_df=0.8, min_df=0.2, max_features=1000, ngram_range=(1,10)),
    SVC(max_iter=300, kernel='rbf')
)

In [337]:
%time pipe_svc.fit(X_train, y_train)

CPU times: user 4min 42s, sys: 2min 29s, total: 7min 12s
Wall time: 21min 54s


In [338]:
%time y_pred = pipe_svc.predict(X_test)

CPU times: user 1min 3s, sys: 152 ms, total: 1min 3s
Wall time: 1min 4s


In [339]:
print(f"TF-IDF Vectorizer - SVC RBF Kernel - Test Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.1f}%")

TF-IDF Vectorizer - SVC RBF Kernel - Test Accuracy: 37.1%


### *Note: the above ML classifiers perform very poorly when the prompted to balance the classes during model fitting*

# Summary:

#### The vectorizer I used in all instances of my models was CountVectorizer; however, I did try the TF-IDF vectorizer on my best performing model and it dropped the test accuracy. For this reason I decided to keep the regular CountVectorizer as my best pre-processing technique.

#### In addition to using the CountVectorizer I kept capitalizations since some words that are capitalized could have more meaning. The stop words were removed to reduce noise. I removed words that appear in more than 80% of the comments will be ignored since they probably do not have a lot of meaning. I did the same for words under 20%. I limited the terms to 1,000 maximum to help with computation and dimensionality, and lastly I set the ngram range to 1-5 or 1-10 to allow the text to capture more patterns/context.

#### The first Naive Bayes model had a test accuracy of 44.4%, the multinomial Logistic Regression had a test accuracy of 20.9%, and the Support Vector Classifier with an RBF kernel performed the best with a test accuracy of 55.4%. This is not very strong performance which leads me to believe there are ML models that could capture the patterns in the dataset better than the 3 algorithms I tried. 

#### The challenge of this problem was also made more difficult due to the results class being imbalanced. Even when accounting for this when balancing the class_weight parameter the test accuracy was still not 60%+. Future steps would be to try more models like Neural Networks, further tune model hyperparameters, and consider various ways to handle the class imbalance.

## (experiment) How do Logistic Regression and Support Vector Classification perform with binary classification?

In [282]:
pipe_svc = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', max_df=0.8, min_df=0.2, max_features=1000, ngram_range=(1,5)),
    SVC(max_iter=300, kernel='linear')#, class_weight='balanced') 
)

In [283]:
encoding_binary = {
    'Fail': 0,
    'Pass': 1
}

df_exploded['results_labels_binary'] = df_exploded['results'].map(encoding_binary)

In [284]:
df_exploded['results_labels_binary'].value_counts()

1.0    332892
0.0    321370
Name: results_labels_binary, dtype: int64

In [285]:
# define X and y
X = df_exploded['text_comments']
y = df_exploded['results_labels_binary']

print(f"X Shape: {X.shape}")
print(f"y Shape: {y.shape}")

X Shape: (873757,)
y Shape: (873757,)


In [286]:
# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123123)

print(f"Training records, X_train: {X_train.shape} y_train: {y_train.shape}")
print(f"Testing records, X_test: {X_test.shape} y_test: {y_test.shape}")

Training records, X_train: (655317,) y_train: (655317,)
Testing records, X_test: (218440,) y_test: (218440,)


In [287]:
# Reset the indices of X_train and y_train
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
# Remove NaN values from X_train
X_train = X_train.dropna()
# Remove corresponding rows from y_train based on the NaN removal from X_train
y_train = y_train[X_train.index]

# Reset the indices of X_test and y_test
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
# Remove NaN values from X_test
X_test = X_test.dropna()
# Remove corresponding rows from y_test based on the NaN removal from X_test
y_test = y_test[X_test.index]

###

X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
# Remove NaN values from X_train
y_train = y_train.dropna()
# Remove corresponding rows from y_train based on the NaN removal from X_train
X_train = X_train[y_train.index]

# Reset the indices of X_test and y_test
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
# Remove NaN values from X_test
y_test = y_test.dropna()
# Remove corresponding rows from y_test based on the NaN removal from X_test
X_test = X_test[y_test.index]

In [288]:
# Verify shapes are equal for train and test sets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape, '\n')
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (490436,)
y_train shape: (490436,) 

X_test shape: (163676,)
y_test shape: (163676,)


In [289]:
# Verify there are no NaN before model training
nan_count_X_train = X_train.isna().sum()
nan_count_y_train = y_train.isna().sum()
print("Number of NaN values in X_train:", nan_count_X_train)
print("Number of NaN values in y_train:", nan_count_y_train)

Number of NaN values in X_train: 0
Number of NaN values in y_train: 0


In [290]:
%time pipe_svc.fit(X_train, y_train)

CPU times: user 49.7 s, sys: 972 ms, total: 50.7 s
Wall time: 51 s


In [291]:
%time y_pred = pipe_svc.predict(X_test)

CPU times: user 7.82 s, sys: 37.9 ms, total: 7.86 s
Wall time: 7.9 s


In [297]:
print(f"SVC Linear Kernel (Binary Classification) - Test Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.1f}%")
print(f"SVC Linear Kernel (Binary Classification) - Test F1-score: {metrics.f1_score(y_test, y_pred, average='weighted') * 100:.1f}%")

SVC Linear Kernel (Binary Classification) - Test Accuracy: 61.1%
SVC Linear Kernel (Binary Classification) - Test F1-score: 60.9%


In [294]:
pipe_logreg_binary = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', max_df=0.8, min_df=0.2, max_features=1000, ngram_range=(1,5)),
    #TfidfVectorizer(lowercase=False, stop_words='english', max_df=0.8, min_df=0.2, max_features=1000, ngram_range=(1, 5)),
    LogisticRegression(max_iter=5000, multi_class='auto', class_weight='balanced', n_jobs=num_processors) #l1_ratio=0.5)
)

In [295]:
%time pipe_logreg_binary.fit(X_train, y_train)

CPU times: user 52 s, sys: 1.57 s, total: 53.6 s
Wall time: 56.7 s


In [296]:
%time y_pred = pipe_logreg_binary.predict(X_test)

CPU times: user 7.23 s, sys: 123 ms, total: 7.36 s
Wall time: 7.39 s


In [298]:
print(f"Logistic Regression (Binary Classification) - Test Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.1f}%")
print(f"Logistic Regression (Binary Classification) - Test F1-score: {metrics.f1_score(y_test, y_pred, average='weighted') * 100:.1f}%")

Logistic Regression (Binary Classification) - Test Accuracy: 61.1%
Logistic Regression (Binary Classification) - Test F1-score: 60.9%
