# NLP Data Processor
Trains a TF-IDF + random forest text classifier that predicts whether a 311 service request is "urgent" based on its public description.

## Dependencies

In [146]:
# Import dependencies
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
import sqlalchemy as db
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import load_files
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine, MetaData, Table, Column, Integer
from sqlalchemy.orm import sessionmaker

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mahlo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load source data

In [147]:
# Load data: Small sample data
# df = pd.read_csv(
#     'Sample CSVs/2019-sample-small.csv', header=0)
# df.head()

In [148]:
# Load data: After manual classification work
# df = pd.read_csv(
#     'Sample CSVs/Working CSV.csv', header=0)
# df.head()

Unnamed: 0,id,service_request_id,service_request_parent_id,date_requested,case_age_days,service_name,case_record_type,date_updated,status,lat,lng,council_district,comm_plan_code,comm_plan_name,park_name,case_origin,referred,public_description,urgent
0,39984,2522790,,2019-02-27T15:39:00,5.0,72 Hour Violation,Parking,2019-03-04T00:00:00,Closed,32.7306,-117.226,2.0,30.0,Peninsula,,Web,,Abandoned car has been sitting in same spot on...,no
1,2,2475208,,2019-01-01T00:20:00,,Other,TSW,,Referred,32.7023,-117.093,4.0,11.0,Encanto Neighborhoods,,Mobile,This report has been referred to Police CRO ? ...,Ongoing loud fireworks in area near El Rey Tra...,no
2,3,2475209,,2019-01-01T00:21:00,15.0,Litter,ESD Complaint/Report,2019-01-15T00:00:00,Closed,32.7198,-117.06,4.0,11.0,Encanto Neighborhoods,,Mobile,,3 bags of garbage and one box,no
3,4,2475210,,2019-01-01T01:06:00,189.0,Street Sweeping,TSW,2019-07-09T00:00:00,Closed,32.70216,-117.052,4.0,44.0,Skyline-Paradise Hills,,Web,,North side of Skyline Drive from Cielo to Wood...,no
4,6,2475212,,2019-01-01T01:28:00,400.0,Street Light Out,TSW,,In Process,32.79611,-117.125,7.0,35.0,Serra Mesa,,Mobile,,Street light not working.,no


In [198]:
# Load data: From SQL server
# The connection URL (including credentials) is read from the DATABASE_URL
# environment variable so that no password is stored in the notebook.
database_url = os.environ.get('DATABASE_URL')
if not database_url:
    raise RuntimeError(
        "Set the DATABASE_URL environment variable, e.g. "
        "'postgresql://USER:PASSWORD@localhost/311_requests'"
    )

engine = db.create_engine(database_url)
query = "select * from requests_raw;"

# The context manager closes the connection even if the query raises.
with engine.connect() as connection:
    df = pd.read_sql_query(query, connection)

df.head()

Unnamed: 0,id,service_request_id,service_request_parent_id,date_requested,case_age_days,service_name,case_record_type,date_updated,status,lat,lng,council_district,comm_plan_code,comm_plan_name,park_name,case_origin,referred,public_description,urgent
0,39984,2522790,,2019-02-27,5.0,72 Hour Violation,Parking,2019-03-04,Closed,32.7306,-117.226,2.0,30.0,Peninsula,,Web,,Abandoned car has been sitting in same spot on...,no
1,2,2475208,,2019-01-01,,Other,TSW,NaT,Referred,32.7023,-117.093,4.0,11.0,Encanto Neighborhoods,,Mobile,This report has been referred to Police CRO ? ...,Ongoing loud fireworks in area near El Rey Tra...,no
2,3,2475209,,2019-01-01,15.0,Litter,ESD Complaint/Report,2019-01-15,Closed,32.7198,-117.06,4.0,11.0,Encanto Neighborhoods,,Mobile,,3 bags of garbage and one box,no
3,4,2475210,,2019-01-01,189.0,Street Sweeping,TSW,2019-07-09,Closed,32.70216,-117.052,4.0,44.0,Skyline-Paradise Hills,,Web,,North side of Skyline Drive from Cielo to Wood...,no
4,6,2475212,,2019-01-01,400.0,Street Light Out,TSW,NaT,In Process,32.79611,-117.125,7.0,35.0,Serra Mesa,,Mobile,,Street light not working.,no


In [199]:
# Remove the columns treated as irrelevant for this notebook.
# Note: df is reassigned, so re-running this cell without reloading the data
# raises a KeyError because the columns are already gone.
irrelevant_columns = [
    'case_age_days',
    'case_record_type',
    'date_updated',
    'status',
    'council_district',
    'comm_plan_code',
    'comm_plan_name',
    'park_name',
    'case_origin',
    'referred',
]
df = df.drop(columns=irrelevant_columns)
df.head()

Unnamed: 0,id,service_request_id,service_request_parent_id,date_requested,service_name,lat,lng,public_description,urgent
0,39984,2522790,,2019-02-27,72 Hour Violation,32.7306,-117.226,Abandoned car has been sitting in same spot on...,no
1,2,2475208,,2019-01-01,Other,32.7023,-117.093,Ongoing loud fireworks in area near El Rey Tra...,no
2,3,2475209,,2019-01-01,Litter,32.7198,-117.06,3 bags of garbage and one box,no
3,4,2475210,,2019-01-01,Street Sweeping,32.70216,-117.052,North side of Skyline Drive from Cielo to Wood...,no
4,6,2475212,,2019-01-01,Street Light Out,32.79611,-117.125,Street light not working.,no


In [200]:
# Features (X) are the free-text public descriptions; labels (y) are the
# corresponding values of the urgent column.
X = df['public_description']
y = df['urgent']

## Data pre-processing

In [201]:
stemmer = WordNetLemmatizer()


def clean_document(text, lemmatizer=None):
    """Normalise one raw description into a lowercase, space-separated string.

    Parameters
    ----------
    text : object
        Raw description. Missing values (None / NaN) yield an empty string
        instead of the literal words 'none' / 'nan'.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word)``. When given, every
        whitespace-separated token is lemmatized. Defaults to None (no
        lemmatization).

    Returns
    -------
    str
        The cleaned document.
    """
    if pd.isna(text):
        return ''

    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(text))

    # Remove single characters that are surrounded by whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single character at the start. The caret must be an unescaped
    # anchor: an escaped '\^' matches a literal '^', which can no longer occur
    # after the special-character substitution above.
    document = re.sub(r'^[a-zA-Z]\s+', '', document)

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Optional lemmatization
    if lemmatizer is not None:
        document = ' '.join(lemmatizer.lemmatize(word) for word in document.split())

    return document


# Iterate over the values directly rather than looking rows up by index label,
# so the result does not depend on X having a default 0..n-1 index.
# Lemmatization is left disabled; pass lemmatizer=stemmer to turn it on
# (presumably this also needs the NLTK 'wordnet' corpus, which this notebook
# does not download -- verify before enabling).
documents = [clean_document(text) for text in X]

In [202]:
# Display the first cleaned document as a quick sanity check
documents[0]

'abandoned car has been sitting in same spot on property for 12 months light blue ford 4 door taurus older model ca license 4twk522'

---------------------
# Calculate TF-IDF using Bag of Words

In [203]:
# Use Bag of Words to encode text.
# Keeps at most 1500 terms, ignores terms that appear in more than 70% of the
# documents, and removes NLTK's English stop words.
vectorizer = CountVectorizer(max_features=1500, min_df=1, max_df=0.7, stop_words=stopwords.words('english'))

# Keep the count matrix sparse: TfidfTransformer accepts sparse input, so
# converting the counts to a dense array here only costs memory.
X = vectorizer.fit_transform(documents)

In [204]:
# Re-weight the Bag of Words counts into TF-IDF scores.
# Note: X is both the input and the output of this cell, so re-running it on
# its own would transform the already-transformed values again.
tfidfconverter = TfidfTransformer()
tfidfconverter.fit(X)
X = tfidfconverter.transform(X).toarray()

## Fit the data

In [205]:
# Partition the features and labels into training and testing subsets.
# NOTE(review): test_size=0.9 holds out 90% of the rows for testing, leaving
# only 10% for training -- confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.9,
)

In [208]:
# Train a 1000-tree random forest on the training split (seeded for
# repeatability); fit() returns the estimator itself.
classifier = RandomForestClassifier(random_state=0, n_estimators=1000).fit(X_train, y_train)
classifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [209]:
# Make a prediction: one label for every row of the test split
y_pred = classifier.predict(X_test)
# Display the predicted labels
y_pred

array(['yes', 'no', 'yes', ..., 'no', 'no', 'yes'], dtype=object)

# Model Evaluation

In [210]:
# Print the confusion matrix, the per-class report, and the overall accuracy
# for the test-split predictions, in that order.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

[[10321   885]
 [ 1896  2345]]
              precision    recall  f1-score   support

          no       0.84      0.92      0.88     11206
         yes       0.73      0.55      0.63      4241

    accuracy                           0.82     15447
   macro avg       0.79      0.74      0.75     15447
weighted avg       0.81      0.82      0.81     15447

0.8199650417556807


# Save the model

In [211]:
# Serialise the trained classifier to the file 'text_classifier'.
# Only the classifier is written; `vectorizer` and `tfidfconverter` are not
# saved by this cell.
with open('text_classifier', 'wb') as picklefile:
    pickler = pickle.Pickler(picklefile)
    pickler.dump(classifier)

# Load the model

In [212]:
# Read the classifier back from the file 'text_classifier'.
# Unpickling can execute arbitrary code, so only load files you trust.
with open('text_classifier', 'rb') as training_model:
    unpickler = pickle.Unpickler(training_model)
    model = unpickler.load()

In [213]:
# Re-run the evaluation with the reloaded model; the printed metrics should
# match those of the original classifier.
y_pred2 = model.predict(X_test)

for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred2))

[[10321   885]
 [ 1896  2345]]
              precision    recall  f1-score   support

          no       0.84      0.92      0.88     11206
         yes       0.73      0.55      0.63      4241

    accuracy                           0.82     15447
   macro avg       0.79      0.74      0.75     15447
weighted avg       0.81      0.82      0.81     15447

0.8199650417556807


In [214]:
# Show the reloaded model's predictions array for comparison with the
# predictions made before the model was saved
print(y_pred2)

['yes' 'no' 'yes' ... 'no' 'no' 'yes']


In [215]:
# Show the accuracy score of the reloaded model's predictions on the test split
print(accuracy_score(y_test, y_pred2))

0.8199650417556807


In [216]:
# Display the reloaded model's predictions for the test split
model.predict(X_test)

array(['yes', 'no', 'yes', ..., 'no', 'no', 'yes'], dtype=object)