# NLP Data Processor

In [2]:
# Import dependencies
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mahlo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load source data

In [5]:
# Read the small 2019 sample CSV; the first row of the file supplies the column names.
df = pd.read_csv('Sample CSVs/2019-sample-small.csv', header=0)

# Preview the top of the frame as a quick sanity check on the load.
df.head()

Unnamed: 0,service_request_id,service_request_parent_id,sap_notification_number,date_requested,case_age_days,service_name,case_record_type,date_updated,status,lat,lng,council_district,comm_plan_code,comm_plan_name,park_name,case_origin,referred,public_description,urgent
0,2475207,,,2019-01-01T00:20:00,,Other,TSW,,Referred,32.710427,-117.145192,8,37,Southeastern San Diego,,Mobile,This report has been referred to Police CRO ? ...,19th and 20th island ave people firing gun sho...,yes
1,2475208,,,2019-01-01T00:20:00,,Other,TSW,,Referred,32.702298,-117.092751,4,11,Encanto Neighborhoods,,Mobile,This report has been referred to Police CRO ? ...,Ongoing loud fireworks in area near El Rey Tra...,no
2,2475209,,,2019-01-01T00:21:00,15.0,Litter,ESD Complaint/Report,2019-01-15T00:00:00,Closed,32.719797,-117.060215,4,11,Encanto Neighborhoods,,Mobile,,3 bags of garbage and one box,no
3,2475210,,40300090000.0,2019-01-01T01:06:00,189.0,Street Sweeping,TSW,2019-07-09T00:00:00,Closed,32.702164,-117.051741,4,44,Skyline-Paradise Hills,,Web,,North side of Skyline Drive from Cielo to Wood...,no
4,2475211,,40300090000.0,2019-01-01T01:12:00,18.0,Engineering - Traffic Calming/Safety Features,Traffic Engineering,2019-01-18T00:00:00,Closed,32.829445,-117.279999,1,10,La Jolla,,Mobile,,"On December 13, 2018, this delivery truck back...",no


In [10]:
# Features: the free-text public_description column.
X = df['public_description']
# Target: the urgent label column.
y = df['urgent']

## Data pre-processing

In [13]:
def _clean_document(text):
    """Normalise one raw description into lowercase, space-separated text.

    Parameters
    ----------
    text : object
        A single value from the description column. Non-string values are
        coerced with ``str``; missing values (None/NaN) yield an empty string
        so that the literal token 'nan' never reaches the vectorizer.

    Returns
    -------
    str
        The cleaned, lower-cased document.
    """
    if pd.isna(text):
        return ''

    # Replace every non-word character with a space
    document = re.sub(r'\W', ' ', str(text))

    # Remove single letters that stand alone between whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single letter at the very start of the text. The caret must be
    # an unescaped anchor: the previous pattern r'\^...' looked for a literal
    # '^' character, which can never be present after the \W substitution.
    # This also covers a leading standalone 'b', so no separate step is needed.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Collapse runs of whitespace into a single space
    document = re.sub(r'\s+', ' ', document)

    # Lemmatization (currently disabled)
    # document = ' '.join(stemmer.lemmatize(word) for word in document.split())

    return document.lower()


# Kept available for the (disabled) lemmatization step above
stemmer = WordNetLemmatizer()

# Iterate over the values directly instead of X[0..len-1]: positional integers
# are only valid labels when X still carries a default RangeIndex.
documents = [_clean_document(text) for text in X]

---------------------
# Calculate TF-IDF using one of two methods

## Bag of Words method

In [18]:
# Encode the cleaned documents as a bag-of-words count matrix
vectorizer = CountVectorizer(
    max_features=1500,
    min_df=1,
    max_df=0.7,
    stop_words=stopwords.words('english'),
)
X = vectorizer.fit_transform(documents).toarray()

In [19]:
# Convert the Bag of Words counts into TF-IDF values.
# NOTE: X is rebound to the transformed array, so re-running this cell on its
# own feeds already-transformed values back into the transformer; re-run the
# Bag of Words cell first.
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

## Direct conversion method

In [None]:
# Alternative path: compute TF-IDF straight from the cleaned documents,
# skipping the separate Bag of Words step
tfidfconverter = TfidfVectorizer(
    max_features=1500,
    min_df=5,
    max_df=0.7,
    stop_words=stopwords.words('english'),
)
X = tfidfconverter.fit_transform(documents).toarray()

# End of conversion options
---------------------

## Fit the data

In [20]:
# Hold out 20% of the rows for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [23]:
# Build a 1000-estimator random forest with a fixed seed, then train it on
# the training split. The fit call is the cell's last expression, so the
# fitted estimator is displayed as the cell output.
classifier = RandomForestClassifier(
    n_estimators=1000,
    random_state=0,
)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [24]:
# Predict a label for every row of the held-out test split
y_pred = classifier.predict(X_test)

# Model Evaluation

In [25]:
# Print the confusion matrix, the classification report and the accuracy, in
# that order. All three functions come from the sklearn.metrics import in the
# first cell, so that cell must have been run in the current kernel.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

NameError: name 'confusion_matrix' is not defined

# Save the model

In [None]:
# Serialise the trained classifier to the file 'text_classifier' (path is
# relative to the current working directory); the with-block closes the file.
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

# Load the model

In [None]:
# Read the pickled classifier back from 'text_classifier' into `model`.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load a file that you produced yourself or otherwise trust.
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

In [None]:
# Predict with the unpickled model and print the same three metrics, to
# confirm the reloaded model gives the same results as the original
y_pred2 = model.predict(X_test)

for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred2))