In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import random
import os
import keras

from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample
from sklearn import naive_bayes, metrics
import sklearn

from itertools import groupby

import tensorflow as tf
# Eager execution has to be switched on explicitly in TensorFlow 1.x. TensorFlow 2.x
# runs eagerly by default and no longer exposes this top-level function, so only
# call it when it exists.
if hasattr(tf, 'enable_eager_execution'):
    tf.enable_eager_execution()

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from nltk.stem.porter import PorterStemmer

from keras.utils import to_categorical

In [None]:
#Read in test and training sets from createdbalanced_1556_2019 and create_unbalanced_test_400_2018

train = pd.read_csv('C:/Users/nateb/Desktop/Insight/Akidolabs/Akido_DPS_Data/train_1556_labeled.csv')
# Keep an independent snapshot of the raw frame: later cells add columns to
# `train` in place, which would also change a plain alias.
trainorig = train.copy()
test = pd.read_csv('C:/Users/nateb/Desktop/Insight/Akidolabs/Akido_DPS_Data/test_400_2018.csv')
# Snapshot of the raw *test* frame (previously this pointed at `train`).
testorig = test.copy()


In [None]:
#Combine train and test sets, create new column with designator
#NOTE: not idempotent - running this cell again after `train` has been rebound
#to the combined frame stacks the test rows a second time.
train['which'] = "train"
test['which'] = "test"
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the equivalent call and, like append, keeps each frame's original row labels.
new = pd.concat([train, test])
train = new
#print(train.shape)
#print(train['which'].value_counts())

In [None]:
##Rename the PoliceInformationSummary column to pis and
##lowercase every letter of it in the combined data
train = (
    train
    .rename(columns={"PoliceInformationSummary": "pis"})
    .assign(pis=lambda frame: frame['pis'].str.lower())
)


In [None]:
#Use keras to split every dispatch summary into word tokens, then join the
#tokens back into one space-separated string per row
train['clean_pis'] = train['pis'].map(
    lambda summary: ' '.join(text_to_word_sequence(summary))
)

In [None]:
#Compare the first police information summary after and before
#text_to_word_sequence; the cell displays the tuple (clean_pis, pis)
train['clean_pis'].iloc[0],train['pis'].iloc[0]

In [None]:
##regular expression to remove unwanted character strings
##function to run all re's
def standardize_text(df, text_field):
    """Strip links, stray symbols and digits from one text column.

    The substitutions run in a fixed order: remove ``http...`` links, remove any
    leftover bare ``http``, blank out every character outside the allowed set,
    spell ``@`` as ``at``, and finally delete every run of digits.

    Args:
        df: DataFrame that holds the column; it is modified in place.
        text_field: name of the string column to clean.

    Returns:
        The same ``df`` object, with ``df[text_field]`` replaced by the cleaned text.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = [
        (r"http\S+", ""),                              # links
        (r"http", ""),                                 # leftover bare "http"
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),        # anything outside the allowed set
        (r"@", "at"),                                  # @ -> at
        (r"\d{1,}", ""),                               # every run of one or more digits
    ]

    cleaned = df[text_field]
    for pattern, replacement in substitutions:
        # regex=True must be explicit: from pandas 2.0 on, Series.str.replace
        # treats the pattern as a literal string by default.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    df[text_field] = cleaned

    return df

In [None]:
#Apply the regex clean-up to the clean_pis column; standardize_text returns
#the DataFrame it was given, which is rebound to train here
train = standardize_text(train, 'clean_pis')

In [None]:
#Compare the first police information summary after and before the
#regular-expression clean-up; the cell displays the tuple (clean_pis, pis)
train['clean_pis'].iloc[0],train['pis'].iloc[0]

In [None]:
#Lemmatize the cleaned text with NLTK, one dispatch entry at a time
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Split ``text`` on whitespace and return the list of lemmatized tokens."""
    tokens = w_tokenizer.tokenize(text)
    return list(map(lemmatizer.lemmatize, tokens))

#Each row of the new column holds a list of lemmas, not a joined string
train['clean_pis_lemmatized'] = train['clean_pis'].apply(lemmatize_text)

In [None]:
#Compare the first police information summary at each cleaning stage;
#the cell displays the tuple (pis, clean_pis, clean_pis_lemmatized)
train['pis'].iloc[0],train['clean_pis'].iloc[0],train['clean_pis_lemmatized'].iloc[0]

In [None]:
#Resplit into test and train after cleaning
#confirm dimensions
#.copy() gives each split its own data, so later column assignments do not write
#into a slice of the combined frame (which raises SettingWithCopyWarning).
#test has to be extracted first: the second line rebinds train to the train-only rows.
test = train[train['which'] == 'test'].copy()
train = train[train['which'] == 'train'].copy()
print(test.shape)
print(train.shape)


In [None]:
#Tabulate the values in the manual_class column, train first and then test.
#This column contains labels that I manually entered for both test and train sets;
#where 9 appears, I was unsure whether the entry had to do with homelessness.
print(train['manual_class'].value_counts())
print(test['manual_class'].value_counts())

In [None]:
##Replace 9 with 0, unsure, to not homeless
#assign() builds a new DataFrame instead of writing a column into the filtered
#slice produced by the resplit, which avoids pandas' SettingWithCopyWarning.
test = test.assign(manual_class=test['manual_class'].replace(9, 0))
train = train.assign(manual_class=train['manual_class'].replace(9, 0))

In [None]:
#Confirm that the conversion has worked: re-tabulate manual_class for train,
#then test (the value 9 should no longer appear)
print(train['manual_class'].value_counts())
print(test['manual_class'].value_counts())

In [None]:
#Vectorize the dispatch text using TF-IDF for test and training sets
#Get and store feature Names
#Remove stop words
#Print out number of vectorized features

def _feature_names(vectorizer):
    """Return the fitted vocabulary terms, in column order, as a list.

    get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
    favour of get_feature_names_out(); older releases only provide the former,
    so use whichever one the installed version has.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()

tfidf_vec = TfidfVectorizer(stop_words = 'english')

#fit() returns the vectorizer itself, so fitted_name is the same object as tfidf_vec
fitted_name = tfidf_vec.fit(train['clean_pis'])
feature_name = _feature_names(fitted_name)
print (len(feature_name))

#A second vectorizer is used for the test text: refitting tfidf_vec would silently
#replace the training vocabulary that fitted_name refers to
fitted_name_test = TfidfVectorizer(stop_words = 'english').fit(test['clean_pis'])
feature_name_test = _feature_names(fitted_name_test)
print (len(feature_name_test))

In [None]:
#Build the model inputs: the vocabulary and IDF weights are learned from the
#training text only (fit_transform), and the test text is projected onto that
#same vocabulary (transform)

tfidf_vec = TfidfVectorizer(stop_words = 'english')

#Cleaned text as plain Python lists, labels as Series
fitted, fitted_test = (frame['clean_pis'].tolist() for frame in (train, test))
train_y, test_y = train['manual_class'], test['manual_class']

train_x = tfidf_vec.fit_transform(fitted)
test_x = tfidf_vec.transform(fitted_test)

In [None]:
#pd.DataFrame(train_x.toarray(), columns=feature_name).iloc[:,0::2]

In [None]:
##Train model using Naive Bayes

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
%time nb.fit(train_x, train_y)
from sklearn import metrics
y_pred_class = nb.predict(test_x)
metrics.accuracy_score(test_y, y_pred_class)

In [None]:
print(test_y.value_counts())

#Compute null accuracy
#value_counts() sorts by frequency, so head(1) is the majority class; the result
#is a one-element Series (class label -> share of test rows)
null_accuracy = test_y.value_counts().head(1) / len(test_y)
print('Null accuracy:', null_accuracy)

# Cross-check: accuracy obtained by always predicting the majority class,
# computed from the labels themselves instead of hard-coded class counts
majority_class = test_y.value_counts().idxmax()
print('Manual null accuracy:', (test_y == majority_class).mean())

In [None]:
##Confusion matrix for the naive bayes classifier (test labels vs. predictions)
metrics.confusion_matrix(test_y, y_pred_class)
#Layout of the displayed matrix for 0/1 labels
#(rows = actual class, columns = predicted class):
#[TN FP
#FN TP]

In [None]:
# Collect the cleaned message text of the naive bayes false positives
# (nothing is printed here; the next cell displays one of them)
o_x = test['clean_pis']
# false positive = predicted 1 while the manual label is 0
false_positives = o_x[(y_pred_class==1) & (test_y==0)]


In [None]:
#Display the false positive at position [8]
#(raises IndexError when there are fewer than 9 false positives)
false_positives.iloc[8]

In [None]:
# Collect the cleaned message text of the naive bayes false negatives
# and print the one at position [3]
o_x = test['clean_pis']
#o_x[y_pred_class < y_test]
# alternative less elegant but easier to understand:
# false negative = predicted 0 while the manual label is 1
false_negatives = o_x[(y_pred_class==0) & (test_y==1)]
print(false_negatives.iloc[3])

In [None]:
#####################################
#####  Logistic Regression ##########
#####################################

##Compare using logistic regression model
from sklearn.linear_model import LogisticRegression

# 2. instantiate a logistic regression model
logreg = LogisticRegression()
# 3. train the model using X_train_dtm
%time logreg.fit(train_x, train_y)
# 4. make class predictions for X_test_dtm
y_pred_class_log = logreg.predict(test_x)
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(test_x)[:, 1]
y_pred_prob
# calculate accuracy
print(metrics.accuracy_score(test_y, y_pred_class_log))
# calculate AUC
print(metrics.roc_auc_score(test_y, y_pred_prob))

In [None]:
#All predicted probabilities for the test set as a 2d array:
#one row per test entry, one column per class (0 and 1)
logreg.predict_proba(test_x)

In [None]:
#Predicted probability (column 1 of predict_proba) for each test row -
#these are probabilities, not class labels
y_pred_prob

In [None]:
#Confusion matrix for the logistic regression classifier (test labels vs. predictions)
metrics.confusion_matrix(test_y, y_pred_class_log)
#Layout of the displayed matrix for 0/1 labels
#(rows = actual class, columns = predicted class):
#[TN FP
#FN TP]

In [None]:
#Tabulate actual labels against the logistic regression output
#Predictions and probabilities go in as plain lists, so they are matched to
#test_y by position while the frame keeps test_y's row index
pred = np.asarray(y_pred_class_log).tolist()
prob = np.asarray(y_pred_prob).tolist()

adj = pd.DataFrame({'actual': test_y, 'prob': prob, 'pred': pred})

#Class counts: manual labels first, then predicted labels
print(adj['actual'].value_counts())
print(adj['pred'].value_counts())


In [None]:
#Output predictions (actual, prob, pred) to csv at a hard-coded local path
#header=True writes the column names; index = None is falsy, so presumably the
#row index is left out of the file - TODO confirm against the written csv
adj.to_csv (r'C:\Users\nateb\Desktop\Insight\Akidolabs\Akido_DPS_Data\pred.csv', 
                        index = None, header=True) 

In [None]:
#False positives of the logistic regression classifier:
#cleaned text of test entries predicted 1 while the manual label is 0
o_x = test['clean_pis']
false_positives_log = o_x[(y_pred_class_log==1) & (test_y==0)]


In [None]:
#False negatives of the logistic regression classifier:
#cleaned text of test entries predicted 0 while the manual label is 1
o_x = test['clean_pis']
false_negatives_log = o_x[(y_pred_class_log==0) & (test_y==1)]

In [None]:
#Print the false negative at position [2] for the logistic classifier
#(raises IndexError when there are fewer than 3 false negatives)
print(false_negatives_log.iloc[2])

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# roc curve and auc score
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [None]:
#specify Plot ROC curve details
def plot_roc_curve(fpr, tpr):
    """Draw a ROC curve on the current matplotlib axes and show the figure.

    Args:
        fpr: false positive rates (x values of the curve).
        tpr: true positive rates (y values of the curve).
    """
    axes = plt.gca()

    # The classifier's curve, plus the dashed diagonal from (0, 0) to (1, 1)
    axes.plot(fpr, tpr, color='orange', label='ROC')
    axes.plot([0, 1], [0, 1], color='darkblue', linestyle='--')

    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('ROC Curve for Homeless Classification')
    axes.legend(loc="lower right")

    plt.show()

In [None]:
#Area under the ROC curve for the logistic regression probabilities,
#printed to two decimals
auc = roc_auc_score(test_y, y_pred_prob)
print('AUC: %.2f' % auc)

In [None]:
#False positive rate, true positive rate and the matching decision thresholds
#for the logistic regression probabilities on the test set
fpr, tpr, thresholds = roc_curve(test_y, y_pred_prob)

In [None]:
#Plot the ROC curve from the fpr/tpr values computed by roc_curve
plot_roc_curve(fpr, tpr)
