In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import numpy as np
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import re
%matplotlib inline

# Load and Combine DA Data Files

In [2]:
# Read the three yearly DA report extracts, one DataFrame per year
DA2015, DA2016, DA2017 = map(pd.read_csv, ('DA2015.csv', 'DA2016.csv', 'DA2017.csv'))

In [3]:
# Stack the yearly frames row-wise and renumber the rows 0..n-1
yearly_reports = [DA2015, DA2016, DA2017]
DA_Full = pd.concat(yearly_reports, axis=0, ignore_index=True)
DA_Full.shape

(1185, 290)

# Apply Corrections to Incident Numbers

In [4]:
# Correct Incident Numbers
# The first list holds the incident numbers as recorded, the second list the corrected values.
# The two lists are matched by position (entry i of the first list is replaced by entry i of the second),
# so they must stay the same length and in the same order when new corrections are added.
DA_Full.IncidentNumber = DA_Full.IncidentNumber.replace( ['2015-15790',
'2015-171',
'2015-18657',
'2015-10854',
'2015-17442',
'2015-18625',
'2015-1903',
'2015-25424',
'2015-382',
'2015-23119',
'2015-24081',
'2015-20102',
'2015-25134',
'2015-25182',
'2015-857',
'2015-7982',
'2016-13631',
'2016-5611',
'2016-2940',
'2016-2840',
'2016-3611',
'2016-5548',
'2016-4220',
'2016-4579',
'2016-6031',
'2016-607',
'2016-6887',
'2016-7005',
'2016-9116',
'2016-92',
'2016-9441',
'2016-16113',
'2016-1073',
'2016-19740',
'2016-197708',
'2016-182477',
'2016-1945'] ,['2015-15970',
'2015-16171',
'2015-16657',
'2015-16854',
'2015-17142',
'2015-18265',
'2015-19035',
'2015-22454',
'2015-23083',
'2015-23719',
'2015-24041',
'2015-25702',
'2015-25734',
'2015-25782',
'2015-851',
'2015-9982',
'2016-1363',
'2016-1564',
'2016-1940',
'2016-2848',
'2016-3414',
'2016-3548',
'2016-4270',
'2016-4578',
'2016-6037',
'2016-6845',
'2016-6882',
'2016-7006',
'2016-9111',
'2016-9219',
'2016-9991',
'2016-10113',
'2016-10735',
'2016-10740',
'2016-17770',
'2016-18247',
'2016-19957'])


In [5]:
# Save the combined, corrected DataFrame to disk.
# NOTE: to_csv writes the row index as a leading unnamed column by default, so a later
# read_csv of this file will see an extra 'Unnamed: 0' column unless it passes index_col=0.
DA_Full.to_csv('DA_Full.csv')

In [6]:
DA_Full.columns.values

array(['DIRVersion', 'CaseYear', 'CaseName', 'SuspectActionsImpaired',
       'SuspectActionsDrugOrAlcolol', 'SuspectActionsPushing',
       'SuspectActionsThrewItems', 'SuspectActionsBiting',
       'SuspectActionsInjureChild', 'SuspectActionsSexAssault',
       'SuspectActionsUnwantedContact', 'SuspectActionsDestroyedProp',
       'InjureOtherPerson', 'SuspectActionsShooting',
       'SuspectActionsVerbalAbuse', 'InjurePet', 'SuspectActionsSlapping',
       'SuspectActionsViolatedConditions', 'SuspectActionsForcedEntry',
       'InterferencePhone', 'SuspectActionsSlammingBody',
       'SuspectActionsForciblRestraint', 'SuspectActionsIntimidation',
       'SuspectActionsStabbing', 'SuspectActionsHairPulling',
       'SuspectActionsKicking', 'SuspectActionsStrangulation',
       'SuspectActionsHomicide', 'SuspectActionsPunching',
       'SuspectActionsSuicide', 'SuspectActionsThreats',
       'SuspectActionsInjureKillPerson', 'SuspectActionsInjureKillSelf',
       'SuspectActionsInjure

# Cleaning Data

In [7]:
# Encode the Yes/No relationship flags as 1/0; a missing answer is treated as 'No' (0)
for flag_col in ('PartiesLiveTogether', 'PartiesEverLiveTogether', 'HaveChildTogher'):
    DA_Full[flag_col] = DA_Full[flag_col].replace(['No', 'Yes'], [0, 1]).fillna(0)

# Records without a recorded relationship are grouped under 'Other'
DA_Full['PartiesRelationship'] = DA_Full['PartiesRelationship'].fillna('Other')

# Feature Engineering

In [8]:
# Create function to tag whether a case is 'Intimate' or 'Not Intimate'
def is_intimate(df):
    """Label a single record as 'Intimate' or 'Not Intimate'.

    Parameters
    ----------
    df : mapping-like row (e.g. one row of DA_Full passed by ``apply(axis=1)``)
        Must expose 'PartiesRelationship'; 'HaveChildTogher' is only read
        when the relationship is 'Other'.

    Returns
    -------
    str
        'Intimate' when the relationship is one of the intimate categories,
        or when it is 'Other' and the parties have a child together
        (HaveChildTogher == 1); 'Not Intimate' in every other case.
    """
    relationship = df['PartiesRelationship']

    # Relationship values that count as intimate on their own
    if relationship in ('Intimate', 'Married', 'FormallyIntimate', 'FormallyMarried'):
        return 'Intimate'

    # 'Other' also covers records whose relationship was missing; fall back to the shared-child flag
    if relationship == 'Other' and df['HaveChildTogher'] == 1:
        return 'Intimate'

    return 'Not Intimate'

In [9]:
# Row-wise labelling: each record is passed to is_intimate as a Series
DA_Full['Intimacy'] = DA_Full.apply(is_intimate, axis=1)

### These are ones NOT labeled 'Intimate' that WE labeled as 'Intimate'

In [10]:
DA_Full.loc[(DA_Full.PartiesRelationship == 'Other') & (DA_Full.Intimacy == 'Intimate'),['PartiesRelationship','Intimacy']]

Unnamed: 0,PartiesRelationship,Intimacy
56,Other,Intimate
78,Other,Intimate
127,Other,Intimate
289,Other,Intimate
370,Other,Intimate
443,Other,Intimate
447,Other,Intimate
452,Other,Intimate
561,Other,Intimate
564,Other,Intimate


# Combining Narrative Columns From Version 1 And 2

In [11]:
# DA reports have 2 versions. Both are combined into one DF but each version has a different narrative column.
def get_narrative(df):
    """Return the free-text narrative of one record as a string.

    Version 1 reports (DIRVersion == 1) keep the narrative in
    ``ResultsOfInvestigation``; every other version keeps it in
    ``IncidentComment``. The value is passed through ``str``, so a missing
    narrative (NaN) comes back as the literal string 'nan'.
    """
    text = df.ResultsOfInvestigation if df.DIRVersion == 1 else df.IncidentComment
    return str(text)

In [12]:
# Pick the version-appropriate narrative column for every record
DA_Full['Narrative'] = DA_Full.apply(get_narrative, axis=1)

In [13]:
DA_Full.Narrative.head()

0    P1 states P2 struck/punched him in the face. P...
1                                  See Sup information
2    P1 states her and P2 currently live together a...
3    P1 and P2 got into a verbal argument about som...
4    P1 came into KPD to report her receiving a tex...
Name: Narrative, dtype: object

In [14]:
DA_Full.groupby('Intimacy').agg({'Narrative': 'count'})

Unnamed: 0_level_0,Narrative
Intimacy,Unnamed: 1_level_1
Intimate,845
Not Intimate,340


# 71% of records are Intimate

In [15]:
# Share of records labelled 'Intimate', computed from the data itself so the figure
# cannot drift out of sync with the labelling (previously hand-copied counts).
float((DA_Full.Intimacy == 'Intimate').mean())

0.7130801687763713

# Tokenize Narrative

In [16]:
from nltk.tokenize import RegexpTokenizer

# Tokens are runs of ASCII letters only, so digits and punctuation are discarded
tokenizer = RegexpTokenizer('[A-Za-z]+')

# Replace each narrative string with its list of tokens.
# NOTE: this overwrites the column, so the cell expects Narrative to still hold raw strings.
DA_Full.Narrative = DA_Full.Narrative.apply(tokenizer.tokenize)

In [17]:
DA_Full.Narrative.head()

0    [P, states, P, struck, punched, him, in, the, ...
1                              [See, Sup, information]
2    [P, states, her, and, P, currently, live, toge...
3    [P, and, P, got, into, a, verbal, argument, ab...
4    [P, came, into, KPD, to, report, her, receivin...
Name: Narrative, dtype: object

# Remove Stopwords

In [18]:
from nltk.corpus import stopwords

# NLTK's English stopwords extended with report boilerplate that carries no signal
stopWords = set(stopwords.words('english')).union(
    ['p1', 'p2', 'nan', 'p', 'see', 'sup', 'supp', 'refer', 'report',
     'party', 'states', 'state', 'stated', 'said', 'got', 'told']
)

def rem_stop(df):
    """Return the row's Narrative tokens lowercased, with stopwords removed.

    ``df`` is a single row exposing a ``Narrative`` attribute that holds a
    list of string tokens. Membership is checked against the module-level
    ``stopWords`` set after lowercasing, so the comparison is case-insensitive.
    """
    lowered_tokens = (token.lower() for token in df.Narrative)
    return [token for token in lowered_tokens if token not in stopWords]

In [19]:
# Lowercase every token list and strip the stopwords, row by row
DA_Full.Narrative = DA_Full.apply(rem_stop, axis=1)
DA_Full.Narrative.head()

0    [struck, punched, face, showed, obvious, injur...
1                                        [information]
2    [currently, live, together, residence, leave, ...
3    [verbal, argument, problems, mad, began, trash...
4    [came, kpd, receiving, text, facebook, message...
Name: Narrative, dtype: object

# Apply Lemmatization


In [20]:
from nltk import WordNetLemmatizer
lemztr = WordNetLemmatizer()

def porter_stem(df):
    """Return the row's Narrative tokens after WordNet lemmatization.

    Despite the name, no Porter stemming happens here: each token is passed
    to the module-level ``lemztr`` (a WordNetLemmatizer) with its default
    settings. ``df`` is a single row exposing a ``Narrative`` list of tokens.
    """
    return list(map(lemztr.lemmatize, df.Narrative))

In [21]:
# Lemmatize every token list, row by row
DA_Full.Narrative = DA_Full.apply(porter_stem, axis=1)

In [22]:
DA_Full.Narrative.head()

0    [struck, punched, face, showed, obvious, injur...
1                                        [information]
2    [currently, live, together, residence, leave, ...
3    [verbal, argument, problem, mad, began, trashi...
4    [came, kpd, receiving, text, facebook, message...
Name: Narrative, dtype: object

In [23]:
narrative = DA_Full.Narrative
narrative.head()

0    [struck, punched, face, showed, obvious, injur...
1                                        [information]
2    [currently, live, together, residence, leave, ...
3    [verbal, argument, problem, mad, began, trashi...
4    [came, kpd, receiving, text, facebook, message...
Name: Narrative, dtype: object

# Making Narratives A Transaction Object

In [24]:
from mlxtend.preprocessing import TransactionEncoder

# Learn the vocabulary and one-hot encode each narrative (one boolean column per distinct token)
te = TransactionEncoder()
te_narrative = te.fit_transform(narrative)

In [25]:
te_narrative

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [26]:
tx_narrative = pd.DataFrame(te_narrative, columns = te.columns_)

In [27]:
DA_Full.Intimacy.unique()

array(['Intimate', 'Not Intimate'], dtype=object)

# Reducing Dimensionality From 3805 Features To 1861

In [28]:
# Setting minimum amount of times a word must appear to be a feature
min_freq = 2

In [29]:
# Preview of the features that will be DROPPED: words present in fewer than min_freq narratives.
# With min_freq = 2 these are the words that occur in only one narrative (3805 total - 1861 kept = 1944).
tx_narrative.loc[:,~(tx_narrative.sum() >= min_freq)].head()

Unnamed: 0,aadvisedof,aaron,aat,abby,ability,abouts,accidentally,account,accuse,acquire,...,yellow,yet,yielded,yoked,zachary,zacharys,zell,zeno,ziane,zoe
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [30]:
# Preview of the features that will be KEPT: words present in at least min_freq narratives (1861 columns).
tx_narrative.loc[:,(tx_narrative.sum() >= min_freq)].head()

Unnamed: 0,abdomen,abeel,able,abortion,abrasion,abuse,abused,abusing,abusive,access,...,yelling,yesterday,ymca,yo,york,young,younger,youngest,yr,zambrella
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [31]:
# Keep only the word columns that are present in at least min_freq narratives
frequent_enough = tx_narrative.sum() >= min_freq
tx_narrative = tx_narrative.loc[:, frequent_enough]
tx_narrative.shape

(1185, 1861)

# Train Test Split

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
# Feature matrix: one row per narrative, one boolean column per retained word
x = np.array(tx_narrative)
# Target vector: the 'Intimate' / 'Not Intimate' label of each record, flattened to 1-D
y = np.ravel(DA_Full.Intimacy)

In [34]:
# Hold out 10% for testing; stratify so both splits keep the Intimate / Not Intimate ratio,
# and fix the seed so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=.10,
    random_state=0,
    stratify=y,
)

# Tuning Alpha Of Multinomial Naive Bayes Classifier

In [35]:
from sklearn import naive_bayes
from sklearn.model_selection import GridSearchCV

# Candidate smoothing strengths: 100 evenly spaced alpha values between 0.01 and 5
alpha_candidates = np.linspace(.01, 5, 100)

mnb_cv = naive_bayes.MultinomialNB()
mnb_param_grid = [{'alpha': alpha_candidates}]

# 5-fold cross-validated search, ranking the candidates by accuracy
mnb_grid_search = GridSearchCV(
    mnb_cv,
    param_grid=mnb_param_grid,
    cv=5,
    scoring='accuracy',
)

In [36]:
# Run the search on the training split only, then report the winning alpha and its mean CV accuracy
mnb_grid_search.fit(x_train, y_train)
best_alpha = mnb_grid_search.best_params_['alpha']
print(best_alpha, mnb_grid_search.best_score_, sep='\n')


2.53020202020202
0.7401500938086304


# Refit Naive Bayes With Best Alpha

In [37]:
# Train the final classifier on the full training split using the tuned alpha
mnb = naive_bayes.MultinomialNB(alpha=best_alpha).fit(x_train, y_train)
mnb

MultinomialNB(alpha=2.53020202020202, class_prior=None, fit_prior=True)

In [38]:
# Test accuracies logged from earlier experiments (all with a 10% test split):
#   78.99% - feature reduction, min freq 2
#   80.67% - feature reduction, min freq 1
#   78.15% - no feature reduction

# Accuracy on all records (training and test rows together), on the training split, and on the held-out test split
overall_accuracy = mnb.score(x, y)
train_accuracy = mnb.score(x_train, y_train)
test_accuracy = mnb.score(x_test, y_test)

print("The naive score of multinomial naive bayes is: %.4f" % overall_accuracy)
print("The training score of multinomial naive bayes is: %.4f" % train_accuracy)
print("The test score of multinomial naive bayes is: %.4f" % test_accuracy)

The naive score of multinomial naive bayes is: 0.8346
The training score of multinomial naive bayes is: 0.8377
The test score of multinomial naive bayes is: 0.8067


# MODEL EVALUATION

In [39]:
import collections
collections.Counter(y_test)

Counter({'Intimate': 85, 'Not Intimate': 34})

In [40]:
from sklearn.metrics import confusion_matrix
# Rows are the actual classes, columns the predicted classes; `labels` pins both to the
# order Intimate, Not Intimate so the row/column captions below line up with the counts
class_order = ['Intimate', 'Not Intimate']
cm_counts = confusion_matrix(y_test, mnb.predict(x_test), labels=class_order)

CM = pd.DataFrame(
    cm_counts,
    index=['Actual - Intimate', 'Actual - Not Intimate'],
    columns=['Predicted - Intimate', 'Predicted - Not Intimate'],
)
CM

Unnamed: 0,Predicted - Intimate,Predicted - Not Intimate
Actual - Intimate,82,3
Actual - Not Intimate,20,14


In [41]:
# Derive every rate from the confusion matrix CM instead of hand-copied counts, so the
# report stays correct when the data, the split seed or the model changes.
# CM rows are actual classes, CM columns are predicted classes (both ordered Intimate, Not Intimate).
(intimate_as_intimate, intimate_as_not), (not_as_intimate, not_as_not) = CM.to_numpy().tolist()

predicted_intimate = intimate_as_intimate + not_as_intimate   # column total: predicted Intimate
predicted_not_intimate = intimate_as_not + not_as_not         # column total: predicted Not Intimate
n_test = predicted_intimate + predicted_not_intimate
n_correct = intimate_as_intimate + not_as_not


def _share(count, total):
    """Return count / total, or NaN when total is 0 (e.g. a class that was never predicted)."""
    return count / total if total else float('nan')


# The per-class rates are normalised by the PREDICTED totals (column-wise),
# i.e. "of everything predicted as class X, what share really was / was not X"
print('CORRECT: True Intimate:', _share(intimate_as_intimate, predicted_intimate))
print('CORRECT: True Not-Intimate:', _share(not_as_not, predicted_not_intimate))
print('WRONG: False Intimate:', _share(not_as_intimate, predicted_intimate))
print('WRONG: False Not-Intimate:', _share(intimate_as_not, predicted_not_intimate))
print('****************************************************')
print('TOTAL CORRECT: ', _share(n_correct, n_test))
print('TOTAL WRONG: ', _share(n_test - n_correct, n_test))

CORRECT: True Intimate: 0.803921568627451
CORRECT: True Not-Intimate: 0.8235294117647058
WRONG: False Intimate: 0.19607843137254902
WRONG: False Not-Intimate: 0.17647058823529413
****************************************************
TOTAL CORRECT:  0.8067226890756303
TOTAL WRONG:  0.19327731092436976
