## Preprocessing

In [1]:
#set up sys
import os, sys
sys.path.append('../PMC_func')


#python basics
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import logging
from collections import Counter

#spacy stuff
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA, LOWER, IS_PUNCT, IS_DIGIT, IS_SPACE, IS_STOP
import en_core_web_sm

#gensim stuff
from gensim.models import CoherenceModel, LdaMulticore,LdaModel
from gensim.models.callbacks import ConvergenceMetric
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS


# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis  # don't skip this

#custom dependencies
import PMC_module

#other
import pickle
import warnings
warnings.filterwarnings('ignore')

## Import Data

In [2]:
# Load the CPD complaint narratives and keep only the intake-allegation rows.
# NOTE(review): `path` is an absolute path on one developer's machine; point it
# at your own data checkout before running this notebook elsewhere.
path = '/Users/pedrogalarza/Documents/NYU-MSDS/2021_police-misconduct/misclass'
narratives_path = os.path.join(path, 'data/cpd', "narratives.csv")
narratives = pd.read_csv(narratives_path)

# Rows whose column_name contains the substring 'take' (case-sensitive).
# na=False counts a missing column_name as "no match"; without it the mask would
# contain NaN and the boolean indexing below would raise.
intake = narratives.column_name.str.contains('take', na=False)

# Keep only the three columns used downstream and drop exact duplicate rows.
narratives = narratives.loc[intake, ["cr_id", "column_name", "text"]].drop_duplicates()

# Independent working copy, plus the text column as a plain Python list
# (one entry per row, in row order).
df = narratives.copy()
df_list = df.text.values.tolist()

### Preprocessing
#### Input Normalization

In [3]:
# Input normalization WITH repeat removal.
# The chain runs the PMC_module normalizer steps in this order: lower,
# whitespace, strip_accents, remove_repeats; `.texts` then reads the result.
input_list_normalization = PMC_module.input_normalization(texts=df_list)
df_list_normalized = (
    input_list_normalization
    .normalization_lower()
    .normalization_whitespace()
    .strip_accents()
    .normalization_remove_repeats()
    .texts
)

In [4]:
# Input normalization WITHOUT repeat removal.
# Same chain as the repeat-removing variant, minus normalization_remove_repeats;
# kept so the two results can be compared document by document.
input_list_normalization_keep_repeats = PMC_module.input_normalization(texts=df_list)
df_list_normalized_keep_repeats = (
    input_list_normalization_keep_repeats
    .normalization_lower()
    .normalization_whitespace()
    .strip_accents()
    .texts
)

In [5]:
# Count the documents whose text differs between the two normalization variants,
# i.e. the documents that the repeat-removal step actually changed.
# The lists are compared pairwise in plain Python: wrapping them in np.array would
# allocate two fixed-width unicode arrays sized by the longest document.
if len(df_list_normalized) != len(df_list_normalized_keep_repeats):
    # A positional comparison is only meaningful when both lists cover the same documents.
    raise ValueError("normalized text lists differ in length; cannot compare them pairwise")

repeat_entry_count = sum(
    with_removal != without_removal
    for with_removal, without_removal in zip(df_list_normalized, df_list_normalized_keep_repeats)
)
print('number of texts with repeat:', repeat_entry_count)

number of texts with repeat: 994


#### Spacy Filtering and Lemmatization - After Repeat Filtering

In [6]:
# Load the small English spaCy pipeline used to parse the normalized narratives.
nlp = spacy.load('en_core_web_sm')

# Stream the texts through spaCy in batches of 20 and reduce every parsed document
# to its bag of lemmas. Each document goes through the PMC_module filter chain
# (length, stop, punc, pos) before the lemmas are extracted.
lemmatized_texts = [
    (
        PMC_module.spacy_filters(doc=parsed_doc)
        .filter_length()
        .filter_stop()
        .filter_punc()
        .filter_pos()
        .extract_lemmas()
        .bag_of_lem
    )
    for parsed_doc in nlp.pipe(df_list_normalized, batch_size=20)
]

#### Filter Lengths - Implementation

In [7]:
# Build the composite data frame: intake rows plus their bag of lemmas.
# The list is assigned positionally, so it must hold one entry per row of df.
df_lemmatized_texts = df.copy()
df_lemmatized_texts['bag_of_lemmas'] = lemmatized_texts

# Number of lemmas per document. Mapping len over the column avoids building a
# Series object for every row, which apply(axis=1) would do.
df_lemmatized_texts['BoL_length'] = df_lemmatized_texts['bag_of_lemmas'].map(len)

# Keep documents with at least 10 lemmas. The explicit .copy() makes the result an
# independent frame, so adding a column below is not an assignment on a slice of
# df_lemmatized_texts (which triggers SettingWithCopyWarning).
df_filtered_lemmatized_texts = df_lemmatized_texts[df_lemmatized_texts['BoL_length'] >= 10].copy()

# 0-based position of each kept document. The original index labels are left
# untouched; row_number is the value later handed to PMC_module.corp2dict.
df_filtered_lemmatized_texts["row_number"] = range(len(df_filtered_lemmatized_texts))

# Store the lemma bags as a list for gensim processing (same order as row_number).
nogram_list_lemmatized_texts = df_filtered_lemmatized_texts.bag_of_lemmas.to_list()

#### Filter Lengths - Print effect

In [8]:
# Report how many documents survive the bag-of-lemmas length filter.
n_docs_before = len(df_lemmatized_texts)
n_docs_after = len(df_filtered_lemmatized_texts)

# Ratio in [0, 1] of documents kept.
# NOTE(review): the last label below says "Percentage of Remaining Vocabulary",
# but the value printed is this fraction of *documents*; consider rewording it.
pct_remaining = n_docs_after / n_docs_before

print('number of documents before filtering bag length:', n_docs_before)
print('number of documents after filtering bag length:', n_docs_after)
print("Percentage of Remaining Vocabulary After Filtering Length:", pct_remaining)

number of documents before filtering bag length: 42896
number of documents after filtering bag length: 33979
Percentage of Remaining Vocabulary After Filtering Length: 0.7921251398731817


### Gensim Vectorizing
#### N-gram Construction

In [14]:
# Phrase detection in two passes with identical settings: the bigram model is
# trained on the plain lemma bags, and the trigram model is trained on the
# bigram-transformed texts.
phrase_settings = dict(min_count=3, threshold=10, connector_words=ENGLISH_CONNECTOR_WORDS)

bigram = Phrases(nogram_list_lemmatized_texts, **phrase_settings)
bigram_list_lemmatized_texts = [tokens for tokens in bigram[nogram_list_lemmatized_texts]]

trigram = Phrases(bigram_list_lemmatized_texts, **phrase_settings)
trigram_list_lemmatized_texts = [tokens for tokens in trigram[bigram_list_lemmatized_texts]]

#### Construct Gensim Corpus Objects

In [10]:
# Vectorize the three token lists (no n-grams, bigrams, trigrams) with the same
# thresholds. Each call returns a (corpus, id2word) pair and prints the share of
# vocabulary left after filtering extremes.
# NOTE(review): lower/upper are presumably the minimum document count and maximum
# document fraction for a token to be kept; confirm in PMC_module.gensim_vectorizing.
vectorizing_settings = dict(lower=20, upper=.5)

corpus, id2word = PMC_module.gensim_vectorizing(
    nogram_list_lemmatized_texts, **vectorizing_settings
)
bi_corpus, bi_id2word = PMC_module.gensim_vectorizing(
    bigram_list_lemmatized_texts, **vectorizing_settings
)
tri_corpus, tri_id2word = PMC_module.gensim_vectorizing(
    trigram_list_lemmatized_texts, **vectorizing_settings
)

Percentage of Remaining Vocabulary After Filtering Extremes:  0.12273273083486065
Percentage of Remaining Vocabulary After Filtering Extremes:  0.10932328529965885
Percentage of Remaining Vocabulary After Filtering Extremes:  0.09803045866642847


#### Construct Composite Data Frame
- Convert Gensim Processed Document Vectors into Dictionaries
- Store Frequency dictionaries into dataframe


In [15]:
# For each n-gram level, add a column holding the result of PMC_module.corp2dict
# for every row. The document is looked up through the row's row_number in the
# matching gensim corpus / id2word pair.
gensim_sources = {
    "gensim_nogram": (corpus, id2word),
    "gensim_bigram": (bi_corpus, bi_id2word),
    "gensim_trigram": (tri_corpus, tri_id2word),
}

for target_column, (doc_vectors, vocabulary) in gensim_sources.items():
    df_filtered_lemmatized_texts[target_column] = df_filtered_lemmatized_texts.apply(
        lambda row: PMC_module.corp2dict(row.row_number, doc_vectors, vocabulary),
        axis=1,
    )

# Display the composite frame as the cell output.
df_filtered_lemmatized_texts

Unnamed: 0,cr_id,column_name,text,bag_of_lemmas,BoL_length,row_number,gensim_nogram,gensim_bigram,gensim_trigram
1,1049924,Initial / Intake Allegation,"THE REPORTING PARTY, WHO DID NOT\nWITNESS THE ...","[reporting, party, witness, incident, allege, ...",14,0,"{'dog': 1, 'enter': 1, 'incident': 1, 'justifi...","{'enter': 1, 'justification': 1, 'reference_lo...","{'enter_residence': 1, 'justification': 1, 're..."
5,1050193,Initial / Intake Allegation,It is reported that the accused officer failed...,"[report, accuse, officer, fail, terminate, mot...",44,1,"{'fail': 4, 'motor': 4, 'order': 4, 'pursuit':...","{'fail': 4, 'order': 4, 'pursuit': 4, 'report'...","{'fail': 4, 'order': 4, 'pursuit': 4, 'report'..."
9,1050294,Initial / Intake Allegation,The reporting party (aD\nalleges that he was b...,"[reporting, party, allege, beat, bouncer, club...",72,2,"{'incident': 1, 'fail': 2, 'report': 1, 'serge...","{'fail': 2, 'report': 1, 'sergeant': 1, 'vehic...","{'fail': 2, 'report': 1, 'sergeant': 1, 'vehic..."
12,1050294,Initial / Intake Allegation,The reporting party\nalleges that an unknown o...,"[reporting, party, allege, unknown, officer, t...",70,3,"{'incident': 1, 'fail': 2, 'report': 1, 'serge...","{'fail': 2, 'report': 1, 'sergeant': 1, 'vehic...","{'fail': 2, 'report': 1, 'sergeant': 1, 'vehic..."
17,1050588,Initial / Intake Allegation,The reporting party alleges that several\nplai...,"[reporting, party, allege, plainclothe, office...",64,4,"{'fail': 1, 'reason': 1, 'unknown': 3, 'appare...","{'fail': 1, 'unknown': 3, 'apparent_reason': 1...","{'unknown': 3, 'apparent_reason': 1, 'arrest':..."
...,...,...,...,...,...,...,...,...,...
77207,1069346,Initial / Intake Allegation,The victim alleges that her white Iphone\nwhic...,"[victim, allege, white, iphone, person, time, ...",10,33974,"{'time': 1, 'arrest': 1, 'victim': 1, 'white':...","{'time': 1, 'arrest': 1, 'victim': 1, 'white':...","{'time': 1, 'arrest': 1, 'victim': 1, 'white':..."
77212,1069383,Initial / Intake Allegation,It is reported that after the arrest of the\nw...,"[report, arrest, witness, vehicle, unknown, of...",11,33975,"{'witness': 1, 'fail': 1, 'report': 1, 'vehicl...","{'fail': 1, 'report': 1, 'vehicle': 1, 'unknow...","{'report': 1, 'vehicle': 1, 'unknown': 1, 'arr..."
77214,1069617,Initial / Intake Allegation,THE REPORTING PARTY ALLEGES THAT\nONE THE ACCU...,"[reporting, party, allege, accuse, officer, se...",33,33976,"{'justification': 1, 'vehicle': 2, 'search': 2...","{'justification': 1, 'vehicle': 2, 'state': 3,...","{'justification': 1, 'vehicle': 2, 'state': 3,..."
77216,1069693,Initial / Intake Allegation,THE REPORTING PARTY ALLEGES THAT\nTHE DEPARTME...,"[reporting, party, allege, department, member,...",10,33977,"{'damage': 1, 'cause': 1, 'department': 1, 'me...","{'damage': 1, 'cause': 1, 'department_member':...","{'department_member': 1, 'drive': 1, 'sidewalk..."


#### Pickle data frame

In [16]:
# Persist the composite data frame for later notebooks.
# to_pickle does not create missing parent directories, so make sure the output
# folder exists first (a no-op when it is already there).
os.makedirs("../pickled_data", exist_ok=True)
df_filtered_lemmatized_texts.to_pickle("../pickled_data/df_lemmatized_texts.pkl")

#### Pickle Text List, Gensim Corpus, and Gensim id2word dict

In [17]:
# Pickle one (token lists, gensim corpus, id2word) triple per n-gram level.
nogram_filename = "../pickled_data/nogram_corpus.pkl"
bigram_filename = "../pickled_data/bigram_corpus.pkl"
trigram_filename = "../pickled_data/trigram_corpus.pkl"

nogram_data = (nogram_list_lemmatized_texts, corpus, id2word)
bigram_data = (bigram_list_lemmatized_texts, bi_corpus, bi_id2word)
trigram_data = (trigram_list_lemmatized_texts, tri_corpus, tri_id2word)

# open(..., "wb") fails when the folder is missing, so create it up front
# (a no-op when it already exists).
os.makedirs(os.path.dirname(nogram_filename), exist_ok=True)

# One loop instead of three copy-pasted `with` blocks; every file is closed as
# soon as its payload has been written.
for pickle_path, payload in (
    (nogram_filename, nogram_data),
    (bigram_filename, bigram_data),
    (trigram_filename, trigram_data),
):
    with open(pickle_path, "wb") as handle:
        pickle.dump(payload, handle)