In [1]:
import os
import PyPDF2 as pypdf
#import pdfminer
import pandas as pd
import re
import time

# Directory holding the court-opinion PDFs to scan. Set the CAPSTONE_DATA_DIR
# environment variable to point the notebook at a different copy of the data;
# when it is unset (or empty) the original hard-coded location is used.
data_dir = (os.environ.get('CAPSTONE_DATA_DIR')
            or '/Users/mhouck/Documents/school/capstone/1st Circuit Court/Maine')

In [2]:
# Sex-trafficking search vocabulary.
# Every entry is lowercase because it is matched as a plain substring against
# lowercased PDF text: stems such as 'prostitut' match any inflection, and
# space-padded entries such as ' ad ' only match when surrounded by spaces.

# Phrases naming the offence outright (not used in the numeric score).
sex_traffic_bucket = ['human traffic', 'sex traffic']
# Statute section citations; the same sections are also flagged per case as
# the Code_* columns of the output table.
sex_code_bucket = ['§ 1328', '§ 2421', '§ 2422', '§ 2423', '§ 1591', '§ 1952', '§ 1594']
# General terms and word stems.
sex_bucket_terms = ['exploit', 'prostitut', 'sex', 'child', 'threat', 'coerc', 'fraud', 'recruit', 'harbor','transport',
                    'provid', 'bonded labor', 'force', 'peonage', 'involuntary servitude', 'slavery','debt bondage',
                    'groom', 'obtain', 'isolat', 'abus', 'involuntary', 'blackmail', 'solicit', 'patroniz']
# Names of social-media and advertising sites.
sex_bucket_socials = ['snapchat', 'facebook', 'adultsearch', 'listcrawler', 'escortalligator', 'megapersonals', 'skipthegame']
# Slang and everyday words; many are common English words (e.g. 'room',
# 'phone', 'place'), so individual hits here are weak evidence.
sex_bucket_slang = ['the game', 'the life', 'choosing up', 'turn out', 'exit fee', 'in pocket', 'chicken hawk', 'momma',
                    'branding', 'circuit', 'eyeballing', 'stable', 'trade up', 'trade down', 'wifey', 'bottom',
                    'family', 'folks', 'seasoning', 'advert', ' ad ', 'post', 'online', 'internet', 'hotel',
                    'motel', 'room', 'phone', 'text', 'money', 'roses', 'cash', 'paid', 'owe', 'pimp', 'daddy',
                    'gorilla pimp', 'romeo pimp', 'boss', 'prostitute', 'girls', 'women', ' ho ', ' hoe ', 'bitch',
                    'escort', 'date', 'trick', 'out-call', 'in-call', 'trade', 'blade', 'track', 'stroll',
                    'lot lizard', 'quota', 'squaring up', 'customer', 'client', 'john', 'travel', 'greyhound', ' bus ',
                    'rental car', 'taxi', 'uber', 'lyft', 'place', 'brothel']

# Labor-trafficking search vocabulary (same substring-matching rules as above).

# Phrases naming the offence outright (not used in the numeric score).
labor_traffic_bucket = ['human traffic', 'labor traffic']
# Statute section citations; also flagged per case as Code_* columns.
labor_code_bucket = ['§ 1581', '§ 1584', '§ 1589', '§ 1590', '§ 1592']
# Visa types and legal terms of art. Note every '... visa' entry also
# contains the bare 'visa' entry, so a specific visa hit always counts twice.
labor_words_bucket = ['visa', 'h-2b visa', 'h-2a visa', 'f1 visa', 't visa', 'u visa', 'involuntary servitude',
                      'bonded labor', 'forced labor', 'peonage', 'slavery']
# General terms associated with labor exploitation.
labor_terms_bucket = ['labor', 'conceal', 'control', 'promise', 'false', 'payments', 'money', 'debt',
                      'bondage', 'work', 'alien', 'confiscate','passport', 'immigrant',
                      'immigration', 'document', 'transport', 'enticement', 'deception', 'snakehead']

# Terms used to find the agencies mentioned in a case.
# pd_list entries are searched for together with the two words that precede
# them (e.g. "<word> <word> police department"). They are appended to a regex
# with no trailing word boundary, so short entries such as 'pd' also match
# the start of longer words.
pd_list = ['police department', 'p.d.', 'pd', 'agency', 'law enforcement agency', 'sheriff department', 'highway patrol']
# Agency names/abbreviations counted on their own as plain substrings.
# The full name uses the singular "investigation": the earlier plural spelling
# could never match the agency's actual name in case text.
abrev_pd_list = ['federal bureau of investigation', 'fbi', 'dhs', 'f.b.i', 'd.h.s']


In [4]:
def get_court(text):
    """Return the second line of a case's extracted text.

    The caller stores this value as the name of the court that heard the
    case, which assumes the PDF's second text line carries the court name.

    Parameters
    ----------
    text : str
        Full text extracted from one court-case PDF.

    Returns
    -------
    str or None
        The second newline-delimited line, or None when the text has fewer
        than two lines (for example a PDF with no extractable text).
    """
    # Only the first two lines matter, so stop splitting after them.
    lines = text.split("\n", 2)
    return lines[1] if len(lines) > 1 else None

def get_arresting_dept(text, dept_terms=None, abbrev_terms=None):
    """Tally law-enforcement agency mentions found in a case's text.

    Two kinds of mention are counted:

    * For every term in ``dept_terms`` (e.g. 'police department'), each
      occurrence preceded by two whitespace-separated words is recorded as
      "<word> <word> <term>".
    * For every term in ``abbrev_terms`` (e.g. 'fbi'), the number of plain
      substring occurrences is recorded under the term itself; terms that
      never occur are omitted.

    Parameters
    ----------
    text : str
        Full (lowercased) text of one court-case PDF.
    dept_terms : list of str, optional
        Department suffixes to look for. Defaults to the module-level
        ``pd_list``.
    abbrev_terms : list of str, optional
        Agency names/abbreviations to count. Defaults to the module-level
        ``abrev_pd_list``.

    Returns
    -------
    dict
        Mention -> count, ordered from most to least frequent (ties keep
        first-seen order).
    """
    if dept_terms is None:
        dept_terms = pd_list
    if abbrev_terms is None:
        abbrev_terms = abrev_pd_list

    counts = {}
    for term in dept_terms:
        # Capture the two words before the term. re.escape keeps characters
        # such as the dots in 'p.d.' literal; unescaped they are regex
        # wildcards and would match unrelated text like "pxdy".
        pattern = r'(\b\w+\b)\s+(\b\w+\b)\s+' + re.escape(term)
        for first_word, second_word in re.findall(pattern, text):
            mention = first_word + ' ' + second_word + ' ' + term
            counts[mention] = counts.get(mention, 0) + 1

    # Stand-alone agency names: keep only those that actually appear.
    for term in abbrev_terms:
        occurrences = text.count(term)
        if occurrences:
            counts[term] = occurrences

    # sorted() is stable, so equal counts stay in first-seen order.
    return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))

def get_courtcode(text, code):
    """Report whether a statute citation appears in a case's text.

    Parameters
    ----------
    text : str
        Text to search.
    code : str
        Citation to look for, matched as an exact substring (e.g. "§ 1591").

    Returns
    -------
    str
        'YES' when ``code`` occurs in ``text``, otherwise 'NO'.
    """
    return 'YES' if code in text else 'NO'

def get_defendant_name(text):
    """Best-effort extraction of the defendant's name from a case caption.

    The caption is taken to be the text between the first "reporter" line
    and the first following "defendant", restricted to whatever comes after
    the last "plaintiff". Within that caption the name is:

    * the text after "v." when present;
    * otherwise the text after "vs." (or a bare "vs");
    * otherwise, for a "respondent-appellant" caption, the text before
      "respondent";
    * otherwise the caption itself, unmodified.

    In the first three cases the result is stripped and its final character
    is dropped (presumably the comma before "defendant"/"respondent" --
    verify against real captions).

    Parameters
    ----------
    text : str
        Full (lowercased) text of one court-case PDF.

    Returns
    -------
    str or None
        The extracted name, or None when ``text`` is not a string or has no
        "reporter" line to anchor on.
    """
    if not isinstance(text, str):
        return None

    sections = text.split("reporter\n")
    if len(sections) < 2:
        return None  # no "reporter" header line to anchor on

    caption = sections[1].split('defendant')[0]
    # Keep only what follows the last "plaintiff", then flatten line breaks.
    caption = caption.split("plaintiff")[-1].replace("\n", "")
    # Removing newlines can rejoin a "defendant" that was wrapped across two
    # lines, so cut at it once more.
    caption = caption.split('defendant')[0]

    if 'v.' in caption:
        return caption.split("v.")[1].strip()[:-1]
    if 'vs' in caption:
        # Previously this branch always split on "vs." and therefore failed
        # whenever the caption used "vs" without the period.
        separator = "vs." if "vs." in caption else "vs"
        return caption.split(separator)[1].strip()[:-1]
    if 'respondent-appellant' in caption:
        # No "v."/"vs" exists in this branch, so the party is whatever
        # precedes "respondent" (the old split on "vs." could never succeed).
        return caption.split('respondent')[0].strip()[:-1]
    return caption

def get_traffic_count(text, word_list):
    """Count how often each search term occurs in ``text``.

    Parameters
    ----------
    text : str
        Text to search.
    word_list : list of str
        Terms to count; each is matched as a non-overlapping substring.

    Returns
    -------
    dict
        Term -> number of occurrences (0 for terms that never appear).
    """
    return {term: text.count(term) for term in word_list}


def get_traffic_appearance(text, word_list):
    """Flag which search terms occur at least once in ``text``.

    Parameters
    ----------
    text : str
        Text to search.
    word_list : list of str
        Terms to look for; each is matched as a plain substring.

    Returns
    -------
    dict
        Term -> 1 when the term occurs in ``text``, else 0.
    """
    return {term: int(term in text) for term in word_list}

In [5]:
def get_trafficking_ratings(text):
    """Score a case's text for sex-trafficking and labor-trafficking signals.

    For each vocabulary bucket the "hit ratio" is the fraction of its terms
    that appear at least once in ``text``. The ratios are combined as:

    Sex trafficking
        code    = ratio(sex_code_bucket) * 10
        terms   = (1 + ratio(sex_bucket_terms)) * 3
        socials = (1 + ratio(sex_bucket_socials)) * 2
        slang   = (1 + ratio(sex_bucket_slang))
        overlap = (terms/3) * (socials/2) * slang * 2
        total   = code + terms + socials + slang + overlap

    Labor trafficking
        code    = ratio(labor_code_bucket) * 10
        words   = (1 + ratio(labor_words_bucket)) * 3
        terms   = (1 + ratio(labor_terms_bucket)) * 2
        overlap = (words/3) * (terms/2) * 3
        total   = code + words + terms + overlap

    Because of the "1 +" offsets, a text with no hits at all still scores
    8 on both scales; the maxima are 38 (sex) and 32 (labor), so the two
    totals are not directly comparable.

    NOTE(review): ``sex_traffic_bucket`` and ``labor_traffic_bucket`` were
    counted here previously but never entered either score, so those unused
    computations have been removed. Confirm the phrases were not meant to
    contribute.

    Parameters
    ----------
    text : str
        Full (lowercased) text of one court-case PDF.

    Returns
    -------
    tuple of (float, float)
        ``(sex_scores_total, labor_scores_total)``.
    """
    def hit_ratio(bucket):
        # Fraction of the bucket's (distinct) terms present in the text.
        flags = get_traffic_appearance(text, bucket)
        return sum(flags.values()) / len(flags)

    # Sex trafficking calculation
    sex_code_score = hit_ratio(sex_code_bucket) * 10
    sex_terms_score = (1 + hit_ratio(sex_bucket_terms)) * 3
    sex_socials_score = (1 + hit_ratio(sex_bucket_socials)) * 2
    sex_slang_score = 1 + hit_ratio(sex_bucket_slang)
    # Dividing by each weight recovers the (1 + ratio) factors, so the
    # overlap term grows only when several buckets hit together.
    sex_overlap_score = ((sex_terms_score / 3) * (sex_socials_score / 2) *
                         sex_slang_score) * 2

    sex_scores_total = (sex_code_score + sex_terms_score + sex_socials_score +
                        sex_slang_score + sex_overlap_score)

    # Labor trafficking calculation
    labor_code_score = hit_ratio(labor_code_bucket) * 10
    labor_words_score = (1 + hit_ratio(labor_words_bucket)) * 3
    labor_terms_score = (1 + hit_ratio(labor_terms_bucket)) * 2
    labor_overlap_score = ((labor_words_score / 3) * (labor_terms_score / 2)) * 3

    labor_scores_total = (labor_code_score + labor_words_score + labor_terms_score +
                          labor_overlap_score)

    return sex_scores_total, labor_scores_total



In [6]:
# NOTE(review): this cell re-defines get_traffic_count, which an earlier cell
# already defines with the same behavior; consider deleting one copy.
def get_traffic_count(text, word_list):
    """Count how often each search term occurs in ``text``.

    Returns a dict mapping every term in ``word_list`` to its number of
    non-overlapping substring occurrences (0 when absent).
    """
    return {term: text.count(term) for term in word_list}


# NOTE(review): this cell re-defines get_traffic_appearance, which an earlier
# cell already defines with the same behavior; consider deleting one copy.
def get_traffic_appearance(text, word_list):
    """Flag which search terms occur at least once in ``text``.

    Returns a dict mapping every term in ``word_list`` to 1 when it appears
    in ``text`` as a substring and 0 otherwise.
    """
    return {term: int(term in text) for term in word_list}

In [7]:
# Dry run over the data directory: extract and lowercase the text of every
# case PDF (names containing ".pdf" but not "Attachment"). Nothing is stored
# beyond the last file's text; this cell was used with print statements to
# inspect the extracted text and get_trafficking_ratings output per file.
for filename in os.scandir(data_dir):
    is_case_pdf = (".pdf" in filename.name) and ("Attachment" not in filename.name)
    if not is_case_pdf:
        continue
    reader = pypdf.PdfReader(filename.path)
    page_texts = [reader.pages[i].extract_text() for i in range(len(reader.pages))]
    pypdf2_text = "".join(page_texts).lower()

In [8]:
st = time.time()

# One list per output column. The DataFrame cell that follows reads these
# names directly, so each must remain a module-level list.
files = []
defendant = []
court = []
c1591 = []
c2421 = []
c1594 = []
c2422 = []
c2423 = []
c1328 = []
c1952 = []
c1581 = []
c1584 = []
c1589 = []
c1590 = []
c1592 = []
sex_trafficking_score = []
labor_trafficking_score = []
pdept = []

# Statute citation -> the column list that records its YES/NO flag, so the
# loop below needs one get_courtcode call site instead of twelve copies.
code_columns = {
    "§ 1591": c1591,
    "§ 2421": c2421,
    "§ 1594": c1594,
    "§ 2422": c2422,
    "§ 2423": c2423,
    "§ 1328": c1328,
    "§ 1952": c1952,
    "§ 1581": c1581,
    "§ 1584": c1584,
    "§ 1589": c1589,
    "§ 1590": c1590,
    "§ 1592": c1592,
}

# os.scandir yields entries in arbitrary order; sorting by name makes the row
# order reproducible, and the context manager closes the directory handle.
with os.scandir(data_dir) as entries:
    case_pdfs = sorted(
        (entry for entry in entries
         if (".pdf" in entry.name) and ("Attachment" not in entry.name)),
        key=lambda entry: entry.name,
    )

for filename in case_pdfs:
    reader = pypdf.PdfReader(filename.path)
    # `or ""` guards against a page that yields no text object at all.
    pypdf2_text = "".join(page.extract_text() or "" for page in reader.pages).lower()

    files.append(filename.name)
    court.append(get_court(pypdf2_text))
    defendant.append(get_defendant_name(pypdf2_text))
    pdept.append(get_arresting_dept(pypdf2_text))
    for code, column in code_columns.items():
        column.append(get_courtcode(pypdf2_text, code))
    sex_traffic_score, labor_traffic_score = get_trafficking_ratings(pypdf2_text)
    sex_trafficking_score.append(sex_traffic_score)
    labor_trafficking_score.append(labor_traffic_score)

et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

AttributeError: 'NoneType' object has no attribute 'items'

In [None]:
# Assemble the per-case lists into one table (one row per PDF) and rank the
# rows from highest to lowest sex-trafficking score.
case_columns = {
    'file_name': files,
    'Court': court,
    'Defendant(s)': defendant,
    'Arresting PD(s)': pdept,
    'Sex_Traffic_Score': sex_trafficking_score,
    'Labor_Traffic_Score': labor_trafficking_score,
    # YES/NO flags for each statute section
    'Code_1591': c1591,
    'Code_2421': c2421,
    'Code_1594': c1594,
    'Code_2422': c2422,
    'Code_2423': c2423,
    'Code_1328': c1328,
    'Code_1952': c1952,
    'Code_1581': c1581,
    'Code_1584': c1584,
    'Code_1589': c1589,
    'Code_1590': c1590,
    'Code_1592': c1592,
}
cases = pd.DataFrame(case_columns).sort_values(by="Sex_Traffic_Score", ascending=False)

In [None]:
# Preview the first 50 rows of the case table.
cases.head(50)

In [178]:
# Write the case table to CSV in the current working directory. With
# to_csv's default index=True the DataFrame index is also written as the
# first, unnamed column.
cases.to_csv('maine_example.csv')