In [1]:
import os
import PyPDF2 as pypdf
#import pdfminer
import pandas as pd
import re
import time
from ipynb.fs.full.Trafficking_Ranking import get_trafficking_ratings

data_dir = './data'

In [2]:
#list to get police departments 
pd_list = ['police department', 'p.d.', 'pd', 'agency', 'law enforcement agency', 'sheriff department', 'highway patrol']
abrev_pd_list = ['federal bureau of investigations', 'fbi', 'dhs', 'f.b.i', 'd.h.s']


In [3]:
def get_court(text):
    return text.split("\n")[1]

# takes the text of a court case pdf and returns which court heard the case
def get_arresting_dept(text):
    matches = []

    for word in pd_list:
        #pattern that gets the two words before "police department" or whatever other word we look for
        pattern = r'(\b\w+\b)\s+(\b\w+\b)\s+' + word

        match_list = re.findall(pattern, text)

        # print the matches
        for match in match_list:
            combo = match[0] + ' ' + match[1] + ' ' + word
            matches.append(combo)

    word_dict = {}
    for word in abrev_pd_list: 
        word_count = text.count(word)
        word_dict[word] = word_count

    word_dict = {x:y for x,y in word_dict.items() if y!=0}

    #count number of each occurence 
    counts = {}
    for i in matches:
        counts[i] = counts.get(i, 0) + 1

    counts.update(word_dict)

    #sort the dictionary by value 
    counts = {k: v for k, v in sorted(counts.items(), key=lambda item: item[1], reverse=True)}
    return counts

def get_courtcode(text, code):
    if code in text:
        return 'YES'
    else:
        return 'NO'

def get_defendant_name(text):
    try:
        defs = text.split("reporter\n")[1].split('defendant')[0]
        ind = defs.count("plaintiff")
        defs = defs.split("plaintiff")[ind]

        defs = defs.replace("\n", "")
        if 'v.' in defs:
            defs = defs.split("v.")[1].split('defendant')[0].strip()[:-1]
        elif 'vs' in defs:
            defs = defs.split("vs.")[1].split('defendant')[0].strip()[:-1]

        elif 'respondent-appellant' in defs:
            defs = defs.split("vs.")[1].split('respondent')[0].strip()[:-1]

            #defs_final = defs
        return defs
    except:
        return None
        
    #return defs_final

def get_traffic_count(text, word_list):
    word_dict = {}
    for word in word_list: 
        word_count = text.count(word)
        word_dict[word] = word_count
    return word_dict


def get_traffic_appearance(text, word_list):
    word_dict = {}
    for word in word_list: 
        if word in text:
            word_dict[word] = 1
        else:
            word_dict[word] = 0
    return word_dict

In [4]:
def create_excel_output(state):
    # initialize empty files
    files = []
    defendant = []
    court = []
    c1591 = []
    c2421 = []
    c1594 = []
    c2422 = []
    c2423 = []
    c1328 = []
    c1952 = []
    c1581 = []
    c1584 = []
    c1589 = []
    c1590 = []
    c1592 = []
    sex_trafficking_score = []
    labor_trafficking_score = []
    pdept = []

    print("Starting the Excel Writing Process for: " + str(state))
    data_dir = "./United States/" + str(state)
    sub_folders = [name for name in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, name))]
    for folder in sub_folders:
        for filename in os.scandir(data_dir + './' + str(folder)):
            if (".pdf" in filename.name) and ("Attachment" not in filename.name):
                reader = pypdf.PdfReader(filename.path)
                pypdf2_text = ""
                for i in range(len(reader.pages)):
                    pypdf2_text += reader.pages[i].extract_text()  
                pypdf2_text = pypdf2_text.lower()
                print("Scraping: " + str(filename))
                files.append(filename.name)
                court.append(get_court(pypdf2_text))
                defendant.append(get_defendant_name(pypdf2_text))
                pdept.append(get_arresting_dept(pypdf2_text))
                c1591.append(get_courtcode(pypdf2_text, "§ 1591"))
                c2421.append(get_courtcode(pypdf2_text, "§ 2421"))
                c1594.append(get_courtcode(pypdf2_text, "§ 1594"))
                c2422.append(get_courtcode(pypdf2_text, "§ 2422"))
                c2423.append(get_courtcode(pypdf2_text, "§ 2423"))
                c1328.append(get_courtcode(pypdf2_text, "§ 1328"))
                c1952.append(get_courtcode(pypdf2_text, "§ 1952"))
                c1581.append(get_courtcode(pypdf2_text, "§ 1581"))
                c1584.append(get_courtcode(pypdf2_text, "§ 1584"))
                c1589.append(get_courtcode(pypdf2_text, "§ 1589"))
                c1590.append(get_courtcode(pypdf2_text, "§ 1590"))
                c1592.append(get_courtcode(pypdf2_text, "§ 1592"))
                sex_traffic_score, labor_traffic_score = get_trafficking_ratings(pypdf2_text)
                sex_trafficking_score.append(sex_traffic_score)
                labor_trafficking_score.append(labor_traffic_score)
            else:
                continue
            
    cases = pd.DataFrame({'file_name': files,
                          'Court': court,
                          'Defendant(s)': defendant,
                          #'Arresting PD(s)': pdept,
                          'Sex_Traffic_Score':sex_trafficking_score,
                          'Labor_Traffic_Score': labor_trafficking_score,
                          'Code_1591': c1591,
                          'Code_2421': c2421, 
                          'Code_1594': c1594, 
                          'Code_2422': c2422, 
                          'Code_2423': c2423, 
                          'Code_1328': c1328, 
                          'Code_1952': c1952,
                          'Code_1581': c1581,
                          'Code_1584': c1584,
                          'Code_1589': c1589,
                          'Code_1590': c1590,
                          'Code_1592': c1592})
    cases = cases.sort_values(by="Sex_Traffic_Score", ascending=False)
    filename = str(state) + "_Cases_Information_and_Rankings"
    cases.to_csv(str(data_dir) + '/' + str(filename) + '.csv')

In [5]:
# type state name, this is case sensitive based on the folder the items are inside (i.e. New York, Oklahoma, Virgin Islands)
state = "Oklahoma"
# Make sure that all of the states are in a folder named "United States" and that the "United States"
# folder is in the same location as this ipynb file, this function will automatically generate the excel output
# in the selected states folder
create_excel_output(state)

Starting the Excel Writing Process for: Oklahoma
Scraping: <DirEntry 'Barrett v. United States, 2012 U.S. Dist. LEXIS 115360.pdf'>
Scraping: <DirEntry 'Bishop v. Fed. Gov_t (As A Whole), 2020 U.S. Dist. LEXIS 37326.pdf'>
Scraping: <DirEntry 'Bosco_s Club, Inc. v. Oklahoma City, 598 F. Supp. 583.pdf'>
Scraping: <DirEntry 'Cailao v. Hotelmacher LLC, 2019 U.S. Dist. LEXIS 241947.pdf'>
Scraping: <DirEntry 'Campbell v. Cowley, 1993 U.S. Dist. LEXIS 21855.pdf'>
Scraping: <DirEntry 'Campbell v. Workman, 2010 U.S. Dist. LEXIS 31679.pdf'>
Scraping: <DirEntry 'Casilao v. Hotelmacher LLC, 2020 U.S. Dist. LEXIS 265546.pdf'>
Scraping: <DirEntry 'Casilao v. Hotelmacher LLC, 2021 U.S. Dist. LEXIS 188177.pdf'>
Scraping: <DirEntry 'Castro v. Oklahoma, 71 F.3d 1502.pdf'>
Scraping: <DirEntry 'Castro v. Ward, 138 F.3d 810.pdf'>
Scraping: <DirEntry 'Chellen v. John Pickle Co., 344 F. Supp. 2d 1278.pdf'>
Scraping: <DirEntry 'Conway v. Sutter, 2010 U.S. Dist. LEXIS 85062.pdf'>
Scraping: <DirEntry 'Copeland v

Scraping: <DirEntry 'United States v. Ehrens, 2015 U.S. Dist. LEXIS 160744.pdf'>
Scraping: <DirEntry 'United States v. Eubanks, 2022 U.S. Dist. LEXIS 134136.pdf'>
Scraping: <DirEntry 'United States v. Fishman, 645 F.3d 1175.pdf'>
Scraping: <DirEntry 'United States v. Garcia-Escalera, 998 F. Supp. 2d 1191.pdf'>
Scraping: <DirEntry 'United States v. Gillespie, 452 F.3d 1183.pdf'>
Scraping: <DirEntry 'United States v. Green, 435 F.3d 1265.pdf'>
Scraping: <DirEntry 'United States v. Gum, 2015 U.S. Dist. LEXIS 151581.pdf'>
Scraping: <DirEntry 'United States v. Gum, 2016 U.S. Dist. LEXIS 182030.pdf'>
Scraping: <DirEntry 'United States v. Hall, 2021 U.S. Dist. LEXIS 225831.pdf'>
Scraping: <DirEntry 'United States v. Hall, 424 F. Supp. 508.pdf'>
Scraping: <DirEntry 'United States v. Harring, 2016 U.S. Dist. LEXIS 49541.pdf'>
Scraping: <DirEntry 'United States v. Harring, 2018 U.S. Dist. LEXIS 179639.pdf'>
Scraping: <DirEntry 'United States v. Henderson, 829 Fed. Appx. 376.pdf'>
Scraping: <DirE

Scraping: <DirEntry 'Courtney v. Courtney, 1938 OK 538.pdf'>
Scraping: <DirEntry 'Cowles v. State, 1981 OK CR 132.pdf'>
Scraping: <DirEntry 'Davis v. State, 1996 OK CR 15.pdf'>
Scraping: <DirEntry 'de Jesus Garcia v. State, 1995 OK CR 58.pdf'>
Scraping: <DirEntry 'Dobbins v. Tex. Co., 1928 OK 696.pdf'>
Scraping: <DirEntry 'Dunford v. State, 1977 OK CR 109.pdf'>
Scraping: <DirEntry 'Edmons v. State, 1913 OK CR 153.pdf'>
Scraping: <DirEntry 'Edwards v. City of Sallisaw, 2014 OK 86.pdf'>
Scraping: <DirEntry 'Ex parte Bochmann, 1921 OK CR 203.pdf'>
Scraping: <DirEntry 'Ex parte Gammel, 1949 OK CR 81.pdf'>
Scraping: <DirEntry 'Ex parte Smith, 1923 OK CR 269.pdf'>
Scraping: <DirEntry 'Ex parte Woodruff, 1949 OK CR 98.pdf'>
Scraping: <DirEntry 'Faught v. City of Sapulpa, 1930 OK 218.pdf'>
Scraping: <DirEntry 'Findley v. Wilson, 1925 OK 805.pdf'>
Scraping: <DirEntry 'Fletcher v. State, 1961 OK CR 60.pdf'>
Scraping: <DirEntry 'Fry v. State ex rel. Dep_t of Corr., 2017 OK 77.pdf'>
Scraping: <Dir

Scraping: <DirEntry 'Towne v. Hubbard (In re Towne), 2000 OK 30.pdf'>
Scraping: <DirEntry 'Walter Stewart v. State, 1946 OK CR 110.pdf'>
Scraping: <DirEntry 'Walters v. J. C. Penney Co., 2003 OK 100.pdf'>
Scraping: <DirEntry 'Walton v. Donnelly, 1921 OK 258.pdf'>
Scraping: <DirEntry 'Warren v. Canard, 1911 OK 521.pdf'>
Scraping: <DirEntry 'Weatherford v. State, 1954 OK CR 147.pdf'>
Scraping: <DirEntry 'Western Union Tel. Co. v. Chouteau, 1911 OK 216.pdf'>
Scraping: <DirEntry 'White v. State, 1910 OK CR 174.pdf'>
Scraping: <DirEntry 'White v. State, 1995 OK CR 15.pdf'>
Scraping: <DirEntry 'Whitson v. City of Ada, 1935 OK 414.pdf'>
Scraping: <DirEntry 'Wilson v. State, 1919 OK CR 264.pdf'>
Scraping: <DirEntry 'Wines v. State, 1912 OK CR 201.pdf'>
Scraping: <DirEntry 'World Publ. Co. v. White, 2001 OK 48.pdf'>
Scraping: <DirEntry 'Yancey v. Thomas (In re Adoption of Baby Boy L.), 2013 OK CIV APP 63.pdf'>
Scraping: <DirEntry 'Young v. Terr. of Okla., 1899 OK 86.pdf'>


PermissionError: [Errno 13] Permission denied: './United States/Oklahoma/Oklahoma_Cases_Information_and_Rankings.csv'