In [1]:
import os
import PyPDF2 as pypdf
#import pdfminer
import pandas as pd
import re
import time

data_dir = './data'

In [61]:
def get_court(text):
    return text.split("\n")[1]

# takes the text of a court case pdf and returns which court heard the case
def get_arresting_dept(text):
    """
    takes the text of a court case pdf and returns a dictionary of the
    possible arresting agencies (agency:ocurrence), to be used in determining arresting agency
    for affidavit requests
    """
    return df

def get_courtcode(text, code):
    if code in text:
        return 'YES'
    else:
        return 'NO'

def get_defendant_name(text):
    try:
        defs = text.split("reporter\n")[1].split('defendant')[0]
        ind = defs.count("plaintiff")
        defs = defs.split("plaintiff")[ind]

        defs = defs.replace("\n", "")
        if 'v.' in defs:
            defs = defs.split("v.")[1].split('defendant')[0].strip()[:-1]
        elif 'vs' in defs:
            defs = defs.split("vs.")[1].split('defendant')[0].strip()[:-1]

        elif 'respondent-appellant' in defs:
            defs = defs.split("vs.")[1].split('respondent')[0].strip()[:-1]

            #defs_final = defs
        return defs
    except:
        return None
        
    #return defs_final

def get_traffic_count(text, word_list):
    word_dict = {}
    for word in word_list: 
        word_count = text.count(word)
        word_dict[word] = word_count
    return word_dict


def get_traffic_appearance(text, word_list):
    word_dict = {}
    for word in word_list: 
        if word in text:
            word_dict[word] = 1
        else:
            word_dict[word] = 0
    return word_dict

In [88]:
# Sex Trafficking Codes and Terms for Calculations
sex_traffic_bucket = ['human traffic', 'sex traffic']
sex_code_bucket = ['§ 1328', '§ 2421', '§ 2422', '§ 2423', '§ 1591', '§ 1952', '§ 1594']
sex_bucket_terms = ['exploit', 'prostitut', 'sex', 'child', 'threat', 'coerc', 'fraud', 'recruit', 'harbor','transport',
                    'provid', 'bonded labor', 'force', 'peonage', 'involuntary servitude', 'slavery','debt bondage',
                    'groom', 'obtain', 'isolat', 'abus', 'involuntary', 'blackmail', 'solicit', 'patroniz']
sex_bucket_socials = ['snapchat', 'facebook', 'adultsearch', 'listcrawler', 'escortalligator', 'megapersonals', 'skipthegame']
sex_bucket_slang = ['the game', 'the life', 'choosing up', 'turn out', 'exit fee', 'in pocket', 'chicken hawk', 'momma',
                    'branding', 'circuit', 'eyeballing', 'stable', 'trade up', 'trade down', 'wifey', 'bottom',
                    'family', 'folks', 'seasoning', 'advert', ' ad ', 'post', 'online', 'internet', 'hotel',
                    'motel', 'room', 'phone', 'text', 'money', 'roses', 'cash', 'paid', 'owe', 'pimp', 'daddy',
                    'gorilla pimp', 'romeo pimp', 'boss', 'prostitute', 'girls', 'women', ' ho ', ' hoe ', 'bitch',
                    'escort', 'date', 'trick', 'out-call', 'in-call', 'trade', 'blade', 'track', 'stroll',
                    'lot lizard', 'quota', 'squaring up', 'customer', 'client', 'john', 'travel', 'greyhound', ' bus ',
                    'rental car', 'taxi', 'uber', 'lyft', 'place', 'brothel']

# Labor Trafficking Codes and Terms for Calculations
labor_traffic_bucket = ['human traffic', 'labor traffic']
labor_code_bucket = ['§ 1581', '§ 1584', '§ 1589', '§ 1590', '§ 1592']
labor_words_bucket = ['visa', 'h-2b visa', 'h-2a visa', 'f1 visa', 't visa', 'u visa', 'involuntary servitude',
                      'bonded labor', 'forced labor', 'peonage', 'slavery']
labor_terms_bucket = ['labor', 'conceal', 'control', 'promise', 'false', 'payments', 'money', 'debt',
                      'bondage', 'work', 'alien', 'confiscate','passport', 'immigrant',
                      'immigration', 'document', 'transport', 'enticement', 'deception', 'snakehead']

In [138]:
def get_trafficking_ratings(text):
    # Sex Trafficking Calculation
    sex_traffic_count = get_traffic_count(text, sex_traffic_bucket)
    sex_code_appearance = get_traffic_appearance(text, sex_code_bucket)
    sex_terms_appearance = get_traffic_appearance(text, sex_bucket_terms)
    sex_socials_appearance = get_traffic_appearance(text, sex_bucket_socials)
    sex_slang_appearance = get_traffic_appearance(text, sex_bucket_slang)
    
    sex_code_appearance_score = (sum(sex_code_appearance.values()) / len(sex_code_appearance)) * 10
    sex_terms_appearance_score = (1+(sum(sex_terms_appearance.values()) / len(sex_terms_appearance))) * 3
    sex_socials_appearance_score = (1+(sum(sex_socials_appearance.values()) / len(sex_socials_appearance))) * 2
    sex_slang_appearance_score = (1+(sum(sex_slang_appearance.values()) / len(sex_slang_appearance)))
    sex_overlap_score = ((sex_terms_appearance_score/3) * (sex_socials_appearance_score/2) *
                                     sex_slang_appearance_score) * 2
   
    sex_scores_total = (sex_code_appearance_score + sex_terms_appearance_score + sex_socials_appearance_score +
                        sex_slang_appearance_score + sex_overlap_score)
    
    # Labor Trafficking Calculation
    labor_traffic_count = get_traffic_count(text, labor_traffic_bucket)
    labor_code_appearance = get_traffic_appearance(text, labor_code_bucket)
    labor_words_appearance = get_traffic_appearance(text, labor_words_bucket)
    labor_terms_appearance = get_traffic_appearance(text, labor_terms_bucket)
    
    labor_code_appearance_score = (sum(labor_code_appearance.values()) / len(labor_code_appearance)) * 10
    labor_words_appearance_score = (1+(sum(labor_words_appearance.values()) / len(labor_words_appearance))) * 3
    labor_terms_appearance_score = (1+(sum(labor_terms_appearance.values()) / len(labor_terms_appearance))) * 2
    labor_overlap_score = ((labor_words_appearance_score/3) * (labor_terms_appearance_score/2)) * 3
    
    labor_scores_total = (labor_code_appearance_score + labor_words_appearance_score + labor_terms_appearance_score +
                        labor_overlap_score)
    
    return sex_scores_total, labor_scores_total



In [139]:
for filename in os.scandir(data_dir):
        if (".pdf" in filename.name) and ("Attachment" not in filename.name):
            reader = pypdf.PdfReader(filename.path)
            pypdf2_text = ""
            for i in range(len(reader.pages)):
                pypdf2_text += reader.pages[i].extract_text()  
            pypdf2_text = pypdf2_text.lower()
            print(filename.name)
            print(get_trafficking_ratings(pypdf2_text))
        else:
            continue

Alliance for Constitutional Sex Offense Laws, Inc. v. Dep't of State, 2018 U.S. Dist. LEXIS 225913.pdf
(15.341596638655462, 11.863636363636363)
Alvarez v. Warden, 2019 U.S. Dist. LEXIS 57594.pdf
(13.402941176470588, 12.5)
Anonymous v. Gonzalez, 2016 U.S. Dist. LEXIS 197386.pdf
(9.975882352941177, 9.25)
Backpage.com, LLC v. McKenna, 881 F. Supp. 2d 1262.pdf
(14.504285714285714, 9.5)
Blackman-Baham v. Kelly, 2017 U.S. Dist. LEXIS 24175.pdf
(10.23764705882353, 9.5)
Brown v. Johnson, 2022 U.S. Dist. LEXIS 179329.pdf
(10.499411764705883, 9.75)
City & Cty. of San Francisco v. Sessions, 349 F. Supp. 3d 924.pdf
(10.617058823529412, 10.918181818181816)
City of Huntington Beach v. Becerra, 44 Cal. App. 5th 243.pdf
(10.24235294117647, 10.0)
City of Los Angeles v. Superior Court, 29 Cal. 4th 1.pdf
(9.499999999999998, 9.25)
Clarke v. Maddow, 2022 U.S. Dist. LEXIS 134375.pdf
(10.838235294117645, 9.645454545454546)
Dahlia v. Rodriguez, 735 F.3d 1060.pdf
(10.51, 9.5)
Doe v. Kerry, 2016 U.S. Dist. LEXI

KeyboardInterrupt: 

In [140]:
def get_traffic_count(text, word_list):
    word_dict = {}
    for word in word_list: 
        word_count = text.count(word)
        word_dict[word] = word_count
    return word_dict


def get_traffic_appearance(text, word_list):
    word_dict = {}
    for word in word_list: 
        if word in text:
            word_dict[word] = 1
        else:
            word_dict[word] = 0
    return word_dict

In [141]:
st = time.time()

files = []
defendant = []
court = []
c1591 = []
c2421 = []
c1594 = []
c2422 = []
c2423 = []
c1328 = []
c1952 = []
c1581 = []
c1584 = []
c1589 = []
c1590 = []
c1592 = []
sex_trafficking_score = []
labor_trafficking_score = []

for filename in os.scandir(data_dir):
        if (".pdf" in filename.name) and ("Attachment" not in filename.name):
            reader = pypdf.PdfReader(filename.path)
            pypdf2_text = ""
            for i in range(len(reader.pages)):
                pypdf2_text += reader.pages[i].extract_text()  
            pypdf2_text = pypdf2_text.lower()
            print(filename.name)
            files.append(filename.name)
            court.append(get_court(pypdf2_text))
            defendant.append(get_defendant_name(pypdf2_text))
            c1591.append(get_courtcode(pypdf2_text, "§ 1591"))
            c2421.append(get_courtcode(pypdf2_text, "§ 2421"))
            c1594.append(get_courtcode(pypdf2_text, "§ 1594"))
            c2422.append(get_courtcode(pypdf2_text, "§ 2422"))
            c2423.append(get_courtcode(pypdf2_text, "§ 2423"))
            c1328.append(get_courtcode(pypdf2_text, "§ 1328"))
            c1952.append(get_courtcode(pypdf2_text, "§ 1952"))
            c1581.append(get_courtcode(pypdf2_text, "§ 1581"))
            c1584.append(get_courtcode(pypdf2_text, "§ 1584"))
            c1589.append(get_courtcode(pypdf2_text, "§ 1589"))
            c1590.append(get_courtcode(pypdf2_text, "§ 1590"))
            c1592.append(get_courtcode(pypdf2_text, "§ 1592"))
            sex_traffic_score, labor_traffic_score = get_trafficking_ratings(pypdf2_text)
            sex_trafficking_score.append(sex_traffic_score)
            labor_trafficking_score.append(labor_traffic_score)
            print(get_trafficking_ratings(pypdf2_text))
        else:
            continue
            
et = time.time()            
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

Alliance for Constitutional Sex Offense Laws, Inc. v. Dep't of State, 2018 U.S. Dist. LEXIS 225913.pdf
(15.341596638655462, 11.863636363636363)
Alvarez v. Warden, 2019 U.S. Dist. LEXIS 57594.pdf
(13.402941176470588, 12.5)
Anonymous v. Gonzalez, 2016 U.S. Dist. LEXIS 197386.pdf
(9.975882352941177, 9.25)
Backpage.com, LLC v. McKenna, 881 F. Supp. 2d 1262.pdf
(14.504285714285714, 9.5)
Blackman-Baham v. Kelly, 2017 U.S. Dist. LEXIS 24175.pdf
(10.23764705882353, 9.5)
Brown v. Johnson, 2022 U.S. Dist. LEXIS 179329.pdf
(10.499411764705883, 9.75)
City & Cty. of San Francisco v. Sessions, 349 F. Supp. 3d 924.pdf
(10.617058823529412, 10.918181818181816)
City of Huntington Beach v. Becerra, 44 Cal. App. 5th 243.pdf
(10.24235294117647, 10.0)
City of Los Angeles v. Superior Court, 29 Cal. 4th 1.pdf
(9.499999999999998, 9.25)
Clarke v. Maddow, 2022 U.S. Dist. LEXIS 134375.pdf
(10.838235294117645, 9.645454545454546)
Dahlia v. Rodriguez, 735 F.3d 1060.pdf
(10.51, 9.5)
Doe v. Kerry, 2016 U.S. Dist. LEXI

People v. Morante, 20 Cal. 4th 403.pdf
(11.284117647058824, 10.0)
People v. Morris, 2019 Cal. App. Unpub. LEXIS 3080.pdf
(12.501176470588238, 9.75)
People v. Moses, 10 Cal. 5th 893.pdf
(12.08529411764706, 9.863636363636363)
People v. Moses, 38 Cal. App. 5th 757.pdf
(12.484453781512606, 9.599999999999998)
People v. Muhammad, 2016 Cal. App. Unpub. LEXIS 2008.pdf
(11.667899159663865, 8.75)
People v. Nguyen, 2009 Cal. App. Unpub. LEXIS 2239.pdf
(11.331764705882351, 9.25)
People v. Norris, 88 Cal. App. 3d Supp. 32.pdf
(9.35, 8.75)
People v. Ortiz, 2019 Cal. App. Unpub. LEXIS 1784.pdf
(12.407731092436975, 9.75)
People v. Otis, 2018 Cal. App. Unpub. LEXIS 4169.pdf
(12.204285714285714, 9.25)
People v. Palmer, 2019 Cal. App. Unpub. LEXIS 1556.pdf
(12.60588235294118, 10.0)
People v. Parker, 2021 Cal. App. Unpub. LEXIS 4218.pdf
(12.664201680672269, 10.39090909090909)
People v. Patterson, 2021 Cal. App. Unpub. LEXIS 6042.pdf
(10.73, 9.0)
People v. Reed, 2022 Cal. App. Unpub. LEXIS 6754.pdf
(13.347

In [65]:
cases = pd.DataFrame({'file_name': files,
                      'Court': court,
                      'Defendant(s)': defendant,
                      'Sex_Traffic_Score':sex_trafficking_score,
                      'Labor_Traffic_Score': labor_trafficking_score,
                      'Code_1591': c1591,
                      'Code_2421': c2421, 
                      'Code_1594': c1594, 
                      'Code_2422': c2422, 
                      'Code_2423': c2423, 
                      'Code_1328': c1328, 
                      'Code_1952': c1952,
                      'Code_1581': c1581,
                      'Code_1584': c1584,
                      'Code_1589': c1589,
                      'Code_1590': c1590,
                      'Code_1592': c1592})

In [54]:
len(cases)

200

In [151]:
cases.to_csv('../Albuquerque/FilteredAlbuquerque.csv')

# Seth working on names

In [None]:
'''
files = []
c1591 = []
c2421 = []
c1594 = []
c2422 = []
c2423 = []
c1328 = []
c1952 = []
alb_pd = []
all_pd = []
'''
for filename in os.scandir(data_dir):
        if ".pdf" in filename.name:
            reader = pypdf.PdfReader(filename.path)
            pypdf2_text = ""
            for i in range(len(reader.pages)):
                pypdf2_text += reader.pages[i].extract_text()  
            files.append(filename.name)
            
            '''
            c1591.append(get_courtcode(pypdf2_text, "1591"))
            c2421.append(get_courtcode(pypdf2_text, "2421"))
            c1594.append(get_courtcode(pypdf2_text, "1594"))
            c2422.append(get_courtcode(pypdf2_text, "2422"))
            c2423.append(get_courtcode(pypdf2_text, "2423"))
            c1328.append(get_courtcode(pypdf2_text, "1328"))
            c1952.append(get_courtcode(pypdf2_text, "1952"))
            alb_pd.append(get_alb_pd(pypdf2_text))
            all_pd.append(get_all_pd(pypdf2_text))
            '''
        else:
            continue

In [None]:
cases = pd.DataFrame({'file_name': files,
                      'Code_1591': c1591,
                      'Code_2421': c2421, 
                      'Code_1594': c1594, 
                      'Code_2422': c2422, 
                      'Code_2423': c2423, 
                      'Code_1328': c1328, 
                      'Code_1952': c1952, 
                     'AlbPD_Mentions':alb_pd, 
                     'AllPD_Mentions': all_pd})
cases

# break
-------------------------------------------------------------

In [30]:
cases

Unnamed: 0,file_name,Code_1591,Code_2421,Code_1594,Code_2422,Code_2423,Code_1328,Code_1952,AlbPD_Mentions,AllPD_Mentions,TotalCharges
0,"A.M. v. N.M. Dep_t of Health, 148 F. Supp. 3d ...",0,0,0,0,0,0,0,0,0,0
1,"ACLU of N.M. v. City of Albuquerque, 139 N.M. ...",0,0,0,0,0,0,0,1,12,0
2,"Aguilar v. McAleenan, 2019 U.S. Dist. LEXIS 19...",0,0,0,0,0,0,0,0,0,0
3,"Allen v. City of Albuquerque, 2015 U.S. Dist. ...",0,0,0,0,0,0,0,0,3,0
4,"Alvarado v. KOB-TV, L.L.C., 493 F.3d 1210.pdf",0,0,0,0,0,0,0,3,6,0
...,...,...,...,...,...,...,...,...,...,...,...
266,"Webb v. Padilla, 2009 U.S. Dist. LEXIS 101489.pdf",0,0,0,0,0,0,0,3,8,0
267,"White v. Town of Hurley, 2019 U.S. Dist. LEXIS...",0,0,0,0,0,0,0,0,39,0
268,"Womack v. Unified Gov_t, 2021 U.S. Dist. LEXIS...",0,0,0,0,0,0,0,0,17,0
269,"Young v. Wilham, 2017-NMCA-087.pdf",0,0,0,0,0,1,0,1,42,1


In [31]:
# Read in the text from case PDFs and store in dataframe
cases = pd.DataFrame({'file_name': pd.Series(dtype='str'),
                      #'court': pd.Series(dtype='str'),                      
                      #'acts': pd.Series(dtype='str'),
                      #'elements_used': pd.Series(dtype='str'),
                      #'location_likely': pd.Series(dtype='str'),
                      #'arresting_department_likely': pd.Series(dtype='float'), 
                      'Code_1591': pd.Series(dtype='int'), 
                      'Code_2421': pd.Series(dtype='int'), 
                      'Code_1594': pd.Series(dtype='int'), 
                      'Code_2422': pd.Series(dtype='int'), 
                      'Code_2423': pd.Series(dtype='int'), 
                      'Code_1328': pd.Series(dtype='int'), 
                      'Code_1952': pd.Series(dtype='int')})

In [32]:
for filename in os.scandir(data_dir):
        if ".pdf" in filename.name:
            df = pd.DataFrame({'file_name': pd.Series(dtype='str'),
                                #'court': pd.Series(dtype='str'),                      
                                #'acts': pd.Series(dtype='str'),
                                #'elements_used': pd.Series(dtype='str'),
                                #'location_likely': pd.Series(dtype='str'),
                                #'arresting_department_likely': pd.Series(dtype='float'), 
                                  'Code_1591': pd.Series(dtype='int'), 
                                  'Code_2421': pd.Series(dtype='int'), 
                                  'Code_1594': pd.Series(dtype='int'), 
                                  'Code_2422': pd.Series(dtype='int'), 
                                  'Code_2423': pd.Series(dtype='int'), 
                                  'Code_1328': pd.Series(dtype='int'), 
                                  'Code_1952': pd.Series(dtype='int')})
            '''
            if "Attachment" in filename:
                main_document_name = re.sub("_Attachment[0-9]+", "", filename.name)
                if (cases['file_name'].eq(main_document_name)).any():
                    # TODO (can we scrape the attachments because they are often screenshots)
                    # add the attachment and what it is to the dictionary
                    attachment_type = get_attachment_type()
                else:
                    # TODO
                    # create a new pandas column for the main court file we extracted and add this attachment as the first
                    # in the attachment dict, then 
            else:
            '''
            reader = pypdf.PdfReader(filename.path)
            pypdf2_text = ""
            for i in range(len(reader.pages)):
                pypdf2_text += reader.pages[i].extract_text() 
            df['file_name'] = [filename.name]
            #df['court'] = get_court(pypdf2_text)
            #df['acts'] = get_acts(pypdf2_text)
            #df['elements_used'] = get_elements(pypdf2_text)
            #df['arresting_department_likely'] = get_arresting_dept(pypdf2_text)
            df['Code_1591'] = [get_courtcode(pypdf2_text, "1591")]
            df['Code_2421'] = [get_courtcode(pypdf2_text, "2421")]
            df['Code_1594'] = [get_courtcode(pypdf2_text, "1594")]
            df['Code_2422'] = [get_courtcode(pypdf2_text, "2422")]
            df['Code_2423'] = [get_courtcode(pypdf2_text, "2423")]
            df['Code_1328'] = [get_courtcode(pypdf2_text, "1328")]
            df['Code_1952'] = [get_courtcode(pypdf2_text, "1952")]
            cases = cases.append(df)


  cases = cases.append(df)
  cases = cases.append(df)
  cases = cases.append(df)
  cases = cases.append(df)
  cases = cases.append(df)
  cases = cases.append(df)
  cases = cases.append(df)
  cases = cases.append(df)


KeyboardInterrupt: 

In [None]:
cases

In [35]:
# messing with pypdf
df = pd.DataFrame({'file_name': pd.Series(dtype='str'),
                    'acts': pd.Series(dtype='str'),
                    'elements_used': pd.Series(dtype='str'),
                    'location_likely': pd.Series(dtype='str'),
                    'arresting_department_likely': pd.Series(dtype='float')})
df['file_name'] = "A.M. v. N.M. Dep't of Health, 148 F. Supp. 3d 1232.pdf"
reader = pypdf.PdfReader("../data/A.M. v. N.M. Dep't of Health, 148 F. Supp. 3d 1232.pdf")
pypdf2_text = ""
for i in range(len(reader.pages)):
    pypdf2_text += reader.pages[i].extract_text() 

FileNotFoundError: [Errno 2] No such file or directory: "./data/A.M. v. N.M. Dep't of Health, 148 F. Supp. 3d 1232.pdf"

In [78]:
print(pypdf2_text)

 Zamora v. Bd. of Educ. for the Las Cruces Pub. Schs
United States District Court for the District of New Mexico
May 22, 2013, Decided; May 22, 2013, Filed
No. 12-CV-550 MCA/CG
Reporter
2013 U.S. Dist. LEXIS 203078 *; 2013 WL 12086304
DENNIS ZAMORA, Plaintiff, v. BOARD OF EDUCATION FOR THE LAS CRUCES PUBLIC SCHOOLS, 
Defendants.
Subsequent History: Affirmed by Zamora v. Bd. of Educ. for the Las Cruces Pub. Sch., 553 Fed. Appx. 786, 2014 
U.S. App. LEXIS 1380 (10th Cir. N.M., Jan. 24, 2014)
Core Terms
Rounds, terminating, employees, investigator, sexual harassment, hostile work environment, proffered, allegations, 
pretext, adverse employment action, summary judgment, national origin, complaints, renew, interviewed, contends, 
argues, hired, discriminatory, Schools, submits, woman, prima facie case, conversation, pretextual, supervised, 
yelled, non discriminatory reason, proffered evidence, deposition
Counsel:  [*1] For Dennis Zamora, Plaintiff: Daniela Labinoti, Law Firm of Daniela La

In [92]:
re.findall('(?<=Core Terms).*$', pypdf2_text)

[]

In [115]:
#pypdf2_text.lower().partition("core terms\n")[2].split('\n')[0:5]
' '.join(pypdf2_text.lower().partition("core terms\n")[2].split('\n')[0:5])

'rounds, terminating, employees, investigator, sexual harassment, hostile work environment, proffered, allegations,  pretext, adverse employment action, summary judgment, national origin, complaints, renew, interviewed, contends,  argues, hired, discriminatory, schools, submits, woman, prima facie case, conversation, pretextual, supervised,  yelled, non discriminatory reason, proffered evidence, deposition counsel:  [*1] for dennis zamora, plaintiff: daniela labinoti, law firm of daniela labinoti pc, el paso, tx usa.'