This notebook re-scrapes the PDF URLs and round-trips the extracted text through different encodings, to see whether that fixes the garbled characters in the scraped text.

In [None]:
import requests
from requests.exceptions import HTTPError
import fitz # imports PyMuPDF
import pandas as pd
import numpy as np

## Helper functions

In [36]:
# PDF text scraper. Note: the helper functions further down call `pdf_scraper_new`, not this function.
def pdf_scraper(url: str, max_pages: int = 7) -> str:
    """ Returns the text of the PDF found at the given URL.

    Params:
        - url: URL of the PDF, or the sentinel string "No articles found"
        - max_pages: maximum number of leading pages whose text is extracted

    Returns:
        - "No articles found" if `url` is that sentinel (no request is sent)
        - "Access denied" if the request raises an HTTPError
        - otherwise the concatenated text of the first `max_pages` pages, prefixed
          with "Too_Long" when the PDF has more than `max_pages` pages. The result
          may be an empty string if no text could be extracted.

    Only HTTPError is caught; any other exception raised by the request or by
    PyMuPDF propagates to the caller.
    """

    if url == "No articles found":
        return "No articles found"

    try:
        response = requests.get(url) # Response object created by sending a GET request to the URL
        response.raise_for_status() # Raises HTTPError, if one occurred.
    except HTTPError:
        return "Access denied"

    # `response.content` holds the PDF as bytes; "pdf" tells PyMuPDF the type of the stream.
    # The `with` block closes the document once its text has been read, so repeated calls
    # do not accumulate open documents.
    with fitz.open("pdf", response.content) as doc:
        page_count = len(doc)
        # Mark truncated documents so the caller can tell the text is incomplete.
        text_parts = ["Too_Long"] if page_count > max_pages else []
        text_parts.extend(doc[k].get_text() for k in range(min(page_count, max_pages)))

    return "".join(text_parts)

In [None]:
def pdf_scraper_new(url: str, max_pages: int = 20) -> str:
    """ Returns the text of the PDF found at the given URL.

    Params:
        - url: URL of the PDF, or the sentinel string "No articles found"
        - max_pages: maximum number of leading pages whose text is extracted

    Returns:
        - "No articles found" if `url` is that sentinel (no request is sent)
        - "Access denied" if the request raises an HTTPError
        - "No text found" if the extracted text is empty
        - otherwise the concatenated text of the first `max_pages` pages, prefixed
          with "Too_Long" when the PDF has more than `max_pages` pages

    Only HTTPError is caught; any other exception raised by the request or by
    PyMuPDF propagates to the caller.
    """

    if url == "No articles found":
        return "No articles found"

    try:
        response = requests.get(url)
        response.raise_for_status()  # raises HTTPError for an error status code
    except HTTPError:
        return "Access denied"

    # `response.content` holds the PDF as bytes; "pdf" tells PyMuPDF the type of the stream.
    # The `with` block closes the document once its text has been read, so scraping many
    # URLs in a loop does not accumulate open documents.
    with fitz.open("pdf", response.content) as doc:
        page_count = len(doc)
        # Mark truncated documents so the caller can tell the text is incomplete.
        text_parts = ["Too_Long"] if page_count > max_pages else []
        text_parts.extend(doc[k].get_text() for k in range(min(page_count, max_pages)))

    text = "".join(text_parts)

    if len(text) == 0:
        return "No text found"
    return text

In [38]:
def test_encodings(url: str, encodings: list[str]):
    """ Given a URL that contains a PDF, test different encodings on the text in the PDF, to see which one is correct. 

    Params:
        - url: URL to the PDF
        - encodings: list of valid text encodings to try
    """

    text = pdf_scraper_new(url) # get text in pdf
 
    # Try all encodings
    for enc in encodings:
        try:
            # text.encode() encodes a string and returns bytes
            # text.encode(errors = 'replace') replaces a character that cannot be encoded with a question mark
            simulated = text.strip().encode(enc, errors='backslashreplace').decode(enc, errors='backslashreplace')  # the text after trying a certain encoding
            print(f"\n{enc}:")
            print(simulated)

            return simulated

        except Exception as e:
            print(f"Exception thrown for {enc}: {e}")

In [None]:
def decode_with_one_encoding(url: str, encoding: str) -> str:
    """ Scrape the PDF at `url` and round-trip its text through a single encoding.

    The stripped text is encoded to bytes with the 'namereplace' error handler and those
    bytes are decoded again with the same encoding. Characters the encoding cannot
    represent therefore come back as escape sequences (e.g. `\\N{...}` or `\\uXXXX`).

    Params:
        - url: URL to the PDF (passed to `pdf_scraper_new`)
        - encoding: name of the text encoding to round-trip through

    Returns:
        The round-tripped text. If the round trip raises, the exception is printed and
        the function falls through, returning None.
    """

    raw_text = pdf_scraper_new(url) # get text in pdf
    try:
        encoded_bytes = raw_text.strip().encode(encoding, errors='namereplace')
        return encoded_bytes.decode(encoding, errors='namereplace')
    except Exception as e:
        print(f"Exception thrown for {encoding}: {e}")

## Load & examine data

In [40]:
# Try this code on some of the URLs in the `cleaned_police_reports.csv` dataset.
# The file is read via a relative path, i.e. from the notebook's current working directory.
df = pd.read_csv("cleaned_police_reports.csv")
df.head()  # preview the first rows

Unnamed: 0,name,department,url,text
0,"Andrew Allen, badge #37",Minneapolis Police Department,https://d3n8a8pro7vhmx.cloudfront.net/cuapb/pa...,Home Legislative File 2021-01132 RCA Legal s...
1,"Guled Abdullahi, badge #706",Hennepin County Sheriff's Department,https://assets.nationbuilder.com/cuapb/pages/1...,"Hennepin County 300 South Sixth Street, Minne..."
2,"Dean V. Albers, badge #None",Goodhue County Sheriff's Department,https://d3n8a8pro7vhmx.cloudfront.net/cuapb/pa...,"2/22/2021 Jenson v. Craft, Civil No. 01-1488(D..."
3,"Scott Aikins, badge #22",Minneapolis Police Department,https://d3n8a8pro7vhmx.cloudfront.net/cuapb/pa...,"2/22/2021 United States v. Diriye, Case No. 14..."
4,"Matthew Aish, badge #None",Columbia Heights Police Department,https://d3n8a8pro7vhmx.cloudfront.net/cuapb/pa...,Too_Long1 Arbitration LELS (Mathew Aish)/...


In [42]:
# Reduce the `url` column to its distinct values.
unique_urls = df["url"].unique() # all unique URLs in the dataframe
unique_urls[:5]  # show the first five URLs

array(["https://d3n8a8pro7vhmx.cloudfront.net/cuapb/pages/1462/attachments/original/1638637610/RCA-2021-01206_-_Legal_settlement_Workers'_Compensation_claim_of_Andrew_Allen.pdf?1638637610",
       'https://assets.nationbuilder.com/cuapb/pages/1473/attachments/original/1671254309/20211221_Abdullahi.pdf?1671254309',
       'https://d3n8a8pro7vhmx.cloudfront.net/cuapb/pages/270/attachments/original/1618004460/Jenson_v._Craft__Civil_No._01-1488%28DSD_JMM%29___Casetext_Search___Citator.pdf?1618004460',
       'https://d3n8a8pro7vhmx.cloudfront.net/cuapb/pages/270/attachments/original/1626647366/United_States_v._Diriye__Case_No._14-cr-236%281%29_%28JNE_TNL%29___Casetext_Search___Citator.pdf?1626647366',
       'https://d3n8a8pro7vhmx.cloudfront.net/cuapb/pages/270/attachments/original/1632189461/2020-Matthew_Aish-Columbia_Heights.pdf?1632189461'],
      dtype=object)

In [96]:
unique_urls.shape  # shape of the array of distinct URLs

(1402,)

## Find all Unicode characters that may be problematic

Idea: replace all problematic Unicode characters with the appropriate ASCII character, so that the LLM and NER model can properly read the text.

In [131]:
# One slot per unique URL; each slot receives that URL's text after a latin-1 round trip.
decoded_texts = np.empty(unique_urls.shape, dtype="object")

# Round-tripping through latin-1 turns characters latin-1 cannot represent into escape
# sequences, which makes the potentially problematic Unicode characters easy to find.
for i, url in enumerate(unique_urls):
    decoded_texts[i] = decode_with_one_encoding(url, "latin-1")

In [182]:
decoded_df = pd.DataFrame(decoded_texts, columns=["decoded_text"])

# For every text, collect all escape sequences of the form `\uXXXX` (a literal backslash,
# "u" and exactly four hex digits). The result is a Series with one list per row; rows
# without any escape sequence get an empty list. Requiring hex digits avoids matching
# unrelated text that merely contains a backslash followed by "u".
all_unicodes = decoded_df["decoded_text"].str.findall(r"\\u[0-9a-fA-F]{4}")
all_unicodes

0                                        [\uf105, \uf105]
1                                                      []
2                                                      []
3                                                      []
4       [\uf0b7, \uf0b7, \uf0b7, \uf0b7, \uf0b7, \uf0b...
                              ...                        
1397                                                   []
1398                                                   []
1399                                                   []
1400                                                   []
1401                                                   []
Name: decoded_text, Length: 1402, dtype: object

In [183]:
# Keep only the rows whose list holds at least one escape sequence.
# `.str.len()` returns the list length per row and NaN for missing rows (a text that could
# not be decoded), so missing rows are dropped as well instead of raising a TypeError.
all_unicodes = all_unicodes[all_unicodes.str.len() > 0]

Code points in the Unicode "Private Use Area" are reserved for private use: organizations can assign their own custom characters to them. The Unicode Standard itself does not define which character such a code point represents.

In [184]:
all_unicodes  # display the Series of escape-sequence lists

0                                        [\uf105, \uf105]
4       [\uf0b7, \uf0b7, \uf0b7, \uf0b7, \uf0b7, \uf0b...
9       [\u0cd4, \u0cd1, \u0cc5, \u0cc9, \uf653, \uf64...
14                                       [\uf105, \uf105]
15               [\ue111, \uf09a, \uf09a, \uf099, \uf099]
                              ...                        
1378                                             [\ue111]
1383    [\u0cd3, \u0cc5, \u0cc5, \ue053, \uf644, \uf64...
1385                                     [\uf105, \uf105]
1389    [\u0cc5, \uf645, \uf647, \uf645, \uf649, \ue05...
1393    [\uf645, \uf643, \uf643, \uf649, \uf645, \uf64...
Name: decoded_text, Length: 278, dtype: object

In [185]:
# Flatten the per-text lists into a single list of every escape sequence found in the
# scraped texts (duplicates included).
unicode_chars_list = [escape for escapes in all_unicodes for escape in escapes]

In [186]:
# De-duplicate the escape sequences and sort them, giving a plain Python list of strings.
unique_unicodes = sorted(set(unicode_chars_list))

In [187]:
# Print every distinct escape sequence on its own line.
for item in unique_unicodes:
    print(item)

\u0cc5
\u0cc9
\u0cce
\u0ccf
\u0cd1
\u0cd2
\u0cd3
\u0cd4
\u0d80
\u0e5f
\ue004
\ue006
\ue053
\ue111
\ue258
\ue600
\ue601
\ue603
\ue61d
\ue800
\ue801
\ue804
\ue809
\ue80a
\ue900
\ue903
\ue904
\ue906
\ue907
\ue90b
\ue90c
\ue980
\ue99a
\ue9c6
\uf002
\uf003
\uf006
\uf017
\uf02c
\uf02f
\uf030
\uf039
\uf03a
\uf04b
\uf05a
\uf067
\uf071
\uf081
\uf082
\uf095
\uf097
\uf099
\uf09a
\uf09e
\uf0a2
\uf0a4
\uf0a7
\uf0a8
\uf0a9
\uf0b7
\uf0c1
\uf0c9
\uf0da
\uf0e0
\uf0e1
\uf102
\uf104
\uf105
\uf106
\uf107
\uf108
\uf111
\uf13b
\uf144
\uf167
\uf16a
\uf16d
\uf19c
\uf1ea
\uf202
\uf204
\uf207
\uf209
\uf214
\uf222
\uf24e
\uf39e
\uf3d9
\uf400
\uf40a
\uf410
\uf419
\uf469
\uf4e7
\uf4e8
\uf4ef
\uf608
\uf642
\uf643
\uf644
\uf645
\uf646
\uf647
\uf648
\uf649
\uf64a
\uf64b
\uf64c
\uf653


Code points U+E000 through U+F8FF make up the "Private Use Area". Because they have no standard meaning, we can replace their escape sequences with empty strings.

In [188]:
len(unique_unicodes)  # number of distinct escape sequences

109

In [189]:
decoded_df  # display the decoded texts

Unnamed: 0,decoded_text
0,Home \uf105Legislative File 2021-01132 \uf105R...
1,"Hennepin County \n300 South Sixth Street, Minn..."
2,"2/22/2021\nJenson v. Craft, Civil No. 01-1488(..."
3,"2/22/2021\nUnited States v. Diriye, Case No. 1..."
4,Too_Long1 \n \nArbitration \nLELS (Mathew Ais...
...,...
1397,INTERNAL AFFAIRS CASE NUMBER: \n14-01771 \nMIN...
1398,MINNEAPOLIS POLICE DEPARTMENT \nINTERNAL AFFAI...
1399,"Cook, Joni \n,:- --Irom: \nGlampe, Travis \n....."
1400,MINNEAPOLIS POLICE DEPARTMENT \nInternal Affai...


In [None]:
# Replace all "Private Use" characters w/ an empty string.
# In the decoded text these appear as literal escape sequences such as `\uf105`, so the
# pattern matches a backslash, "u" and four lowercase hex digits in the range e000-f8ff
# (the Private Use Area); escapes from f900 upwards are left alone.
# `regex=True` is passed explicitly because the default of `str.replace`'s `regex`
# argument differs between pandas versions, and a bare `\uf105` interpreted as a regex
# would match the actual character instead of the literal escape text.
PRIVATE_USE_ESCAPE = r"\\u(?:e[0-9a-f]{3}|f[0-8][0-9a-f]{2})"
decoded_df["decoded_text"] = decoded_df["decoded_text"].str.replace(PRIVATE_USE_ESCAPE, "", regex=True)

In [191]:
decoded_df  # display the decoded texts again

Unnamed: 0,decoded_text
0,Home Legislative File 2021-01132 RCA\nLegal se...
1,"Hennepin County \n300 South Sixth Street, Minn..."
2,"2/22/2021\nJenson v. Craft, Civil No. 01-1488(..."
3,"2/22/2021\nUnited States v. Diriye, Case No. 1..."
4,Too_Long1 \n \nArbitration \nLELS (Mathew Ais...
...,...
1397,INTERNAL AFFAIRS CASE NUMBER: \n14-01771 \nMIN...
1398,MINNEAPOLIS POLICE DEPARTMENT \nINTERNAL AFFAI...
1399,"Cook, Joni \n,:- --Irom: \nGlampe, Travis \n....."
1400,MINNEAPOLIS POLICE DEPARTMENT \nInternal Affai...


In [193]:
# Inspect the complete text stored in the first row.
decoded_df.iloc[0].values[0]

"Home Legislative File 2021-01132 RCA\nLegal se\x01lement: Workers' Compensa\x02on claim of Andrew Allen\n(RCA-2021-01206)\nORIGINATING DEPARTMENT\nFinance & Property Services\nTo Commi\x01ee(s)\n#\nCommi\x01ee Name\nMee\x02ng Date\n1\nPolicy & Government Oversight\nCommi\x01ee\nOct 20, 2021\nLEAD\nSTAFF:\nEmily Ann Colby\nPRESENTED\nBY:\nEmily Ann Colby\nAc\x02on Item(s)\n#\nFile Type\nSubcategory\nItem Descrip\x02on\n1\nAc\x02on\nSe\x01lement\nApproving the se\x01lement of the Workers'\nCompensa\x02on claim of Andrew Allen, by payment\nof $170,000 to Andrew Allen and a\x01orney,\nMeuser Law Firm, and authorizing the City\nA\x01orney's O\\N{LATIN SMALL LIGATURE FFI}ce to execute any documents\nnecessary to e\\N{LATIN SMALL LIGATURE FF}ectuate the se\x01lement.\nPrevious Ac\x02ons\nNone\nRCA-2021-01206 - Legal settlement: Workers' Compensation claim of ...\nhttps://lims.minneapolismn.gov/RCA/8755\n1 of 2\n12/4/2021, 1:33 AM\nWard / Neighborhood / Address\n#\nWard\nNeighborhood\nAddress

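Looking at the first row, we got rid of the `\uXXXX` escape sequences! But the text still contains control characters, shown as hexadecimal escapes such as `\x01` and `\x02`, in places where letters are missing (e.g. "se\x01lement", "Compensa\x02on"), as well as named escapes such as `\N{LATIN SMALL LIGATURE FFI}`.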

In [None]:
# Potential BUG: Maybe I shouldn't decode Unicode text with a non-Unicode encoding. 