# **01_PREPROCESSING**

Summary:


1.   Import and Normalization
2.   Split Opinions into Subjects of Interest
3.   Text Cleaning





---

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
sys.path.append('/content/drive/My Drive/Università/inforet_prj/')

In [3]:
!pip install -U spacy unidecode



In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[K     |████████████████████████████████| 13.6 MB 80 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
import lzma, json
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import spacy
import string
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from unidecode import unidecode
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
sns.set()
tqdm.pandas()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
nlp = spacy.load("en_core_web_sm")

## 1. Import and Normalization

### *1.1 Data Import*


**NB**: run the 3 cells below only if on Google Colab. Otherwise skip them and download the compressed data manually from https://api.case.law/v1/bulk/22341/download/

In [None]:
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('chromedriver', options=chrome_options)
wd.get("https://case.law/bulk/download/")

In [None]:
wd.find_element_by_xpath("/html/body/div/main/div/div/div[2]/div/div[2]/div/div[2]/a").click()

In [None]:
!unzip Illinois-20200302-text.zip
!mv Illinois-20200302-text/data/data.jsonl.xz data.jsonl.xz
!rm -r Illinois-20200302-text
!rm Illinois-20200302-text.zip

### *1.2 Data Normalization*

Creation of opinions, citations and df

In [None]:
# We know that there will be 183146 items,
# so we set this manually since tqdm will not
# be able to display a progress bar when reading from
# a file.
pbar = tqdm(total=183146)

# Read directly from the compressed file.
# We will create a list where each element is a line
# of the file, which in turns is a json
# (casted in python as a dict).
with lzma.open("data.jsonl.xz") as f:
    cases = []

    for line in f:
        cases.append(json.loads(str(line, 'utf8')))
        pbar.update(1)

    pbar.close()

100%|██████████| 183146/183146 [01:23<00:00, 2183.15it/s]


In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.json_normalize.html
df = pd.json_normalize(cases)

In [None]:
del cases

In [None]:
# Flattens the list of attorneys to a single string
# with ; as separator
df["casebody.data.attorneys"] = df.apply(lambda x: "; ".join(x["casebody.data.attorneys"]), axis=1)

In [None]:
"""
Each element of the columns 'citations' and 'casebody.data.opinions' is
a list, and in turn each element of the list is a json object.
This means that we need to unravel those column to have a more "flatten"
version (like a simple table, eg. a DataFrame).
The approach shown here consists of creating two different DataFrames
that will contain data from the two columns. In order to preserve the
association of each row of the new DataFrame with the corresponding data
in the original DataFrame, we will add to each json a new key called "id"
that will have the original row number as value.
"""

def add_id_todict(x, col):
    vals = x[col]

    for i, elem in enumerate(vals):
        d = elem
        d["id"] = x.name
        vals[i] = d

    return vals

In [None]:
df["casebody.data.opinions"] = df.apply(lambda x: add_id_todict(x, "casebody.data.opinions"), axis=1)
df["citations"] = df.apply(lambda x: add_id_todict(x, "citations"), axis=1)

In [None]:
# For clarity, let's also add the "id" column to the original df
df["id"] = df.index.values

In [None]:
# We merge each element in the "citations" column (which is a list)
# to a single list called "citations".
#
# Using list comprehension instead of df["column"].sum()
# because the latter is slow for large df. See:
# https://stackoverflow.com/a/51576777
citations = [item for x in df["citations"] for item in x]
df.drop(columns=["citations"], inplace=True)

In [None]:
# Same for the opinions column
opinions = [item for x in df["casebody.data.opinions"] for item in x]
df.drop(columns=["casebody.data.opinions"], inplace=True)

In [None]:
# Let's now get the flattened table from the citations
# and from the opinions
citations_df = pd.json_normalize(citations)

In [None]:
opinions_df = pd.json_normalize(opinions)

We now have 3 dataframes that can be joined using the "id" column.

In [None]:
df['year'] = pd.to_datetime(df['decision_date']).apply(lambda x: x.year)
opinions_df = pd.merge(opinions_df, df[['year','id']], on="id", how="left")

### *1.3 Serialize data*


In [None]:
with open("/content/drive/MyDrive/Università/inforet_prj/df.pkl", "wb") as f:
    pickle.dump(df, f)

In [None]:
with open("/content/drive/MyDrive/Università/inforet_prj/citations.pkl", "wb") as f:
    pickle.dump(citations_df, f)

In [None]:
with open("/content/drive/MyDrive/Università/inforet_prj/opinions.pkl", "wb") as f:
    pickle.dump(opinions_df, f)

In [None]:
del df
del citations_df
del opinions
del citations
del opinions_df

In [None]:
import gc
gc.collect()

253

---

## **2. Split Opinions into Subjects of Interest**

We divide into 3 groups rows based on the lists of terms provided for each subject of interest: narcotics, weapons and investigation.

In [None]:
with open("/content/drive/MyDrive/Università/inforet_prj/opinions.pkl", "rb") as f:
  opinions_df = pickle.load(f)

In [None]:
opinions_df["text"] = opinions_df["text"].str.replace("|", " ")

In [None]:
opinions_df.author = opinions_df.author.fillna("")
array = opinions_df["author"].progress_apply(lambda x: nltk.word_tokenize(x.lower()))

authors_judges = []

for op in array:
    for token in op:
        if token.isalpha() and len(token) > 1:
            authors_judges.append(token)

authors_judges = set(authors_judges)

100%|██████████| 194366/194366 [00:25<00:00, 7742.05it/s]


In [None]:
with open("authors_judges.pkl", "wb") as f:
    pickle.dump(authors_judges, f)

In [None]:
!cp authors_judges.pkl /content/drive/MyDrive/Università/inforet_prj

In [None]:
def typo(text):
    cleaned_text = (
        text.replace('cannabi ','cannabis ')
        .replace('lysergic acid diethylamide', 'lsd')
        .replace('methylenedioxymethamphetamine', 'mdma')
        .replace('MDMA', 'mdma')
        .replace('methylenedioxyamphetamine', 'mda')
        .replace('ciacetyl','diacetyl')
        .replace(' nar cotic', ' narcotic')
        .replace(' fi ','')
        )
    return cleaned_text

In [None]:
opinions_df['text'] = opinions_df.text.progress_apply(lambda x: typo(x))
#typo(narco_data.lemmatized[30])

100%|██████████| 194366/194366 [00:16<00:00, 11851.36it/s]


In [None]:
#narcotics = ["cannabis", "cocaine", "methamphetamine", "drug", "marijuana","heroin", "fentanyl", "mdma", "lsd", "ketamine", "modafinil", "provigil", "adderall", "methylphenidate", "memantine", "axura", "anabolic" , "steroids",  "testosterone"]
narcotics = ["cannabis",  "marijuana",  "lsd", "heroin", 'methaqualone', "ecstasy", "mdma", "cocaine", "cocaine", "methamphetamine", "hydromorphone", "dilaudid", "meperidine", "demerol", "oxycodone", "dexedrine", "fentanyl", "ritalin", "methadone", "amphetamine", "phencyclidine", "ephedrine"]
#narcotics = [ "cannabis",  "marijuana",  "lsd", "heroin", 'methaqualone', "ecstasy", "peyote", "mescaline", "mda", "mdma", "cocaine", "methamphetamine", "hydromorphone", "dilaudid", "meperidine", "demerol", "oxycodone", "dexedrine", "fentanyl", "ritalin", "methadone", "amphetamine", "phencyclidine", "pseudoephedrine", "ephedrine", "meth", "opium", "dilaudid", "preludin","ketamine", "anabolic" , "steroids",  "testosterone", "ketamine", "modafinil", "provigil", "adderall", "methylphenidate", "memantine", "axura", "soma", "xanax", "darvon", "darvocet", "valium", "ativan", "talwin", "ambien", "tramadol",  "ethclorvynol","phenylpropanolamine", "lomotil", "motofen", "lyrica", "parepectolin", "tetracaine"]
weapons = ["gun", "knife", "weapon", "firearm", "rifle", "carabine", "shotgun", "assaults rifle", "sword", "blunt objects"]
investigations = ["gang", "mafia", "serial killer", "rape", "thefts", "recidivism", "arrest", "ethnicity", "caucasian", "afroamerican", "native american", "hispanic", "gender", "male", "female", "man", "woman", "girl", "boy", "robbery", "cybercrime"]

In [None]:
narco_df = opinions_df.loc[opinions_df['text'].str.contains("|".join(narcotics)).any(level=0)] # 35410 rows / 6076  / 11038

In [None]:
narco_df

Unnamed: 0,type,text,author,id,year
2,majority,CHIEF JUSTICE HEIPLE\ndelivered the opinion of...,CHIEF JUSTICE HEIPLE,1,1997
8,majority,JUSTICE BILANDIC\ndelivered the opinion of the...,JUSTICE BILANDIC,5,1997
11,majority,JUSTICE ZWICK\ndelivered the opinion of the co...,JUSTICE ZWICK,8,1997
51,majority,PRESIDING JUSTICE GREIMAN\ndelivered the opini...,PRESIDING JUSTICE GREIMAN,44,1997
63,majority,PRESIDING JUSTICE CAHILL\ndelivered the opinio...,PRESIDING JUSTICE CAHILL,55,2000
...,...,...,...,...,...
194288,concurrence,"JUSTICE McNAMARA,\nspecially concurring:\nI ag...","JUSTICE McNAMARA,",183077,1986
194307,majority,PRESIDING JUSTICE WOLFSON\ndelivered the opini...,PRESIDING JUSTICE WOLFSON,183094,2007
194322,majority,Mr. PRESIDING JUSTICE GOLDBERG\ndelivered the ...,Mr. PRESIDING JUSTICE GOLDBERG,183108,1976
194341,majority,PRESIDING JUSTICE QUINN\ndelivered the opinion...,PRESIDING JUSTICE QUINN,183124,2006


In [None]:
narco_df.to_csv("narco_df.csv", index=False, sep="|")

In [None]:
!cp narco_df.csv /content/drive/MyDrive/Università/inforet_prj

In [None]:
del opinions_df
del authors_judges

In [None]:
import gc

In [None]:
gc.collect()

377

---

## **3. Text Cleaning**
Load Opinions from the previous step.

In [7]:
with open("/content/drive/MyDrive/Università/inforet_prj/authors_judges.pkl", "rb") as f:
    authors_judges = pickle.load(f)

In [8]:
# Proper nouns found in the dataset
names = ["Brinks", "Flores", "People v.","Pinnix", "Garvey", "Steinbach", "Fowlar", "Mobil", "Milian", "TQ", "Yanez", "Tawanda", "Geder", "Mason", "Payne", "Bair", "ILCS",  "tbe", "tbat", "Delores","Stivers", "Spades", "Snyders", "Nally", "Budaj", "Yacoo", "Cosgrove", "Cos-grove", "Gayles", "Hodges"]

In [9]:
def full_text_clean(text, is_sentence=False):
    if text == '' or pd.isna(text) or not isinstance(text, str):
        return ''

    bb = (
        text.replace(' U.S. ','US')
        .replace(' S.Ct. ','SCt')
        .replace(' f. supp. ', ' fsupp ')
        .replace(' cir.', ' cir ')
        .replace("[o]", "o")
        .replace(" CIR ", " confidential source ")
        .replace("Reg.", " regulation ")
        .replace("miIe", " mile ")
        .replace(" com mitted ", " committed ")
        .replace("wtap", "tap")
        )
    
    if bb.strip() == '' or pd.isna(bb):
        return ''

    if not ' ' in bb:
        return ''

    temp = bb.split()
    bb = " ".join([ele for ele in temp if not ele[0].isupper()])
    
    if not is_sentence:
        bb = bb.split(":")
        if len(bb) > 1:
            bb.pop(0)
        bb = ' '.join(bb)

    if bb.strip() == '' or pd.isna(bb):
        return ''

    bb = unidecode(re.sub(' +', ' ', bb.strip())) #any additional whitespaces and foreign characters
    bb = bb.strip()
    bb = re.sub('[0-9]{1,2} [Uu]\.[Ss]\.[Cc]\. §\s?\d+(\w+)?( \([0-9]{4}\))?',' USCCITATION ', bb)
    bb = re.sub('[a-zA-Z]+ [vV]\. [a-zA-Z]+',' CaseAvCaseB ', bb) #CaseA v. CaseB = CaseAvCaseB
    bb = re.sub('\d+ (Ark|Ill)\. \d+',' StateCase ', bb) #300 Ark. 230 = 300Ark230
    bb = re.sub(' [Ss][Tt][Aa][Tt][Ss]\.',' StateCase2 ',bb) #300 Ark. 230 = 300Ark230
    bb = re.sub('\d+ [A-z]+\.[ ]*[A-z]+\.[ ]*\d[A-z]+ \d+',' CaseRef ',bb) #953 S.W.2d 559 or 87 L.Ed.2d 481
    bb = re.sub('[Jj][Rr]\.', 'Jr ', bb)
    bb = re.sub('\d+ (Ark|Ill)\. App. \d+',' StateAppCase ', bb)
    bb = re.sub('(Ark|Ill)\. Code Ann\. § ',' StateCodeSection ', bb)
    bb = re.sub(' [Ii][Dd]\.',' Idem ', bb)
    bb = re.sub('§+',' Section ', bb)
    bb = re.sub('[Aa][Nn][Nn][Oo][:.]* \d+ [Aa]\.*[ ]*[Ll]\.*[ ]*[Rr]\.*[ ]*\d+','anno', bb)
    bb = re.sub(' [Aa][Nn][Nn][Oo][:.]*',' anno', bb)
    bb = re.sub('[Cc][Ff]\.','cf', bb)
    bb = re.sub(' [Rr][Ee][Vv]\. [Ss][Tt][Aa][Tt]\.',' revstat ', bb)
    bb = re.sub('[ \d]+[Pp][Aa][Rr]\.',' par ', bb)
    bb = re.sub('[ \d]+[Ss][Tt][Aa][Tt]\.',' stat ', bb)
    bb = re.sub("[\(\[].*?[\)\]]", "", bb)

    
    bb = (
        bb.replace("USCCITATION", "")
        .replace("CaseAvCaseB", "")
        .replace("StateCase", "")
        .replace("StateCase2", "")
        .replace("CaseRef", "")
        .replace("StateAppCase", "")
        .replace("StateCodeSection", "")
        .replace("anno", "")
    )

    bb = unidecode(re.sub(' +', ' ', bb.strip()))
    bb = bb.strip()

    if bb.strip() == '' or pd.isna(bb):
        return ''    

    doc = nlp(bb)
    persons = set([str(ent.text).lower() for ent in doc.ents if ent.label_ == "PERSON"])
    persons = [x.translate(str.maketrans('', '', string.punctuation)) for x in set(nltk.word_tokenize(" ".join(persons)))]
    persons.extend(names)

    result = []
    for token in doc:
        if (len(token.text) > 1 
            and token.text.isalpha() # Token is word
            and token.pos_ not in ['NUM', 'PROPN']  # Token not NUM, PROPN nor ADV,, , 'ADV', 'PRON', 'CONJ'
            and not token.is_punct # Token not punctuation
            and not token.is_stop # Token not punctuation
            and token.text not in authors_judges # Token is not a judge
            and token.text not in persons # Token is not a persona name
        ):

            result.append(token.lemma_.lower())
    
    # Our result is a string of the form:
    # "text lemma POS; text lemma POS; text lemma POS; ..."
    result = " ".join(result)
    
    return result

In [None]:
# 5 H
with open("narco_nlp_21set_nostop.csv", "w") as my_empty_csv:
    pass

pbar = tqdm(total=6076 ) # narco_df total rows
chunksize = 1

for chunk in pd.read_csv("narco_df.csv", chunksize=chunksize, sep="|", usecols=["text"]):
    chunk['spacy_nlp'] = chunk.apply(lambda row: full_text_clean(row["text"]), axis=1)
    chunk.drop(columns=["text"], inplace=True)
    chunk.to_csv("narco_nlp_21set_nostop.csv", index=False, sep="|", mode="a", header=False)

    pbar.update(1)

pbar.close()

100%|██████████| 6076/6076 [57:57<00:00,  1.75it/s]


In [None]:
!cp narco_nlp_21set_nostop.csv /content/drive/MyDrive/Università/inforet_prj

Check that the cleaning was ok

In [None]:
narco_nlp = pd.read_csv(
    "/content/drive/MyDrive/Università/inforet_prj/narco_nlp_21set_nostop.csv",
    sep="|",
    names=['spacy_nlp'],
    header=None
)

In [None]:
assert narco_nlp.shape[0] > 0

In [None]:
assert narco_nlp.loc[pd.isna(narco_nlp.spacy_nlp)].shape[0] == 0

## **SENTENCES**

In [None]:
opinions_df = pd.read_csv("/content/drive/MyDrive/Università/inforet_prj/narco_df.csv", sep="|")
opinions_df

Unnamed: 0,type,text,author,id,year
0,majority,CHIEF JUSTICE HEIPLE\ndelivered the opinion of...,CHIEF JUSTICE HEIPLE,1,1997
1,majority,JUSTICE BILANDIC\ndelivered the opinion of the...,JUSTICE BILANDIC,5,1997
2,majority,JUSTICE ZWICK\ndelivered the opinion of the co...,JUSTICE ZWICK,8,1997
3,majority,PRESIDING JUSTICE GREIMAN\ndelivered the opini...,PRESIDING JUSTICE GREIMAN,44,1997
4,majority,PRESIDING JUSTICE CAHILL\ndelivered the opinio...,PRESIDING JUSTICE CAHILL,55,2000
...,...,...,...,...,...
6071,concurrence,"JUSTICE McNAMARA,\nspecially concurring:\nI ag...","JUSTICE McNAMARA,",183077,1986
6072,majority,PRESIDING JUSTICE WOLFSON\ndelivered the opini...,PRESIDING JUSTICE WOLFSON,183094,2007
6073,majority,Mr. PRESIDING JUSTICE GOLDBERG\ndelivered the ...,Mr. PRESIDING JUSTICE GOLDBERG,183108,1976
6074,majority,PRESIDING JUSTICE QUINN\ndelivered the opinion...,PRESIDING JUSTICE QUINN,183124,2006


In [None]:
schedule_1 = ["cannabis",  "marijuana",  "lsd", "heroin", 'methaqualone', "ecstasy", "peyote", "mescaline", "mda", "mdma"] #https://www.dea.gov/drug-information/drug-scheduling
schedule_2 = ["cocaine", "methamphetamine", "hydromorphone", "dilaudid", "meperidine", "demerol", "oxycodone", "dexedrine", "fentanyl", "ritalin", "methadone", "amphetamine", "phencyclidine", "pseudoephedrine", "ephedrine", "meth", "opium", "dilaudid", "preludin"]
schedule_3 = ["ketamine", "anabolic" , "steroids",  "testosterone", "ketamine"]
schedule_4 = ["modafinil", "provigil", "adderall", "methylphenidate", "memantine", "axura", "soma", "xanax", "darvon", "darvocet", "valium", "ativan", "talwin", "ambien", "tramadol",  "ethclorvynol"]
schedule_5 = ["phenylpropanolamine", "lomotil", "motofen", "lyrica", "parepectolin", "tetracaine"]

In [None]:
conditions = [
    (opinions_df['text'].str.contains("|".join(schedule_1))) & ~(opinions_df['text'].str.contains("|".join(schedule_3))) & ~(opinions_df['text'].str.contains("|".join(schedule_2))) & ~(opinions_df['text'].str.contains("|".join(schedule_4))) & ~(opinions_df['text'].str.contains("|".join(schedule_5))),
    (opinions_df['text'].str.contains("|".join(schedule_2))) & ~(opinions_df['text'].str.contains("|".join(schedule_3))) & ~(opinions_df['text'].str.contains("|".join(schedule_1))) & ~(opinions_df['text'].str.contains("|".join(schedule_4))) & ~(opinions_df['text'].str.contains("|".join(schedule_5))),
    (opinions_df['text'].str.contains("|".join(schedule_3))) & ~(opinions_df['text'].str.contains("|".join(schedule_1))) & ~(opinions_df['text'].str.contains("|".join(schedule_2))) & ~(opinions_df['text'].str.contains("|".join(schedule_4))) & ~(opinions_df['text'].str.contains("|".join(schedule_5))),
    (opinions_df['text'].str.contains("|".join(schedule_4))) & ~(opinions_df['text'].str.contains("|".join(schedule_3))) & ~(opinions_df['text'].str.contains("|".join(schedule_2))) & ~(opinions_df['text'].str.contains("|".join(schedule_1))) & ~(opinions_df['text'].str.contains("|".join(schedule_5)))
    ]

# create a list of the values we want to assign for each condition
values = ['narco_1', 'narco_2', 'narco_3', 'narco_4']

# create a new column and use np.select to assign values to it using our lists as arguments
opinions_df['schedule'] = np.select(conditions, values)

# display updated DataFrame
opinions_df.head()

Unnamed: 0,type,text,author,id,year,schedule
0,majority,CHIEF JUSTICE HEIPLE\ndelivered the opinion of...,CHIEF JUSTICE HEIPLE,1,1997,narco_2
1,majority,JUSTICE BILANDIC\ndelivered the opinion of the...,JUSTICE BILANDIC,5,1997,0
2,majority,JUSTICE ZWICK\ndelivered the opinion of the co...,JUSTICE ZWICK,8,1997,narco_2
3,majority,PRESIDING JUSTICE GREIMAN\ndelivered the opini...,PRESIDING JUSTICE GREIMAN,44,1997,narco_2
4,majority,PRESIDING JUSTICE CAHILL\ndelivered the opinio...,PRESIDING JUSTICE CAHILL,55,2000,0


In [None]:
#narcotics_schedule_1 = ["cannabis",  "marijuana",  "lsd", "heroin", 'methaqualone', "ecstasy", "mdma"]

In [None]:
#narcotics_schedule_1 = ["cannabis",  "marijuana", "mdma", "lsd", "heroin", "cannabis"]

In [None]:
narco_1_pmi = opinions_df.loc[:, ["schedule", "text"]]

In [None]:
narco_1_pmi = narco_1_pmi.loc[narco_1_pmi['schedule'] == 'narco_1'] # 1969

In [None]:
narco_1_pmi

Unnamed: 0,schedule,text,sentences
8,narco_1,JUSTICE COLWELL\ndelivered the opinion of the ...,[JUSTICE COLWELL\ndelivered the opinion of the...
17,narco_1,PRESIDING JUSTICE KUEHN\ndelivered the opinion...,[PRESIDING JUSTICE KUEHN\ndelivered the opinio...
20,narco_1,PRESIDING JUSTICE HOLDRIDGE\ndelivered the opi...,[PRESIDING JUSTICE HOLDRIDGE\ndelivered the op...
26,narco_1,JUSTICE MICHELA\ndelivered the opinion of the ...,[JUSTICE MICHELA\ndelivered the opinion of the...
27,narco_1,PRESIDING JUSTICE HOPKINS\ndelivered the opini...,[PRESIDING JUSTICE HOPKINS\ndelivered the opin...
...,...,...,...
6055,narco_1,JUSTICE McCULLOUGH\ndelivered the opinion of t...,[JUSTICE McCULLOUGH\ndelivered the opinion of ...
6057,narco_1,CHIEF JUSTICE BILANDIC\ndelivered the opinion ...,[CHIEF JUSTICE BILANDIC\ndelivered the opinion...
6062,narco_1,Mr. JUSTICE HOPE\ndelivered the opinion of the...,[Mr. JUSTICE HOPE\ndelivered the opinion of th...
6071,narco_1,"JUSTICE McNAMARA,\nspecially concurring:\nI ag...","[JUSTICE McNAMARA,\nspecially concurring:\nI a..."


In [None]:
from nltk.tokenize import sent_tokenize
narco_1_pmi["sentences"] = narco_1_pmi.text.progress_apply(lambda x: sent_tokenize(x)) 

100%|██████████| 1969/1969 [00:12<00:00, 153.41it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
narco_sentences_1 = narco_1_pmi.explode('sentences')

In [None]:
narco_sentences_1 = narco_sentences_1.drop(columns=["text"]).reset_index().rename(columns={"index": "opinion_id"})

In [None]:
narco_sentences_1

Unnamed: 0,opinion_id,schedule,sentences
0,8,narco_1,JUSTICE COLWELL\ndelivered the opinion of the ...
1,8,narco_1,Defendant contends that the petition demonstra...
2,8,narco_1,Defendant was convicted following a jury trial...
3,8,narco_1,Lawrence Tankus represented defendant during p...
4,8,narco_1,"On August 24, 1995, the court sentenced defend..."
...,...,...,...
239150,6073,narco_1,The case before us deals with reclassification...
239151,6073,narco_1,For these reasons we cannot agree that Bevins ...
239152,6073,narco_1,We conclude that the orders dismissing each of...
239153,6073,narco_1,Judgments reversed; causes remanded.


In [None]:
narco_sentences_1.to_csv("narco_sentences_1.csv", index=False, sep="|")

In [None]:
!cp narco_sentences_1.csv /content/drive/MyDrive/Università/inforet_prj

---

In [None]:
narco_sentences_1 = pd.read_csv("/content/drive/MyDrive/Università/inforet_prj/narco_sentences_1.csv", sep="|")
narco_sentences_1

In [20]:
narco_sentences_1.shape[0]

239155

In [10]:
names = ["Brinks", "Flores", "People v.","Pinnix", "Garvey", "Steinbach", "Fowlar", "Mobil", "Milian", "TQ", "Yanez", "Tawanda", "Geder", "Mason", "Payne", "Bair", "ILCS",  "tbe", "tbat", "Delores","Stivers", "Spades", "Snyders", "Nally", "Budaj", "Yacoo", "Cosgrove", "Cos-grove", "Gayles", "Hodges"]

In [9]:
#narco_sentences_1_sample = narco_sentences_1.sample(5000, random_state=0)

In [10]:
#narco_sentences_1["sent_clean"] = narco_sentences_1.sentences.progress_apply(lambda txt: full_text_clean(txt, True))

In [11]:
#narco_sentences_1.to_csv("narco_sentences_1_sample.csv", index=False, sep="|")

In [12]:
#narco_sentences_1

In [13]:
#!cp narco_sentences_1.csv /content/drive/MyDrive/Università/inforet_prj

In [11]:
# 1h
with open("narco_1_pmi_nlp.csv", "w") as my_empty_csv:
    pass

pbar = tqdm(total=239155) # narco_sentences total rows 239155
chunksize = 1

for chunk in pd.read_csv("/content/drive/MyDrive/Università/inforet_prj/narco_sentences_1.csv", chunksize=chunksize, sep="|", usecols=["sentences"]):
    chunk['sent_clean'] = chunk.apply(lambda row: full_text_clean(row["sentences"], is_sentence=True), axis=1)
    chunk.drop(columns=["sentences"], inplace=True)
    chunk.to_csv("narco_1_pmi_nlp.csv", index=False, sep="|", mode="a", header=False)

    pbar.update(1)

pbar.close()

100%|██████████| 239155/239155 [1:05:31<00:00, 60.83it/s]


In [12]:
!cp narco_1_pmi_nlp.csv /content/drive/MyDrive/Università/inforet_prj

In [None]:
y = narco_1_pmi[:3]
y

In [None]:
y["sent_clean"] = y.sentences.progress_apply(lambda x: full_text_clean_sentences(x)) 

In [None]:
y.iloc[0]["sent_clean"]

In [None]:
!cp narco_1_pmi_nlp.csv /content/drive/MyDrive/Università/inforet_prj