# NB3: Notebook created for processing judgment text

**Output:** Clean dataset with the following variables. 

- [ ] `decision_date`: application date. 
- [ ] `referral_date`: date when the request for a preliminary reference was sent. 
- [ ] `referring_court_name`: ???
- [ ] `full_text`: 
- [ ] `par_location`:
- [ ] 

**Output**: Create script that scrapes updates

## Environment and Settings for Python

In [25]:
# REQUIRED  Libraries
import os 
import sys
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

# ADD the project root (the parent of the current working directory) to
# sys.path so the project-level `paths` module can be imported from here.
# os.path.join(..., "") appends the platform's own separator, which keeps the
# trailing separator of the original value without hard-coding "\\".
parent_dir = os.path.join(os.path.dirname(os.getcwd()), "")
if parent_dir not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(parent_dir)
import paths

# Access paths globally
paths.figures_dir

'c:\\gitprojects\\uio24emc\\figures\\'

## Start Processing Judgment Text

In [22]:
# READ IUROPA Text Corpus Data
# TODO: derive this location from the project root instead of an absolute,
# machine-specific path so the notebook runs on other machines.
iuropa_corpus_path = "C:\\gitprojects\\uio24emc\\raw\\iuropa_text.gz.parquet"

# Load the parquet file once as an Arrow table, then convert it to pandas.
iuropa_corpus = pq.read_table(iuropa_corpus_path)
iuropa_corpus_df = iuropa_corpus.to_pandas()

# The loaded table already carries the file's schema, so reuse it instead of
# opening the parquet file a second time.
schema = iuropa_corpus.schema

In [38]:
# COLUMNS Selector: the corpus columns to keep (the listed output columns
# minus `source`). The list literal was previously left unclosed.
cols_selector = [
    'document_id', 'paragraph_id', 'language', 'ecli', 'court',
    'date', 'year', 'text', 'line_id', 'line_id_prop', 'section',
    'paragraph_type', 'paragraph_number', 'nchar', 'html_class',
    'html_attr',
]

# Keep the column listing as the last expression so the notebook displays it.
iuropa_corpus_df.columns

# BOOLEAN Conditions



Index(['document_id', 'paragraph_id', 'source', 'language', 'ecli', 'court',
       'date', 'year', 'text', 'line_id', 'line_id_prop', 'section',
       'paragraph_type', 'paragraph_number', 'nchar', 'html_class',
       'html_attr'],
      dtype='object')

In [39]:
# Search for rows whose `text` column contains the search term.
# case=False makes the match case-insensitive, regex=False treats the term as
# plain text rather than a pattern, and na=False counts missing text as
# "no match" so the mask is purely boolean.
search_text = "ilovepdf"
text_matches = iuropa_corpus_df['text'].str.contains(
    search_text, case=False, na=False, regex=False
)
matching_rows = iuropa_corpus_df[text_matches]

In [40]:
# Display the corpus rows whose text matched the case-insensitive "ilovepdf" search.
matching_rows

Unnamed: 0,document_id,paragraph_id,source,language,ecli,court,date,year,text,line_id,line_id_prop,section,paragraph_type,paragraph_number,nchar,html_class,html_attr
10748840,T_2024_9_2024_01_17_FR,T_2024_9_2024_01_17_FR_4,cur,FR,ECLI:EU:T:2024:9,General Court,2024-01-17,2024,« Marque de l'Union européenne - Demande de ma...,4,0.034,presentation,keywords,0,336,C71Indicateur,regular_p
10748842,T_2024_9_2024_01_17_FR,T_2024_9_2024_01_17_FR_6,cur,FR,ECLI:EU:T:2024:9,General Court,2024-01-17,2024,"Ilovepdf, SL, établie à Barcelone (Espagne), r...",6,0.05,presentation,paragraph,0,89,C02AlineaAltA,regular_p
10748854,T_2024_9_2024_01_17_FR,T_2024_9_2024_01_17_FR_18,cur,FR,ECLI:EU:T:2024:9,General Court,2024-01-17,2024,1 Par son recours fondé sur l'article 263 TFUE...,18,0.151,grounds,paragraph,1,321,C01PointnumeroteAltN,regular_p
10748856,T_2024_9_2024_01_17_FR,T_2024_9_2024_01_17_FR_20,cur,FR,ECLI:EU:T:2024:9,General Court,2024-01-17,2024,"2 Le 24 octobre 2019, la requérante a présenté...",20,0.168,grounds,paragraph,2,148,C01PointnumeroteAltN,regular_p
10748914,T_2024_9_2024_01_17_FR,T_2024_9_2024_01_17_FR_78,cur,FR,ECLI:EU:T:2024:9,General Court,2024-01-17,2024,"46 La chambre de recours a relevé, aux points ...",78,0.655,grounds,paragraph,46,813,C01PointnumeroteAltN,regular_p
10748915,T_2024_9_2024_01_17_FR,T_2024_9_2024_01_17_FR_79,cur,FR,ECLI:EU:T:2024:9,General Court,2024-01-17,2024,47 La requérante soutient que le signe ILOVEPD...,79,0.664,grounds,paragraph,47,566,C01PointnumeroteAltN,regular_p
10748920,T_2024_9_2024_01_17_FR,T_2024_9_2024_01_17_FR_84,cur,FR,ECLI:EU:T:2024:9,General Court,2024-01-17,2024,"52 Dès lors, pris dans son ensemble, le signe ...",84,0.706,grounds,paragraph,52,889,C01PointnumeroteAltN,regular_p
10748922,T_2024_9_2024_01_17_FR,T_2024_9_2024_01_17_FR_86,cur,FR,ECLI:EU:T:2024:9,General Court,2024-01-17,2024,"54 Premièrement, contrairement à ce que soutie...",86,0.723,grounds,paragraph,54,425,C01PointnumeroteAltN,regular_p
10748924,T_2024_9_2024_01_17_FR,T_2024_9_2024_01_17_FR_88,cur,FR,ECLI:EU:T:2024:9,General Court,2024-01-17,2024,"56 Deuxièmement, la circonstance que le signe ...",88,0.739,grounds,paragraph,56,476,C01PointnumeroteAltN,regular_p
10748952,T_2024_9_2024_01_17_FR,T_2024_9_2024_01_17_FR_116,cur,FR,ECLI:EU:T:2024:9,General Court,2024-01-17,2024,"2) Ilovepdf, SL et l'Office de l'Union europée...",116,0.975,operative,paragraph,76,132,C08Dispositif,regular_p


### NOTES

In [None]:
# Small, self-contained demo frame so these notes run on a fresh kernel.
df = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Carol', 'David'],
    'age': [25, 35, 45, 28],
    'description': ['Data engineer', 'data analyst', 'lawyer', 'judge'],
})

# USE query() for filters that are complex but not too elaborate.
# engine='python' evaluates the expression with plain pandas, which supports
# the .str accessor call inside the query string.
query_filtered_df = df.query('description.str.contains("data") & id > 1', engine='python')

# USE BOOLEAN masks for more complex conditions
condition1 = df['description'].str.contains('data', case=False, na=False)
condition2 = df['age'] > 30
condition3 = df['name'].isin(['Alice', 'David'])
# Combine the conditions using logical operators; the parentheses spell out
# that & binds tighter than |.
complex_mask = (condition1 & condition2) | condition3

# Apply the mask to filter the DataFrame
filtered_df = df[complex_mask]