In [1]:
import sys

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import pipeline

# Use the first CUDA GPU when torch reports one as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Device:", device)

  from .autonotebook import tqdm as notebook_tqdm


Device: cuda:0


In [2]:
# Load the raw CSV into the working frame `df`.
df = pd.read_csv('dataset/legal_text_classification.csv')

# Raw Dataset Info: report the frame's dimensions and column names before any cleaning.
raw_row_count, raw_col_count = df.shape
print("Before Preprocessing:")
print(f"Total Rows: {raw_row_count}")
print(f"Total Columns: {raw_col_count}")
print(f"\nColumn Names: {df.columns}")

Before Preprocessing:
Total Rows: 24985
Total Columns: 4

Column Names: Index(['case_id', 'case_outcome', 'case_title', 'case_text'], dtype='object')


In [3]:
# Count rows whose case_title AND case_text both repeat an earlier row.
# A bare df.duplicated() compares every column, so a row only counts as a duplicate
# when case_id and case_outcome match as well -- that is not what the message below
# reports. Restricting the comparison to the two text columns makes the printed
# number mean what it says.
# NOTE(review): case_id looks like a per-row identifier (Case1, Case2, ... in the
# sample output), which would explain the previously recorded count of 0 -- confirm.
duplicate_key_cols = ['case_title', 'case_text']
duplicate_rows = df.duplicated(subset=duplicate_key_cols).sum()
print(f"Number of Duplicate Rows (same title & text): {duplicate_rows}")
# Dropping the duplicates stays disabled, as before; enable the next line only after
# reviewing the count, because it changes every downstream row total.
#df = df.drop_duplicates(subset=['case_title', 'case_text'])

Number of Duplicate Rows (same title & text): 0


In [4]:
# Rows lacking either the text or the outcome are set aside in `missing_rows`
# (for reporting) and then removed from the working frame.
required_cols = ['case_text', 'case_outcome']
has_missing_value = df[required_cols].isnull().any(axis=1)
missing_rows = df[has_missing_value]
df = df.dropna(subset=required_cols)
print("Rows with missing values in 'case_text' or 'case_outcome':", len(missing_rows))

Rows with missing values in 'case_text' or 'case_outcome': 176


In [5]:
# Trim leading/trailing whitespace from the free-text columns.
for text_col in ('case_text', 'case_title'):
    df[text_col] = df[text_col].str.strip()

# Outcome labels are lowercased and then trimmed, so variants that differ only in
# case or surrounding whitespace collapse to a single label.
normalized_outcome = df['case_outcome'].str.lower().str.strip()
df['case_outcome'] = normalized_outcome

In [6]:
# Fit an encoder on the outcome strings and store the resulting integer ids
# in a new 'case_label' column.
label_encoder = LabelEncoder()
label_encoder.fit(df['case_outcome'])
df['case_label'] = label_encoder.transform(df['case_outcome'])

# Build a lookup table pairing every known outcome name with its encoded id.
outcome_names = label_encoder.classes_
outcome_ids = label_encoder.transform(outcome_names)
label_mapping_df = pd.DataFrame({
    'case_outcome': outcome_names,
    'case_label': outcome_ids,
})

print("Label Mapping Table:")
print(label_mapping_df.to_string(index=False))

Label Mapping Table:
 case_outcome  case_label
     affirmed           0
      applied           1
     approved           2
        cited           3
   considered           4
    discussed           5
distinguished           6
     followed           7
  referred to           8
      related           9


In [7]:
# Preprocessed Dataset info: same summary as the raw one, taken after cleaning
# and after the 'case_label' column was added.
clean_row_count, clean_col_count = df.shape
print("After Preprocessing:")
print(f"Total Rows: {clean_row_count}")
print(f"Total Columns: {clean_col_count}")
print(f"\nColumn Names: {df.columns}")

# First rows of the cleaned frame as a quick visual check.
print("\nSample Data:\n", df.head())

After Preprocessing:
Total Rows: 24809
Total Columns: 5

Column Names: Index(['case_id', 'case_outcome', 'case_title', 'case_text', 'case_label'], dtype='object')

Sample Data:
   case_id case_outcome                                         case_title  \
0   Case1        cited  Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...   
1   Case2        cited  Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...   
2   Case3        cited  Colgate Palmolive Co v Cussons Pty Ltd (1993) ...   
3   Case4        cited  Dais Studio Pty Ltd v Bullett Creative Pty Ltd...   
4   Case5        cited  Dr Martens Australia Pty Ltd v Figgins Holding...   

                                           case_text  case_label  
0  Ordinarily that discretion will be exercised s...           3  
1  The general principles governing the exercise ...           3  
2  Ordinarily that discretion will be exercised s...           3  
3  The general principles governing the exercise ...           3  
4  The preceding general principles inform the ex...           3  

In [8]:
# Split the preprocessed frame into stratified train / validation / test partitions
# (80% / 10% / 10%) and write each partition to its own CSV file.
# `train_test_split` is imported in the notebook's top import cell.

# First split: Train (80%) and Temp (20%), keeping the label proportions of `df`.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['case_label'],
    random_state=42
)

# Second split: Validation (10%) and Test (10%) from Temp, again stratified by label.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['case_label'],
    random_state=42
)

# Sanity check: the three partitions together must account for every row of `df`.
assert len(train_df) + len(val_df) + len(test_df) == len(df), \
    "train/val/test sizes do not add up to the dataset size"

# Save to CSV files (without the index column).
# NOTE(review): these file names spell "classifcation" (sic), unlike the input file.
# They are left unchanged because other code may load these exact paths -- confirm
# before renaming.
train_df.to_csv('dataset/legal_text_classifcation_train.csv', index=False)
val_df.to_csv('dataset/legal_text_classifcation_val.csv', index=False)
test_df.to_csv('dataset/legal_text_classifcation_test.csv', index=False)


print(f"Train set size     : {len(train_df)} rows")
print(f"Validation set size: {len(val_df)} rows")
print(f"Test set size      : {len(test_df)} rows")

# Class counts over the whole dataset, for judging how imbalanced the labels are.
print("Overall Label Distribution:\n", df['case_label'].value_counts())


Train set size     : 19847 rows
Validation set size: 2481 rows
Test set size      : 2481 rows
Overall Label Distribution:
 case_label
3    12110
8     4363
1     2438
7     2252
4     1699
5     1018
6      603
9      112
2      108
0      106
Name: count, dtype: int64


In [9]:
# Show the first five rows of the working frame (head() defaults to n=5).
print(df.head())

  case_id case_outcome                                         case_title  \
0   Case1        cited  Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...   
1   Case2        cited  Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...   
2   Case3        cited  Colgate Palmolive Co v Cussons Pty Ltd (1993) ...   
3   Case4        cited  Dais Studio Pty Ltd v Bullett Creative Pty Ltd...   
4   Case5        cited  Dr Martens Australia Pty Ltd v Figgins Holding...   

                                           case_text  case_label  
0  Ordinarily that discretion will be exercised s...           3  
1  The general principles governing the exercise ...           3  
2  Ordinarily that discretion will be exercised s...           3  
3  The general principles governing the exercise ...           3  
4  The preceding general principles inform the ex...           3  
