In [1]:
# AI Legal Sentiment Analyzer with File Upload
# Complete implementation for Google Colab

# Install required packages
!pip install transformers torch sentencepiece pandas

import pandas as pd
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
)
from google.colab import files
from IPython.display import display

# Pick the compute device: CUDA when torch reports it as usable, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# @title Select Model
model_choice = "FLAN-T5" # @param ["Legal-BERT", "RoBERTa-Legal", "FLAN-T5", "DistilBERT"]

# Resolve the form selection to a Hugging Face checkpoint id. Any choice that
# is not listed here (including "DistilBERT") resolves to the SST-2 DistilBERT
# checkpoint.
# NOTE(review): the recorded run output for nlpaueb/legal-bert-small-uncased
# reports newly initialized classifier weights — confirm the legal checkpoints
# are suitable for sentiment before relying on their labels.
_checkpoint_by_choice = {
    "Legal-BERT": "nlpaueb/legal-bert-small-uncased",
    "RoBERTa-Legal": "joelito/legal-roberta-base",
    "FLAN-T5": "google/flan-t5-base",
}
model_name = _checkpoint_by_choice.get(
    model_choice,
    "distilbert-base-uncased-finetuned-sst-2-english",
)

# Initialize sentiment analysis pipeline
# Checkpoint used when the selected model cannot be loaded.
FALLBACK_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"


def _make_pipeline_analyzer(classifier):
    """Adapt a text-classification pipeline to ``text -> (label, score)``.

    Args:
        classifier: A callable pipeline returning a list of dicts that carry
            ``'label'`` and ``'score'`` keys.

    Returns:
        A function taking one string and returning the ``(label, score)`` pair
        of the first prediction.
    """
    def _analyze(text):
        # truncation=True keeps documents longer than the model's maximum
        # input length from raising inside the pipeline.
        result = classifier(text, truncation=True)
        return result[0]['label'], result[0]['score']
    return _analyze


print("\nLoading model...")
try:
    if "flan-t5" in model_name.lower():
        # FLAN-T5 is an encoder-decoder model used here through .generate(),
        # so it has to be loaded with the seq2seq LM head rather than a
        # sequence-classification head.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

        def analyze_sentiment(text):
            """Prompt FLAN-T5 for the sentiment of ``text``; returns the decoded string."""
            # NOTE: the prompt is truncated to 512 tokens, so for very long
            # documents the trailing "The sentiment is:" cue may be cut off.
            input_text = f"Analyze the sentiment of this legal text: {text}. The sentiment is:"
            inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
            outputs = model.generate(**inputs, max_new_tokens=10)
            return tokenizer.decode(outputs[0], skip_special_tokens=True)
    else:
        # For other models, use standard sentiment analysis
        sentiment_pipeline = pipeline(
            "sentiment-analysis",
            model=model_name,
            tokenizer=model_name,
            device=0 if device == "cuda" else -1
        )
        analyze_sentiment = _make_pipeline_analyzer(sentiment_pipeline)

        # Generic LABEL_n names suggest the checkpoint ships without a
        # fine-tuned classification head, in which case its predictions are
        # not meaningful sentiment labels.
        id2label = getattr(sentiment_pipeline.model.config, "id2label", None) or {}
        label_names = [str(name) for name in id2label.values()]
        if label_names and all(name.startswith("LABEL_") for name in label_names):
            print(
                f"WARNING: '{model_name}' only exposes generic labels {label_names}; "
                "it does not appear to be fine-tuned for sentiment, so its "
                "predictions may be meaningless."
            )
except Exception as e:
    print(f"Error loading model: {e}")
    print("Falling back to DistilBERT")
    # Keep model_name in sync with the model actually in use so later code
    # that inspects it does not treat the fallback as FLAN-T5.
    model_name = FALLBACK_MODEL_NAME
    sentiment_pipeline = pipeline("sentiment-analysis", model=FALLBACK_MODEL_NAME)
    analyze_sentiment = _make_pipeline_analyzer(sentiment_pipeline)

# Function to process legal documents
def analyze_legal_documents(documents):
    """Run ``analyze_sentiment`` on every document and tabulate the outcome.

    Exactly one row is produced per input entry, in input order, so the result
    can be attached positionally to the frame the documents came from.

    Args:
        documents: Iterable of documents. Only ``str`` entries are analyzed;
            any other entry is reported and recorded as ``"ERROR"``.

    Returns:
        pandas.DataFrame with columns ``text``, ``sentiment`` and ``score``.
        ``score`` is ``None`` when the analyzer returns only a label (the
        text-generation path) or when analysis failed; failed rows carry the
        sentiment ``"ERROR"``. The columns are present even for empty input.
    """
    columns = ["text", "sentiment", "score"]
    results = []
    for doc in documents:
        sentiment, score = "ERROR", None
        if isinstance(doc, str):  # Ensure we're processing text
            try:
                outcome = analyze_sentiment(doc)
                # Dispatch on what the analyzer returned rather than on the
                # configured model name: classifier analyzers give
                # (label, score), generative ones give a bare string.
                if isinstance(outcome, (tuple, list)):
                    sentiment, score = outcome
                else:
                    sentiment = outcome
            except Exception as e:
                print(f"Error analyzing document: {e}")
                sentiment, score = "ERROR", None
        else:
            print(f"Error analyzing document: expected text, got {type(doc).__name__}")
        results.append({
            "text": doc,
            "sentiment": sentiment,
            "score": score
        })
    return pd.DataFrame(results, columns=columns)

# File upload section
print("\nPlease upload your CSV file:")
uploaded = files.upload()

# Read the uploaded file and run the analysis. Every failure below is reported
# through the single handler at the bottom instead of a raw traceback.
try:
    # Guard against an empty upload result (presumably what a cancelled dialog
    # yields); without it, next(iter(...)) would raise StopIteration.
    if not uploaded:
        raise ValueError("No file was uploaded")
    filename = next(iter(uploaded))

    df = pd.read_csv(filename)
    print("\nFile contents preview:")
    display(df.head())

    if df.empty:
        raise ValueError("The uploaded CSV file contains no rows")

    # Try to automatically detect the text column
    text_columns = ['Document_Text', 'text', 'content', 'document', 'sentence', 'paragraph']
    found_column = next((col for col in text_columns if col in df.columns), None)

    if found_column:
        print(f"\nUsing column '{found_column}' for analysis")
        documents = df[found_column].astype(str).tolist()
    else:
        print("\nCouldn't find standard text column. Using first column of text type.")
        text_cols = df.select_dtypes(include=['object', 'string']).columns
        if len(text_cols) > 0:
            documents = df[text_cols[0]].astype(str).tolist()
        else:
            raise ValueError("No suitable text column found in the CSV file")

    # Analyze documents
    print("\nAnalyzing documents...")
    results_df = analyze_legal_documents(documents)

    # Add results back to original dataframe. Attach by position (not by
    # index label) so a length mismatch fails loudly instead of silently
    # filling rows with NaN.
    df['sentiment'] = results_df['sentiment'].to_numpy()
    if 'score' in results_df.columns:
        df['sentiment_score'] = results_df['score'].to_numpy()

    # Display results
    print("\nAnalysis complete. Results:")
    display(df)

    # Basic statistics. Scores are absent (all None) on the text-generation
    # path, so only report an average when at least one numeric score exists.
    if 'sentiment_score' in df.columns:
        numeric_scores = pd.to_numeric(df['sentiment_score'], errors='coerce')
        if numeric_scores.notna().any():
            avg_score = numeric_scores.mean()
            print(f"\nAverage sentiment score: {avg_score:.2f}")

    sentiment_counts = df['sentiment'].value_counts()
    print("\nSentiment Distribution:")
    print(sentiment_counts)

    # Save results option
    save_results = True # @param {type:"boolean"}
    if save_results:
        output_filename = 'legal_sentiment_results.csv'
        df.to_csv(output_filename, index=False)
        files.download(output_filename)
        print(f"\nResults saved and downloaded as '{output_filename}'")

except Exception as e:
    print(f"\nError processing file: {e}")
    print("Please ensure you've uploaded a valid CSV file with text content.")

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/989 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/141M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/141M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu



Please upload your CSV file:


Saving legal_documents_input.csv to legal_documents_input.csv

File contents preview:


Unnamed: 0,Document_ID,Document_Text,Metadata
0,1,The client expressed satisfaction with the res...,Client Feedback
1,2,The ruling imposes severe penalties on the def...,Case Summary
2,3,"While the settlement offer is reasonable, it d...",Settlement Review
3,4,"We are neutral regarding the outcome, as it al...",Internal Memo
4,5,The decision favors our position and establish...,Legal Opinion



Using column 'Document_Text' for analysis

Analyzing documents...

Analysis complete. Results:


Unnamed: 0,Document_ID,Document_Text,Metadata,sentiment,sentiment_score
0,1,The client expressed satisfaction with the res...,Client Feedback,LABEL_1,0.628747
1,2,The ruling imposes severe penalties on the def...,Case Summary,LABEL_1,0.593839
2,3,"While the settlement offer is reasonable, it d...",Settlement Review,LABEL_1,0.599273
3,4,"We are neutral regarding the outcome, as it al...",Internal Memo,LABEL_1,0.566986
4,5,The decision favors our position and establish...,Legal Opinion,LABEL_1,0.602768
5,6,There is considerable dissatisfaction among st...,Client Feedback,LABEL_1,0.607286
6,7,The legal counsel provided clear and helpful a...,Client Feedback,LABEL_1,0.685454
7,8,"The outcome appears largely positive, providin...",Case Analysis,LABEL_1,0.619256
8,9,The recent court decision has created uncertai...,Case Update,LABEL_1,0.630785
9,10,We believe the agreement represents a fair com...,Settlement Summary,LABEL_1,0.631454



Average sentiment score: 0.62

Sentiment Distribution:
sentiment
LABEL_1    30
Name: count, dtype: int64


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Results saved and downloaded as 'legal_sentiment_results.csv'
