In [None]:
!pip install transformers torch datasets numpy pandas sentence-transformers pinecone-client boto3 streamlit pinecone

: 

In [8]:
!unzip /content/t5_finetuned.zip -d /content/t5_finetuned


Archive:  /content/t5_finetuned.zip
replace /content/t5_finetuned/content/t5_finetuned/model.safetensors? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [14]:
!pip install pinecone sentence_transformers



In [15]:
!pip install sentence-transformers



In [17]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.30.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.18-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.

In [22]:
import pinecone
from sentence_transformers import SentenceTransformer
import pandas as pd

# Initialize Pinecone client
client = pinecone.Pinecone(api_key={API_KEY})

# Define index name
index_name = "financial-docs"

# Check if the index exists; if not, create it
if index_name not in client.list_indexes().names():
    print(f"Index {index_name} does not exist. Creating it...")
    client.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=pinecone.ServerlessSpec(cloud="aws", region="us-east-1")
    )

    # Connect to the index
    index = client.Index(index_name)

    # Load dataset
    df = pd.read_csv("/content/financial_data.csv")

    # Load embedding model
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    # Generate embeddings
    embeddings = embedder.encode(df["transcript"].tolist(), show_progress_bar=True)

    # Prepare vectors with truncated metadata
    ids = [str(i) for i in range(len(df))]
    vectors = [(ids[i], embeddings[i].tolist(), {"text": df["transcript"][i][:500]}) for i in range(len(df))]

    # Batch upsert
    batch_size = 50
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i + batch_size]
        index.upsert(vectors=batch)
        print(f"Uploaded batch {i // batch_size + 1} of {len(vectors) // batch_size + 1}")

    print("Documents stored in Pinecone!")
else:
    print(f"Index {index_name} already exists. Connecting to it...")
    index = client.Index(index_name)

# Step 4: Recreate the Gradio UI
import gradio as gr
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Connect to the Pinecone index (already connected, but ensuring it's available)
index = client.Index("financial-docs")

# Load the fine-tuned T5 model
tokenizer = T5Tokenizer.from_pretrained("/content/t5_finetuned/content/t5_finetuned")
model = T5ForConditionalGeneration.from_pretrained("/content/t5_finetuned/content/t5_finetuned")

# Load transcript mapping
mapping = pd.read_csv("/content/transcript_mapping.csv")

# Define the search and summarize function for the UI
def search_and_summarize(query):
    # Search Pinecone
    query_embedding = embedder.encode([query])[0].tolist()
    results = index.query(vector=query_embedding, top_k=1, include_metadata=True)

    if not results['matches']:

        return "No matching transcripts found.", ""

    # Retrieve the top result
    top_match = results['matches'][0]
    top_id = top_match['id']
    score = top_match['score']
    truncated_text = top_match['metadata']['text']

    # Get the full transcript using the 'mapping' DataFrame
    matching_rows = mapping[mapping["pinecone_id"].astype(str) == top_id] # Filter the mapping DataFrame

    if matching_rows.empty:
        return f"Error: No transcript found for Pinecone ID {top_id}", ""

    top_transcript = matching_rows["transcript"].values[0]

    # Generate summary
    inputs = tokenizer("summarize: " + top_transcript, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=100,
        min_length=20,
        num_beams=6,
        no_repeat_ngram_size=3,
        length_penalty=1.0,
        early_stopping=False
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Format the output
    search_result = f"**Top Matching Transcript (ID: {top_id}, Score: {score:.2f})**\n{truncated_text}..."
    return search_result, summary

# Build the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Financial Report Summarizer")
    gr.Markdown("Enter a query to search financial transcripts and get a summary.")

    with gr.Row():
        query_input = gr.Textbox(label="Query", placeholder="e.g., What is the revenue growth for Pharma?")
        submit_button = gr.Button("Search and Summarize")

    with gr.Row():
        with gr.Column():
            search_output = gr.Textbox(label="Search Result (Top Matching Transcript)")
        with gr.Column():
            summary_output = gr.Textbox(label="Summary")

    # Connect the button to the function
    submit_button.click(
        fn=search_and_summarize,
        inputs=query_input,
        outputs=[search_output, summary_output]
    )

# Launch the UI
demo.launch()

Index financial-docs already exists. Connecting to it...
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://49f01cdeaf4cd2ccda.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [30]:
import pinecone
from sentence_transformers import SentenceTransformer
import pandas as pd
import re

# Initialize Pinecone client
client = pinecone.Pinecone(api_key="pcsk_3z4iTz_BoTpE3CrVJVTUvF37hWHeN5Eo6XnosKTjidbKsvb9FotdUETgUnDNpXqHkCFVN9")

# Define index name
index_name = "financial-docs"

# Check if the index exists; if not, create it
if index_name not in client.list_indexes().names():
    print(f"Index {index_name} does not exist. Creating it...")
    client.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=pinecone.ServerlessSpec(cloud="aws", region="us-east-1")
    )

    # Connect to the index
    index = client.Index(index_name)

    # Load dataset
    df = pd.read_csv("/content/financial_data.csv")

    # Load embedding model
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    # Generate embeddings
    embeddings = embedder.encode(df["transcript"].tolist(), show_progress_bar=True)

    # Prepare vectors with truncated metadata
    ids = [str(i) for i in range(len(df))]
    vectors = [(ids[i], embeddings[i].tolist(), {"text": df["transcript"][i][:500]}) for i in range(len(df))]

    # Batch upsert
    batch_size = 50
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i + batch_size]
        index.upsert(vectors=batch)
        print(f"Uploaded batch {i // batch_size + 1} of {len(vectors) // batch_size + 1}")

    print("Documents stored in Pinecone!")
else:
    print(f"Index {index_name} already exists. Connecting to it...")
    index = client.Index(index_name)

# Load the embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Connect to the Pinecone index
index = client.Index("financial-docs")

# Load the fine-tuned T5 model (fixed path)
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("/content/t5_finetuned/content/t5_finetuned")
model = T5ForConditionalGeneration.from_pretrained("/content/t5_finetuned/content/t5_finetuned")

# Load transcript mapping
mapping = pd.read_csv("/content/transcript_mapping.csv")


# Function to preprocess transcript and extract key financial sentences
def preprocess_transcript(transcript):
    # Split transcript into sentences
    sentences = re.split(r'(?<=[.!?])\s+', transcript)

    # Keywords for financial metrics
    financial_keywords = [
        'revenue', 'earnings', 'eps', 'growth', 'margin', 'operating', 'profit',
        'q1', 'q2', 'q3', 'q4', 'quarter', 'fiscal', 'year', 'outlook', 'guidance', 'market',
        'pharma', 'chemical', 'energy', 'diagnostics', 'business unit', 'americas', 'china'
    ]

    # Filter sentences containing financial keywords
    key_sentences = [
        sentence for sentence in sentences
        if any(keyword.lower() in sentence.lower() for keyword in financial_keywords)
    ]

    # Join the key sentences back into a shorter text
    return " ".join(key_sentences) if key_sentences else transcript
# Updated search_and_summarize function with length parameter
def search_and_summarize(query, summary_length):
    try:
        # Search Pinecone
        query_embedding = embedder.encode([query])[0].tolist()
        if len(query_embedding) != 384:
            return f"Error: Query embedding dimension ({len(query_embedding)}) does not match index dimension (384).", ""

        results = index.query(vector=query_embedding, top_k=1, include_metadata=True)
        if not results['matches']:
            return "No matching transcripts found in Pinecone.", ""

        # Retrieve the top result
        top_match = results['matches'][0]
        top_id = top_match['id']
        score = top_match['score']
        truncated_text = top_match['metadata']['text']

        # Ensure pinecone_id is a string in the mapping
        mapping["pinecone_id"] = mapping["pinecone_id"].astype(str)

        # Get the full transcript
        matching_rows = mapping[mapping["pinecone_id"] == top_id]
        if matching_rows.empty:
            return f"Error: No transcript found for Pinecone ID {top_id}.", ""

        if "transcript" not in matching_rows.columns:
            return "Error: 'transcript' column missing in transcript_mapping.csv.", ""

        top_transcript = matching_rows["transcript"].values[0]

        # Preprocess the transcript to focus on financial details
        processed_transcript = preprocess_transcript(top_transcript)

        # Set max_length and min_length based on user selection
        if summary_length == "Short":
            max_len = 50
            min_len = 20
        elif summary_length == "Medium":
            max_len = 200
            min_len = 40
        else:  # Long
            max_len = 500
            min_len = 60

        # Generate summary
        inputs = tokenizer("summarize: " + processed_transcript, return_tensors="pt", max_length=512, truncation=True)
        summary_ids = model.generate(
            inputs["input_ids"],
            max_length=max_len,
            min_length=min_len,
            num_beams=6,
            no_repeat_ngram_size=3,
            length_penalty=2.0,
            early_stopping=True
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Format the output
        search_result = f"**Top Matching Transcript (ID: {top_id}, Score: {score:.2f})**\n{truncated_text}..."
        return search_result, summary

    except Exception as e:
        return f"Error in search_and_summarize: {str(e)}", ""

# Build the Gradio UI with summary length option
import gradio as gr
with gr.Blocks() as demo:
    gr.Markdown("# Financial Report Summarizer")
    gr.Markdown("Enter a query to search financial transcripts and get a summary.")

    with gr.Row():
        query_input = gr.Textbox(label="Query", placeholder="e.g., What is the revenue growth for Pharma?")
        summary_length = gr.Dropdown(
            choices=["Short", "Medium", "Long"],
            label="Summary Length",
            value="Medium"
        )
        submit_button = gr.Button("Search and Summarize")

    with gr.Row():
        with gr.Column():
            search_output = gr.Textbox(label="Search Result (Top Matching Transcript)")
        with gr.Column():
            summary_output = gr.Textbox(label="Summary")

    # Connect the button to the function
    submit_button.click(
        fn=search_and_summarize,
        inputs=[query_input, summary_length],
        outputs=[search_output, summary_output]
    )

# Launch the UI
demo.launch()

Index financial-docs already exists. Connecting to it...
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5e3dc459867fefcf71.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
print("Columns in transcript_mapping.csv:", df.columns)
print("First few rows of transcript_mapping.csv:")
print(df.head())

Columns in transcript_mapping.csv: Index(['transcript', 'summary'], dtype='object')
First few rows of transcript_mapping.csv:
                                          transcript  \
0  Chief Investment Officer Greg Wright, Chief Te...   
1  With me on the call is Ronald Kramer, our Chai...   
2  These statements are based on management's cur...   
3  I'm Susie Lisa, senior vice president of inves...   
4  Also on the call are Brian McDade, chief finan...   

                                             summary  
0        q3 revenue rose 11 percent to $1.1 billion.  
1  q3 earnings per share $0.31.\nq3 adjusted earn...  
2  q3 non-gaap earnings per share $2.55 from cont...  
3  cvs health qtrly revenue rose 10.1% to $76.6 b...  
4  simon property sees fy ffo per share $9.70 to ...  


KeyError: 'pinecone_id'

In [None]:
mapping["pinecone_id"] = mapping["pinecone_id"].astype(str)
matching_row = mapping[mapping["pinecone_id"] == top_id]
if matching_row.empty:c
    print("No matching transcript found for ID:", top_id)
else:
    top_transcript = matching_row["transcript"].values[0]
    print("Top transcript:", top_transcript)

Top transcript: More information is included in our most recent annual report on Form 10-K and subsequent quarterly reports on Form 10-Q and in the company's other filings with the SEC.
It's a pleasure to be with you today.
Labcorp is carrying on our mission to improve health and improve lives by harnessing the power of science, technology and innovation.
In doing so, we're able to execute against our strategy, to deliver strong results for stakeholders and to effectively respond to global challenges like the pandemic.
Our company rounded out a historic 2021 with another strong quarter that sets the stage for further success in 2022 and beyond.
In the fourth quarter, revenue totaled $4.1 billion, adjusted earnings per share reached $6.77, and free cash flow was $548 million.
For the full year, revenue was $16.1 billion, adjusted earnings per share totaled $28.52, and free cash flow reached $2.6 billion.
Our base business continued its progress during the quarter, with diagnostics and d

In [None]:
# Import libraries
import pinecone
from sentence_transformers import SentenceTransformer
import pandas as pd

# Initialize Pinecone client
client = pinecone.Pinecone(api_key="pcsk_3z4iTz_BoTpE3CrVJVTUvF37hWHeN5Eo6XnosKTjidbKsvb9FotdUETgUnDNpXqHkCFVN9")

# Define index name
index_name = "financial-docs"

# Create index if it doesn't exist
if index_name not in client.list_indexes().names():
    client.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=pinecone.ServerlessSpec(cloud="aws", region="us-east-1")
    )

# Connect to the index
index = client.Index(index_name)

# Load dataset
df = pd.read_csv("/content/financial_data.csv")

# Load embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings
embeddings = embedder.encode(df["transcript"].tolist(), show_progress_bar=True)

# Prepare vectors with truncated metadata
ids = [str(i) for i in range(len(df))]
vectors = [(ids[i], embeddings[i].tolist(), {"text": df["transcript"][i][:500]}) for i in range(len(df))]

# Batch upsert
batch_size = 50
for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i + batch_size]
    index.upsert(vectors=batch)
    print(f"Uploaded batch {i // batch_size + 1} of {len(vectors) // batch_size + 1}")

print("Documents stored in Pinecone!")

# Save transcript mapping
df["pinecone_id"] = ids
df[["pinecone_id", "transcript"]].to_csv("/content/transcript_mapping.csv", index=False)
print("Transcript mapping saved to /content/transcript_mapping.csv")

# # Save to Google Drive
# from google.colab import drive
# drive.mount('/content/drive')
# !cp /content/transcript_mapping.csv /content/drive/MyDrive/transcript_mapping.csv
# print("Transcript mapping copied to Google Drive!")

# # Install Gradio
# !pip install gradio

# # Re-upload the fine-tuned T5 model
# from google.colab import files
# uploaded = files.upload()  # Upload t5_finetuned.zip
# !unzip /content/t5_finetuned.zip -d /content

# # Import necessary libraries for the UI
# import gradio as gr
# from transformers import T5Tokenizer, T5ForConditionalGeneration

# # Load the embedding model (already loaded, but ensuring it's available)
# embedder = SentenceTransformer("all-MiniLM-L6-v2")

# # Connect to the Pinecone index (already connected, but ensuring it's available)
# index = client.Index("financial-docs")

# # Load the fine-tuned T5 model
# tokenizer = T5Tokenizer.from_pretrained("/content/t5_finetuned")
# model = T5ForConditionalGeneration.from_pretrained("/content/t5_finetuned")

# # Load transcript mapping
# mapping = pd.read_csv("/content/transcript_mapping.csv")

# # Define the search and summarize function for the UI
# def search_and_summarize(query):
#     # Search Pinecone
#     query_embedding = embedder.encode([query])[0].tolist()
#     results = index.query(vector=query_embedding, top_k=1, include_metadata=True)

#     if not results['matches']:
#         return "No matching transcripts found.", ""

#     # Retrieve the top result
#     top_match = results['matches'][0]
#     top_id = top_match['id']
#     score = top_match['score']
#     truncated_text = top_match['metadata']['text']

    # Get the full transcript
#     top_transcript = mapping[mapping["pinecone_id"] == top_id]["transcript"].values[0]

#     # Generate summary
#     inputs = tokenizer("summarize: " + top_transcript, return_tensors="pt", max_length=512, truncation=True)
#     summary_ids = model.generate(
#         inputs["input_ids"],
#         max_length=100,
#         min_length=20,
#         num_beams=6,
#         no_repeat_ngram_size=3,
#         length_penalty=1.0,
#         early_stopping=False
#     )
#     summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

#     # Format the output
#     search_result = f"**Top Matching Transcript (ID: {top_id}, Score: {score:.2f})**\n{truncated_text}..."
#     return search_result, summary

# # Build the Gradio UI
# with gr.Blocks() as demo:
#     gr.Markdown("# Financial Report Summarizer")
#     gr.Markdown("Enter a query to search financial transcripts and get a summary.")

#     with gr.Row():
#         query_input = gr.Textbox(label="Query", placeholder="e.g., What is the revenue growth for Pharma?")
#         submit_button = gr.Button("Search and Summarize")

#     with gr.Row():
#         with gr.Column():
#             search_output = gr.Textbox(label="Search Result (Top Matching Transcript)")
#         with gr.Column():
#             summary_output = gr.Textbox(label="Summary")

#     # Connect the button to the function
#     submit_button.click(
#         fn=search_and_summarize,
#         inputs=query_input,
#         outputs=[search_output, summary_output]
#     )

# # Launch the UI
# demo.launch()

Batches:   0%|          | 0/76 [00:00<?, ?it/s]

Uploaded batch 1 of 49
Uploaded batch 2 of 49
Uploaded batch 3 of 49
Uploaded batch 4 of 49
Uploaded batch 5 of 49
Uploaded batch 6 of 49
Uploaded batch 7 of 49
Uploaded batch 8 of 49
Uploaded batch 9 of 49
Uploaded batch 10 of 49
Uploaded batch 11 of 49
Uploaded batch 12 of 49
Uploaded batch 13 of 49
Uploaded batch 14 of 49
Uploaded batch 15 of 49
Uploaded batch 16 of 49
Uploaded batch 17 of 49
Uploaded batch 18 of 49
Uploaded batch 19 of 49
Uploaded batch 20 of 49
Uploaded batch 21 of 49
Uploaded batch 22 of 49
Uploaded batch 23 of 49
Uploaded batch 24 of 49
Uploaded batch 25 of 49
Uploaded batch 26 of 49
Uploaded batch 27 of 49
Uploaded batch 28 of 49
Uploaded batch 29 of 49
Uploaded batch 30 of 49
Uploaded batch 31 of 49
Uploaded batch 32 of 49
Uploaded batch 33 of 49
Uploaded batch 34 of 49
Uploaded batch 35 of 49
Uploaded batch 36 of 49
Uploaded batch 37 of 49
Uploaded batch 38 of 49
Uploaded batch 39 of 49
Uploaded batch 40 of 49
Uploaded batch 41 of 49
Uploaded batch 42 of 49
U

Archive:  /content/t5_finetuned.zip
replace /content/content/t5_finetuned/model.safetensors? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/content/t5_finetuned/spiece.model? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/content/t5_finetuned/generation_config.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/content/t5_finetuned/added_tokens.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/content/t5_finetuned/tokenizer_config.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/content/t5_finetuned/config.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/content/t5_finetuned/special_tokens_map.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/content/t5_finetuned'. Use `repo_type` argument if needed.

In [None]:
import zipfile
import os

# Define the path to your ZIP file
zip_path = "/content/final.zip"

# Define where to extract the files
extract_path = "/content/data"

# Unzip the file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"Unzipped to {extract_path}")

# List the extracted files to confirm
!ls {extract_path}

Unzipped to /content/data
final  __MACOSX


In [None]:
import os
import pandas as pd

# Define paths to the folders
ects_path = "/content/data/final/"
summaries_path = "/content/data/final/"

# Lists to store transcripts and summaries
transcripts = []
summaries = []
lis_folders = ["test", "train", "val"]
# Get list of files in ects folder
for folder in lis_folders:
  transcript_files = [f for f in os.listdir(ects_path + folder + "/ects") if f.endswith(".txt")]

  # For each transcript file, find its matching summary file
  for transcript_file in transcript_files:
      # Read the transcript
      with open(os.path.join(ects_path + folder + "/ects", transcript_file), "r", encoding="utf-8") as f:
          transcript_text = f.read().strip()

      # Look for the matching summary file (same filename)
      summary_file = os.path.join(summaries_path + folder + "/gt_summaries", transcript_file)
      if os.path.exists(summary_file):
          with open(summary_file, "r", encoding="utf-8") as f:
              summary_text = f.read().strip()

          # Add the pair to our lists
          transcripts.append(transcript_text)
          summaries.append(summary_text)
      else:
          print(f"Warning: No matching summary found for {transcript_file}")

# Create a DataFrame
data = {
    "transcript": transcripts,
    "summary": summaries
}
df = pd.DataFrame(data)

# Save to CSV
df.to_csv("financial_data.csv", index=False)
print(f"Created financial_data.csv with {len(df)} transcript-summary pairs!")

Created financial_data.csv with 2425 transcript-summary pairs!


Check whether the data was successfully transformed

In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv("/content/financial_data.csv")

# Check the number of samples
print(f"Number of samples: {len(df)}")

# Display the first few rows to verify
print(df.head(1)['transcript'])
test1 = df.head(1)['transcript']
test2 = df.head(1)['summary']


In [None]:
# Use a subset of 500 samples for testing
df_subset = df.head(500)

# Save the subset to a new CSV
df_subset.to_csv("/content/financial_data_subset.csv", index=False)
print("Created financial_data_subset.csv with 500 samples")

In [None]:
# Now try importing the T5 model
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")
print("T5 model loaded successfully!")

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# Load dataset with pandas
df = pd.read_csv("/content/financial_data.csv")

# Convert pandas DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Preprocess data: Add "summarize: " prefix to inputs
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["transcript"]]
    targets = examples["summary"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Split into train/test (80% train, 20% test)
train_size = int(0.8 * len(tokenized_dataset))
train_dataset = tokenized_dataset.select(range(train_size))
eval_dataset = tokenized_dataset.select(range(train_size, len(tokenized_dataset)))

# Define training arguments
training_args = TrainingArguments(
    output_dir="/content/t5_finetuned",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Keep batch size low to avoid memory issues
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="/content/logs",
    logging_steps=100,
    gradient_accumulation_steps=2,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune
trainer.train()

# Save model
model.save_pretrained("/content/t5_finetuned")
tokenizer.save_pretrained("/content/t5_finetuned")
print("Model fine-tuned and saved!")

# Download the model (optional)
!zip -r /content/t5_finetuned.zip /content/t5_finetuned
from google.colab import files
files.download("/content/t5_finetuned.zip")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/2425 [00:00<?, ? examples/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbluefoodgames[0m ([33mbluefoodgames-univeristy-of-minnesota[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.7055,1.437874
2,1.488,1.33187
3,1.3438,1.311662


Model fine-tuned and saved!
  adding: content/t5_finetuned/ (stored 0%)
  adding: content/t5_finetuned/model.safetensors (deflated 10%)
  adding: content/t5_finetuned/spiece.model (deflated 48%)
  adding: content/t5_finetuned/generation_config.json (deflated 29%)
  adding: content/t5_finetuned/checkpoint-970/ (stored 0%)
  adding: content/t5_finetuned/checkpoint-970/model.safetensors (deflated 11%)
  adding: content/t5_finetuned/checkpoint-970/rng_state.pth (deflated 25%)
  adding: content/t5_finetuned/checkpoint-970/generation_config.json (deflated 29%)
  adding: content/t5_finetuned/checkpoint-970/optimizer.pt (deflated 7%)
  adding: content/t5_finetuned/checkpoint-970/config.json (deflated 63%)
  adding: content/t5_finetuned/checkpoint-970/scheduler.pt (deflated 55%)
  adding: content/t5_finetuned/checkpoint-970/trainer_state.json (deflated 68%)
  adding: content/t5_finetuned/checkpoint-970/training_args.bin (deflated 52%)
  adding: content/t5_finetuned/added_tokens.json (deflated 8

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install hf_xet

In [None]:
import transformers
print(transformers.__version__)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model
tokenizer = T5Tokenizer.from_pretrained("/content/t5_finetuned")
model = T5ForConditionalGeneration.from_pretrained("/content/t5_finetuned")

# Test transcript (same as provided)
test_transcript = """Joining in the Q&A after Bob and Mike's comments will be Jacob Thaysen, President of Agilent's Life Sciences and Applied Markets Group; Sam Raha, President of Agilent's Diagnostics and Genomics Group; and Padraig McDonnell, President of Agilent CrossLab Group.
You will find the most directly comparable GAAP financial metrics and reconciliations on our website.
Core revenue growth excludes the impact of currency and the acquisitions and divestitures completed within the past 12 months.
The Agilent team delivered another excellent quarter to close out an outstanding record-setting 2021.
At $6.32 billion for fiscal 2021, revenues are almost $1 billion higher than last year.
Full year core growth is up 15% on top of growing 1% last year.
The strength is broad-based for the three business units, all growing more than 10% core for the year.
Our full year operating margin was up 200 basis points.
Earnings per share are $4.34 or up 32%.
Let's now take a closer look at our strong finish to 2021 and review Q4 results.
Our momentum continues as orders increase faster than revenue in Q4.
And at the same time, we delivered our fourth straight quarter of double-digit revenue growth.
At $1.66 billion, revenues are up 12% on a reported basis.
Our core revenues grew 11%, exceeding our expectations.
This is on top of 6% core growth last year.
Our Q4 operating margin is 26.5%.
This is up 160 basis points from last year.
EPS is $1.21, up 23% year-over-year.
Our earnings growth also exceeded our expectations.
We continue to perform extremely well in Pharma, our largest market, growing 21%, driven by our Biopharma business.
Total pharma now represents 36% of our overall revenue.
This compared to 31% of our revenues just two years ago.
The strong growth in our Chemical and Energy business continues as we delivered 11% growth in the quarter.
This is on top of growing 3% in Q4 of last year.
PMI numbers are positive and we expect that chemical and energy will continue its strong growth trajectory into fiscal 2022.
In Diagnostics and Clinical, revenues grew 11% on top of growing 1% last year as testing volume started to recover.
On a geographic basis, our results are led by strong performance in the Americas and China.
Our business in the Americas grew 15% on top of 5% last year.
China grew 8% core on top of strong 13% growth in Q4 of last year.
China order growth outpaced revenue growth for the third quarter in a row.
Now, looking at a performance by business unit, the Life Sciences and Applied Markets Group generate revenue of $747 million.
LSAG is up 11% of both the reported and a core basis.
LSAG's growth is broad based and led by strength in liquid chromatography and cell analysis.
The Pharma and Chemical Energy markets were particularly strong for new instrument purchases.
Our cell analysis business crossed $100 million revenue mark in the quarter for the first time.
During the quarter, the LSAG team announced a new high mobility LC/Q-TOF and enhancements to our VWorks automation software suite.
These new well received offerings are used to improved analysis of proteins and peptides to speed development of new protein-based therapeutics.
The Agilent CrossLab Group posted revenue of $572 million.
This is up reported 10% and 9% core.
Growth is broad based, driven by strength in service contracts and on-demand services as well as our chemistries and supplies.
Our focus on increasing connect rates continues to pay off for us.
The strong expansion of our installed base in 2021 and increasing connect rates bodes well continued to strengthen our ACG business moving forward.
Our ability to drive growth and leverage our scale produce operating margins of roughly 30%, not more than 200 basis points from the prior year.
The Diagnostics and Genomics Group delivered revenue of $341 million, up 16% reported and up 13% core.
Our NASD oligo business led the way with robust double-digit growth in the quarter and achieved full year revenues exceeding $225 million.
We expect another year of strong double-digit growth as the team continues to do a great job of increasing throughput with existing capacity.
The expansion of our Train B oligo manufacturing facility in Frederick, Colorado is proceeding as planned.
We expect this additional capacity to come online by the end of calendar year 2022.
Moving on from our other business group updates, there are several other significant developments for Agilent this quarter.
We announced our commitment to achieving net zero greenhouse gas emissions by 2050.
We believe our approach delivers the same rigorous sustainability that'd be applied to everything else we do.
We also believe these actions are not only the right thing to do, but fundamental to achieving long-term success.
Our sustainable leadership continues to be primarily recognized as well.
You may have seen that Investor's Business Daily recently named Agilent to its Top 100 ESG Companies list.
We're also a company where diversity and inclusion represent a company priority and is a core element of our culture.
During the quarter, we achieved recognition by Forbes as one of the World's Best Employers and as a Best Workplace for Women.
While the Agilent team has a strong track record of delivering above-market growth and leading customer satisfaction, we're always looking to do more.
To further accelerate growth and strengthen our focus on customers, we are implementing a new One Agilent commercialization, combining for the first time all customer-facing activities under one leader.
The new organization brings together and strengthens our sales, marketing, digital channel and services team.
The new enterprise level commercialization is led by Padraig McDonnell.
Padraig will continue to lead the Agilent CrossLab Group as Business Group President as well as serves Agilent's first ever Chief Commercial Officer.
The way I'd like to characterize this move is to say we are doubling down on the success we've achieved with ACG, applying a holistic customer-focused approach to all aspects of our business.
We're also moving the chemistries and supplies division to LSAG.
This close organizational alignment between instrument and chemistries development will further accelerate our progress on instrument connect rates for chemistries and consumables.
We believe that structure of follow strategy and that this new organizational structure will further enhance our customer focus and the execution of our growth strategies.
Looking ahead to the coming year, we are in a strong position to continue to deliver on our build and buy growth strategy.
Agilent's business remains strong.
We enter the new year with a robust backlog and have multiple growth drivers, coupled with the proven execution excellence of the Agilent team.
A year ago to our Agilent Investor Day, we raised our long-term annual growth outlook to the 5% to 7% range, while reaffirming our commitment to annual operating margin improvement and double-digit earnings per share growth.
We are now one year in and well on our way to achieving these long-term goals.
Bob will provide more details, but for fiscal 2022, our initial full year guide calls for a core growth in the range of 5.5% to 7%.
We expect to continue our top line growth as we launch market-leading products and services, invest in fast-growing businesses and deliver outstanding customer service.
My confidence in the unstoppable One Agilent team and our ability to execute and deliver remains firmly intact.
This is our formula for delivering solid financial results, outstanding shareholder returns and continued strong growth.
We are very pleased with our performance in 2021 but not satisfied.
As I tell the Agilent team, the best is yet to come for our customers, our team and our shareholders.
I will now hand the call off to Bob.
In my remarks today, I'll provide some additional details on revenue and take you through the income statement and some other key financial metrics.
I'll then finish up with our initial outlook for the upcoming year and for the first quarter.
As Mike mentioned, we had very strong results in the fourth quarter.
Revenue was $1.66 billion, reflecting reported growth of 12%.
Core revenue growth at 11% was a point above our top end guidance range.
Currency accounted for 0.8% of growth, while M&A contributed half a point of growth during Q4.
And as expected, COVID-19-related revenues were roughly flat sequentially and resulted in just over a point headwind to the quarterly revenue growth.
Late in the quarter, we did see transit times that were in certain cases greater than anticipated, resulting in some revenues being deferred into Q1.
Our results were driven by a continuation of outstanding momentum in Pharma and in Biopharma in particular, while Chemical and Energy and Diagnostics and clinical also delivered strong results for us.
Our largest market, Pharma, grew 21% during the quarter against a tough compare of 12% last year.
The Small Molecule segment delivered mid-teens growth, while Large Molecule grew 30%.
Pharma was a standout all year, growing 24% for the full year after growing 6% in 2020.
And in FY '22, we expect our Pharma business to grow in the high-single digits.
Chemical and Energy continue to show strength growing 11% with instrument growth in the mid-teens during the quarter.
This impressive performance was against a 3% increase last year.
The C&E business grew 12% for the year after declining 3% in 2020.
Growth was driven by continued momentum in chemicals and engineered materials and we expect our C&E business to continue to grow solidly next year in the high-single digits.
Diagnostics and Clinical grew 11% with all three groups growing nicely during the quarter.
While the largest dollar contributor to this market is DGG, driven by our pathology-related businesses, the LSAG business continues to penetrate the clinical market and drive growth with strong performances by Cell Analysis and Mass Spec.
We saw mid-teens growth in the Americas and strong growth in China, albeit off a small base.
For the year, the Diagnostics and Clinical business grew 15% for the year after declining slightly by 1% in 2020.
And we expect to continue to grow in the mid to high-single digits in 2022.
Academia and Government, which can be lumpy and represents less than 10% of our business, was up 1% in Q4 versus a flat growth last year.
Most research labs continue to remain open globally and increase capacity to pre-pandemic levels.
China came in at low-single digits, while the Americas and Europe were roughly flat.
For the year, we grew 7% after declining 4% last year.
We expect this market will continue to improve slightly in fiscal year 2022 and expect growth of low to mid-single digits.
Food was flat during the quarter against a very tough 16% compare.
Europe and the Americas grew while China declined.
For the year, food grew 13% after growing 7% in 2020.
Looking forward, we expect food to return to historical growth rates in the low-single digits.
And rounding out the markets, Environmental and Forensics declined 2% in the fourth quarter off of 5% decline last year as growth in Environmental was overshadowed by a decline in Forensics.
For the year, we grew 5%, off a 2% decline in 2020.
And looking forward, like Food, we expect Environmental and Forensics to grow in the low-single digits in the coming year.
For Agilent overall, on a geographic basis, all regions again grew in Q4, led by Americas at 15% China grew 8% in Europe grew 4%.
And for the year, Americas led the way with 21% growth, followed by China at 13% and Europe at 12%.
Now let's turn to the rest of the P&L.
Fourth quarter gross margin was 55.9%, up 90 basis points from a year ago.
Gross margin performance, along with continued operating expense leverage, resulted in an operating margin for the fourth quarter of 26.5%, improving 160 basis points over last year.
Putting it all together, we delivered earnings per share of $1.21, up 23% versus last year.
And during the quarter, we benefited from some additional tax savings, resulting in a quarterly tax rate of 13% and our full year tax rate was 14.25%.
Our share count was 305 million shares as expected.
And for the year, earnings per share came in at $4.34, an increase of 32% from 2020.
We continued our strong cash flow generation, resulting in $441 million for the quarter, an increase of 17% versus last year.
For all of 2021, we generated almost $1.5 billion in operating cash and invested $188 million in capital expenditures.
During the quarter, we returned $195 million to our shareholders paying out $59 million in dividends and repurchasing roughly 830,000 shares for $136 million.
And for the year, we returned over $1 billion to shareholders in the forms of dividends and share repurchases.
And we ended the year with $1.5 billion in cash and $2.7 billion in outstanding debt and a net leverage ratio of 0.7 times.
All in all, a great end to an outstanding year.
Now let's move on to the outlook for fiscal 2022.
While we are still dealing with the pandemic and we have the additional challenges around logistics and inflationary pressures, we enter the year with strong backlog and momentum.
For the full year, we're expecting revenue to range between $6.65 billion and $6.73 billion, representing reported growth of 5% to 6.5% and core growth of 5.5% to 7%, consistent with our long-range goals.
And this incorporates absorbing roughly 0.5% headwind associated with COVID-related revenues with the majority of that impact coming in Q1.
We're expecting all three of our businesses to grow, led by DGG.
We expect DGG to grow high-single digits with the continued contribution of NASD in cancer diagnostics.
We expect ACG to grow at high-single digits with both services in our chemistries and supplies businesses growing comparably while LSAG is expected to grow in mid-single digits.
We expect operating margin expansion of 60 to 80 basis points for the year as we absorb the build-out costs of Train B at our Frederick, Colorado NASD site.
And in helping you build out your models, we're planning for a tax rate of 14.25%, consistent with current tax policies and $305 million fully diluted shares outstanding.
All this translates to a fiscal 2022 non-GAAP earnings per share expected to be between $4.76 to $4.86 per share, resulting in double-digit growth.
And finally, we expect operating cash flow of approximately $1.4 billion to $1.5 billion and capital expenditures of $300 million.
This capital investment represents an increase over 2021 as we continue our focus on growth, bringing our NASD Train B expansion online and expanding consumables manufacturing capacity for our Cell Analysis and Genomics businesses.
We have also announced raising our dividend by 8%, continuing an important streak of dividend increases and providing another source of value to our shareholders.
Now let's move on to our first quarter guidance.
But before I get into the specifics, some additional context.
Lunar New Year is February 1 this year, a shift from last year when it was in mid-February.
As a result, we expect some Q1 revenue to shift to the second quarter of this year as customers shut down ahead of the holiday.
In addition, as I mentioned, we do expect to see the largest impact of COVID-related revenue headwinds in the first quarter.
We estimate these two factors will impact our base business growth by 2 to 3 points and roughly equal in impact.
For Q1, we are expecting revenue to range from $1.64 billion to $1.66 billion, representing reported and core growth of 5.9% to 7.2%.
Adjusting for the timing of Lunar New Year and COVID-related headwinds, core growth would be roughly 8% to 10% in the quarter.
First quarter 2022 non-GAAP earnings are expected to be in the range of $1.16 to $1.18.
In conjunction with the new One Agilent commercial organization Mike talked about, we will be reporting under the new structure starting in Q1.
In addition, we'll be providing a recast of certain LSAG and ACG historical financials to account for the segment changes after the filing of our Annual Report on Form 10-K in December.
I am extremely proud of what the Agilent team achieved in 2021 and look forward to another strong performance in 2022.
With that for me, back to you for Q&A.
Bethany, if you could please provide instructions for the Q&A now.
"""
# Post-process to remove redundancy and shorten
def refine_summary(summary):
    sentences = summary.split(". ")
    seen = set()
    refined_sentences = []
    for sentence in sentences:
        if sentence and sentence not in seen:
            seen.add(sentence)
            refined_sentences.append(sentence)
    refined_summary = ". ".join(refined_sentences)
    if len(refined_summary.split()) > 75:  # Limit to ~75 words
        refined_summary = " ".join(refined_summary.split()[:75])
    return refined_summary.strip() + (". " if not refined_summary.endswith(".") else "")



# Split the transcript into chunks of ~400 words
def split_text(text, max_words=400):
    words = text.split()
    chunks = []
    current_chunk = []
    word_count = 0
    for word in words:
        current_chunk.append(word)
        word_count += 1
        if word_count >= max_words:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            word_count = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Split the transcript
chunks = split_text(test_transcript)

# Summarize each chunk
chunk_summaries = []
for chunk in chunks:
    inputs = tokenizer("summarize: " + chunk, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=150,
        min_length=30,
        num_beams=4,
        no_repeat_ngram_size=2,  # Prevent repetition
        early_stopping=False
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    if summary.strip():  # Only add non-empty summaries
        chunk_summaries.append(summary)

# Combine chunk summaries
final_summary = " ".join(chunk_summaries)
refined_summary = refine_summary(final_summary)
print(f"Refined Summary: {refined_summary}")

In [None]:
!pip install pinecone sentence-transformers pandas

In [None]:
# Uninstall pinecone-client
!pip uninstall -y pinecone-client

# Ensure pinecone is installed
!pip install pinecone --upgrade

# Verify the installed package
!pip show pinecone

In [None]:

# Import libraries
import pinecone
from sentence_transformers import SentenceTransformer
import pandas as pd

# Initialize Pinecone client
client = pinecone.Pinecone(api_key="pcsk_3z4iTz_BoTpE3CrVJVTUvF37hWHeN5Eo6XnosKTjidbKsvb9FotdUETgUnDNpXqHkCFVN9")

# Define index name
index_name = "financial-docs"

# Create index if it doesn't exist
if index_name not in client.list_indexes().names():
    client.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=pinecone.ServerlessSpec(cloud="aws", region="us-east-1")
    )

# Connect to the index
index = client.Index(index_name)

# Load dataset
df = pd.read_csv("/content/financial_data.csv")

# Load embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings
embeddings = embedder.encode(df["transcript"].tolist(), show_progress_bar=True)

# Prepare vectors with truncated metadata
ids = [str(i) for i in range(len(df))]
vectors = [(ids[i], embeddings[i].tolist(), {"text": df["transcript"][i][:500]}) for i in range(len(df))]  # Limit to 500 characters

# Batch upsert
batch_size = 50  # Can increase batch size since metadata is smaller
for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i + batch_size]
    index.upsert(vectors=batch)
    print(f"Uploaded batch {i // batch_size + 1} of {len(vectors) // batch_size + 1}")


print("Documents stored in Pinecone!")
# Save transcript mapping
df["pinecone_id"] = ids
df[["pinecone_id", "transcript"]].to_csv("/content/transcript_mapping.csv", index=False)
print("Transcript mapping saved to /content/transcript_mapping.csv")

# Save to Google Drive
from google.colab import drive
drive.mount('/content/drive')
!cp /content/transcript_mapping.csv /content/drive/MyDrive/transcript_mapping.csv
print("Transcript mapping copied to Google Drive!")

# Test Pinecone retrieval
queries = [
    "What is the revenue growth for Pharma?",
    "Pharma grew 21% in Q4 2021, driven by Biopharma.",
    "How did the pharmaceutical sector perform?"
]

for query in queries:
    query_embedding = embedder.encode([query])[0].tolist()
    results = index.query(vector=query_embedding, top_k=3, include_metadata=True)
    print(f"\nQuery: {query}")
    for match in results['matches']:
        print(f"Score: {match['score']}, ID: {match['id']}, Text: {match['metadata']['text'][:100]}...")

# Step 5: Combined Search and Summarization System

# Load the fine-tuned T5 model (after re-uploading or retraining)
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("/content/t5_finetuned")
model = T5ForConditionalGeneration.from_pretrained("/content/t5_finetuned")

# Load transcript mapping
mapping = pd.read_csv("/content/transcript_mapping.csv")

# Search and summarize
query = "What is the revenue growth for Pharma?"
query_embedding = embedder.encode([query])[0].tolist()
results = index.query(vector=query_embedding, top_k=1, include_metadata=True)

if results['matches']:
    top_id = results['matches'][0]['id']
    top_transcript = mapping[mapping["pinecone_id"] == top_id]["transcript"].values[0]
    inputs = tokenizer("summarize: " + top_transcript, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=100,
        min_length=20,
        num_beams=6,
        no_repeat_ngram_size=3,
        length_penalty=1.0,
        early_stopping=False
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print(f"\nQuery: {query}")
    print(f"Summary: {summary}")
else:
    print("No matching transcripts found.")

In [None]:
query = "What is the revenue growth for Pharma?"
query_embedding = embedder.encode([query])[0].tolist()
results = index.query(vector=query_embedding, top_k=3, include_metadata=True)
for match in results['matches']:
    print(f"Score: {match['score']}, Text: {match['metadata'].get('text', 'No metadata')[:100]}...")

In [None]:
!zip -r /content/t5_finetuned.zip /content/t5_finetuned

In [None]:
# Step 1: Upload necessary files
from google.colab import files

# Upload t5_finetuned.zip
print("Upload t5_finetuned.zip:")
uploaded = files.upload()
!unzip /content/t5_finetuned.zip -d /content
print("t5_finetuned directory restored!")

# Upload financial_data.csv and transcript_mapping.csv
print("Upload financial_data.csv and transcript_mapping.csv:")
uploaded = files.upload()
print("Files uploaded!")

# Step 2: Install dependencies
!pip install pinecone sentence-transformers transformers pandas gradio
print("Dependencies installed!")

# Step 3: Rebuild or reconnect to the Pinecone index
import pinecone
from sentence_transformers import SentenceTransformer
import pandas as pd

# Initialize Pinecone client
client = pinecone.Pinecone(api_key="pcsk_3z4iTz_BoTpE3CrVJVTUvF37hWHeN5Eo6XnosKTjidbKsvb9FotdUETgUnDNpXqHkCFVN9")

# Define index name
index_name = "financial-docs"

# Check if the index exists; if not, create it
if index_name not in client.list_indexes().names():
    print(f"Index {index_name} does not exist. Creating it...")
    client.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=pinecone.ServerlessSpec(cloud="aws", region="us-east-1")
    )

    # Connect to the index
    index = client.Index(index_name)

    # Load dataset
    df = pd.read_csv("/content/financial_data.csv")

    # Load embedding model
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    # Generate embeddings
    embeddings = embedder.encode(df["transcript"].tolist(), show_progress_bar=True)

    # Prepare vectors with truncated metadata
    ids = [str(i) for i in range(len(df))]
    vectors = [(ids[i], embeddings[i].tolist(), {"text": df["transcript"][i][:500]}) for i in range(len(df))]

    # Batch upsert
    batch_size = 50
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i + batch_size]
        index.upsert(vectors=batch)
        print(f"Uploaded batch {i // batch_size + 1} of {len(vectors) // batch_size + 1}")

    print("Documents stored in Pinecone!")
else:
    print(f"Index {index_name} already exists. Connecting to it...")
    index = client.Index(index_name)

# Step 4: Recreate the Gradio UI
import gradio as gr
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Connect to the Pinecone index (already connected, but ensuring it's available)
index = client.Index("financial-docs")

# Load the fine-tuned T5 model
tokenizer = T5Tokenizer.from_pretrained("/content/t5_finetuned")
model = T5ForConditionalGeneration.from_pretrained("/content/t5_finetuned")

# Load transcript mapping
mapping = pd.read_csv("/content/transcript_mapping.csv")

# Define the search and summarize function for the UI
def search_and_summarize(query):
    # Search Pinecone
    query_embedding = embedder.encode([query])[0].tolist()
    results = index.query(vector=query_embedding, top_k=1, include_metadata=True)

    if not results['matches']:
        return "No matching transcripts found.", ""

    # Retrieve the top result
    top_match = results['matches'][0]
    top_id = top_match['id']
    score = top_match['score']
    truncated_text = top_match['metadata']['text']

    # Get the full transcript
    top_transcript = mapping[mapping["pinecone_id"] == top_id]["transcript"].values[0]

    # Generate summary
    inputs = tokenizer("summarize: " + top_transcript, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=100,
        min_length=20,
        num_beams=6,
        no_repeat_ngram_size=3,
        length_penalty=1.0,
        early_stopping=False
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Format the output
    search_result = f"**Top Matching Transcript (ID: {top_id}, Score: {score:.2f})**\n{truncated_text}..."
    return search_result, summary

# Build the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Financial Report Summarizer")
    gr.Markdown("Enter a query to search financial transcripts and get a summary.")

    with gr.Row():
        query_input = gr.Textbox(label="Query", placeholder="e.g., What is the revenue growth for Pharma?")
        submit_button = gr.Button("Search and Summarize")

    with gr.Row():
        with gr.Column():
            search_output = gr.Textbox(label="Search Result (Top Matching Transcript)")
        with gr.Column():
            summary_output = gr.Textbox(label="Summary")

    # Connect the button to the function
    submit_button.click(
        fn=search_and_summarize,
        inputs=query_input,
        outputs=[search_output, summary_output]
    )

# Launch the UI
demo.launch()

Upload t5_finetuned.zip:


KeyboardInterrupt: 

In [None]:
import zipfile

def create_zip(file_path, zip_file_path):
    """
    Creates a zip file containing the file specified by file_path.

    Args:
        file_path: The path to the file to be zipped.
        zip_file_path: The path to the output zip file.
    """
    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(file_path)
file_path = 't5_finetuned'
zip_file_path = 't5_finetuned_weights.zip'
create_zip(file_path, zip_file_path)

In [None]:
import shutil
import os

def zip_folder(folder_path, output_path):
    """
    Zips a folder and its contents.

    Args:
        folder_path (str): The path to the folder to be zipped.
        output_path (str): The path and name of the output zip file (without extension).
    """
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Folder not found: {folder_path}")

    shutil.make_archive(output_path, 'zip', folder_path)
    print(f"Folder '{folder_path}' zipped to '{output_path}.zip'")

# Example usage
folder_to_zip = 't5_finetuned' # Replace with your folder path
output_zip_name = 't5_finetuned_weights.zip' # Replace with your desired output name

# Create a dummy folder and file for testing
os.makedirs(folder_to_zip, exist_ok=True)
with open(os.path.join(folder_to_zip, 'test.txt'), 'w') as f:
    f.write('This is a test file.')

zip_folder(folder_to_zip, output_zip_name)

In [None]:
!ls -lh t5_finetuned.zip

-rw-r--r-- 1 root root 2.1G May 20 01:59 t5_finetuned.zip


In [None]:
print("Contents of checkpoint-970:")
!ls -lh /content/t5_finetuned/checkpoint-970
print("\nContents of checkpoint-1940:")
!ls -lh /content/t5_finetuned/checkpoint-1940
print("\nContents of checkpoint-2910:")
!ls -lh /content/t5_finetuned/checkpoint-2910

In [None]:
# Check the true size of the directory
print("True size of t5_finetuned directory (including subdirectories):")
!du -sh /content/t5_finetuned

# Remove unnecessary files
!rm -rf /content/t5_finetuned/checkpoint-*  # Remove checkpoints
!rm -rf /content/t5_finetuned/runs          # Remove training logs (if any)
!rm /content/t5_finetuned/test.txt          # Remove test.txt

# Verify the cleaned directory
print("\nContents of t5_finetuned after cleanup:")
!ls -lh /content/t5_finetuned
!du -sh /content/t5_finetuned

# Recompress the cleaned directory
!zip -9 -r /content/t5_finetuned_cleaned.zip /content/t5_finetuned
print("t5_finetuned directory recompressed into t5_finetuned_cleaned.zip!")

# Verify the new ZIP file size
!ls -lh /content/t5_finetuned_cleaned.zip

# Save to Google Drive
from google.colab import drive
drive.mount('/content/drive')
!cp /content/t5_finetuned_cleaned.zip /content/drive/MyDrive/t5_finetuned_cleaned.zip
print("t5_finetuned_cleaned.zip copied to Google Drive!")

True size of t5_finetuned directory (including subdirectories):
2.3G	/content/t5_finetuned
rm: cannot remove '/content/t5_finetuned/test.txt': No such file or directory

Contents of t5_finetuned after cleanup:
total 232M
-rw-r--r-- 1 root root 2.6K May 20 01:56 added_tokens.json
-rw-r--r-- 1 root root 1.5K May 20 01:56 config.json
-rw-r--r-- 1 root root  142 May 20 01:56 generation_config.json
-rw-r--r-- 1 root root 231M May 20 01:56 model.safetensors
-rw-r--r-- 1 root root 2.5K May 20 01:56 special_tokens_map.json
-rw-r--r-- 1 root root 774K May 20 01:56 spiece.model
-rw-r--r-- 1 root root  21K May 20 01:56 tokenizer_config.json
232M	/content/t5_finetuned
  adding: content/t5_finetuned/ (stored 0%)
  adding: content/t5_finetuned/model.safetensors