In [5]:
import pandas as pd

In [6]:
df = pd.read_csv('Structured_Meeting_Notes.csv')
df.head()

Unnamed: 0,date,speaker,note,project,action_item,deadline
0,2025-03-28,Jordan,[Project: AI assistant integration] Jordan agr...,AI assistant integration,Set up Airflow DAGs,Friday
1,2025-04-04,Jordan,[Project: Internal documentation cleanup] Jord...,Internal documentation cleanup,Test Gemini with internal queries,Friday
2,2025-03-24,Jordan,[Project: Data pipeline improvement] Jordan hi...,Data pipeline improvement,Organize guild workshop,Monday
3,2025-02-13,Omar,[Project: Knowledge sharing guilds] Omar highl...,Knowledge sharing guilds,Summarize feedback for product,Thursday
4,2025-03-27,Anaïs,[Project: Dashboard redesign] Anaïs requested ...,Dashboard redesign,Summarize feedback for product,Thursday


### Check for duplicates

In [7]:
df_clean = df.dropna(subset=['note'])


### Capital letter for strings in ```speaker``` and ```project```

In [8]:
df_clean['speaker'] = df_clean['speaker'].str.title()
df_clean['project'] = df_clean['project'].str.title()
df_clean.head()

Unnamed: 0,date,speaker,note,project,action_item,deadline
0,2025-03-28,Jordan,[Project: AI assistant integration] Jordan agr...,Ai Assistant Integration,Set up Airflow DAGs,Friday
1,2025-04-04,Jordan,[Project: Internal documentation cleanup] Jord...,Internal Documentation Cleanup,Test Gemini with internal queries,Friday
2,2025-03-24,Jordan,[Project: Data pipeline improvement] Jordan hi...,Data Pipeline Improvement,Organize guild workshop,Monday
3,2025-02-13,Omar,[Project: Knowledge sharing guilds] Omar highl...,Knowledge Sharing Guilds,Summarize feedback for product,Thursday
4,2025-03-27,Anaïs,[Project: Dashboard redesign] Anaïs requested ...,Dashboard Redesign,Summarize feedback for product,Thursday


### Truncate text to avoid overflow or confuse the model

In [9]:
df_clean['note_short'] = df_clean['note'].apply(lambda x: x[:300])


### Map the day of the week to the actual deadline date in datetime

In [10]:
from datetime import datetime, timedelta

def convert_weekday_to_date(row):
    weekdays = {
        "Monday": 0, "Tuesday": 1, "Wednesday": 2,
        "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6
    }
    try:
        note_date = datetime.strptime(row["date"], "%Y-%m-%d") + timedelta(days=1)  # Shift by 1 day
        target_weekday = weekdays.get(row["deadline"])
        if target_weekday is None:
            return None
        days_ahead = (target_weekday - note_date.weekday()) % 7
        return (note_date + timedelta(days=days_ahead)).strftime("%Y-%m-%d")
    except:
        return None

df_clean.head()


Unnamed: 0,date,speaker,note,project,action_item,deadline,note_short
0,2025-03-28,Jordan,[Project: AI assistant integration] Jordan agr...,Ai Assistant Integration,Set up Airflow DAGs,Friday,[Project: AI assistant integration] Jordan agr...
1,2025-04-04,Jordan,[Project: Internal documentation cleanup] Jord...,Internal Documentation Cleanup,Test Gemini with internal queries,Friday,[Project: Internal documentation cleanup] Jord...
2,2025-03-24,Jordan,[Project: Data pipeline improvement] Jordan hi...,Data Pipeline Improvement,Organize guild workshop,Monday,[Project: Data pipeline improvement] Jordan hi...
3,2025-02-13,Omar,[Project: Knowledge sharing guilds] Omar highl...,Knowledge Sharing Guilds,Summarize feedback for product,Thursday,[Project: Knowledge sharing guilds] Omar highl...
4,2025-03-27,Anaïs,[Project: Dashboard redesign] Anaïs requested ...,Dashboard Redesign,Summarize feedback for product,Thursday,[Project: Dashboard redesign] Anaïs requested ...


In [11]:
df_clean["deadline_date"] = df_clean.apply(convert_weekday_to_date, axis=1)
df_clean["deadline_date"] = pd.to_datetime(df_clean["deadline_date"])
df_clean["date"] = pd.to_datetime(df_clean["date"])



In [12]:
df_clean["days_until_deadline"] = (df_clean["deadline_date"] - df_clean["date"]).dt.days
df_clean.head()

Unnamed: 0,date,speaker,note,project,action_item,deadline,note_short,deadline_date,days_until_deadline
0,2025-03-28,Jordan,[Project: AI assistant integration] Jordan agr...,Ai Assistant Integration,Set up Airflow DAGs,Friday,[Project: AI assistant integration] Jordan agr...,2025-04-04,7
1,2025-04-04,Jordan,[Project: Internal documentation cleanup] Jord...,Internal Documentation Cleanup,Test Gemini with internal queries,Friday,[Project: Internal documentation cleanup] Jord...,2025-04-11,7
2,2025-03-24,Jordan,[Project: Data pipeline improvement] Jordan hi...,Data Pipeline Improvement,Organize guild workshop,Monday,[Project: Data pipeline improvement] Jordan hi...,2025-03-31,7
3,2025-02-13,Omar,[Project: Knowledge sharing guilds] Omar highl...,Knowledge Sharing Guilds,Summarize feedback for product,Thursday,[Project: Knowledge sharing guilds] Omar highl...,2025-02-20,7
4,2025-03-27,Anaïs,[Project: Dashboard redesign] Anaïs requested ...,Dashboard Redesign,Summarize feedback for product,Thursday,[Project: Dashboard redesign] Anaïs requested ...,2025-04-03,7


In [13]:
df_clean["llm_input"] = df_clean.apply(
    lambda row: f"Meeting on {row['date']} about '{row['project']}' led by {row['speaker']}: {row['note_short']}",
    axis=1
)

df_clean["llm_input"].head()

0    Meeting on 2025-03-28 00:00:00 about 'Ai Assis...
1    Meeting on 2025-04-04 00:00:00 about 'Internal...
2    Meeting on 2025-03-24 00:00:00 about 'Data Pip...
3    Meeting on 2025-02-13 00:00:00 about 'Knowledg...
4    Meeting on 2025-03-27 00:00:00 about 'Dashboar...
Name: llm_input, dtype: object

In [14]:
import re

def clean_note(text):
    return re.sub(r'\[Project: .*?\]\s*', '', text)


df_clean['note_clean'] = df_clean['note_short'].apply(clean_note) 

df_clean['note_clean'].head()

0    Jordan agreed on implementing the ai assistant...
1    Jordan raised concerns about the internal docu...
2    Jordan highlighted the need for the data pipel...
3    Omar highlighted the need for the knowledge sh...
4    Anaïs requested a report on the dashboard rede...
Name: note_clean, dtype: object

In [1]:
from transformers import pipeline

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

df_clean["summary"] = df_clean["note_clean"].apply(
    lambda x: summarizer(x, max_length=40, min_length=10, do_sample=False)[0]['summary_text']
)

df_clean[["note_clean", "summary"]].head()





  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'init_empty_weights' is not defined

In [2]:
!pip install accelerate


Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.6.0-py3-none-any.whl (354 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.6.0


In [20]:
!pip install torch


Collecting torch
  Downloading torch-2.6.0-cp310-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy==1.13.1 (from torch)
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Downloading MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (4.0 kB)
Downloading torch-2.6.0-cp310-none-macosx_11_0_arm64.whl (66.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
Downloading networkx-3.4.2-py3-none-any.whl (1.7 MB)


In [17]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.51.0-py3-none-any.whl.metadata (38 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.30.1-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting tqdm>=4.27 (from transformers)
 