<a href="https://colab.research.google.com/github/pavanudhay/AI-Powered-Meetings-to-Minutes-Generator/blob/main/AI_Powered_Meetings_to_Minutes_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install spacy transformers torch textwrap
!python -m spacy download en_core_web_sm

[31mERROR: Could not find a version that satisfies the requirement textwrap (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for textwrap[0m[31m
[0mCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import datetime
from typing import List, Dict, Optional
from dataclasses import dataclass
import spacy
from transformers import pipeline
from collections import defaultdict
import textwrap
import torch

@dataclass
class DiscussionPoint:
    """One topic segment of a meeting and the items extracted from it."""
    # Topic label assigned to this segment of the meeting text.
    topic: str
    # Sentences from the segment that were flagged as key points.
    key_points: List[str]
    # Sentences from the segment that were flagged as decisions.
    decisions: List[str]
    # One dict per action item with the keys "action", "assignee" and
    # "deadline"; assignee and deadline are None when none was detected.
    action_items: List[Dict[str, Optional[str]]]
    # Distinct PERSON entity names found in the segment.
    participants: List[str]

@dataclass
class MeetingMinutes:
    """Complete minutes for one meeting: metadata, per-topic detail and summary."""
    # Human-readable meeting title supplied by the caller.
    title: str
    # Date/time recorded for the meeting.
    date: datetime.datetime
    # People detected in the meeting text.
    attendees: List[str]
    # Per-topic breakdown of the meeting.
    discussion_points: List[DiscussionPoint]
    # Decisions collected across all discussion points, de-duplicated.
    key_decisions: List[str]
    # Machine-generated summary of the whole meeting text.
    summary: str

class MeetingMinutesGenerator:
    def __init__(self):
        """Load the NLP models used to build meeting minutes.

        Sets up a spaCy English pipeline (``self.nlp``), a BART summarization
        pipeline (``self.summarizer``) and a zero-shot classification pipeline
        (``self.classifier``). The transformers pipelines are placed on CUDA
        device 0 when CUDA is available and on the CPU otherwise.
        """
        # transformers pipelines take a device index: 0 = first GPU, -1 = CPU.
        self.device = 0 if torch.cuda.is_available() else -1
        self.nlp = spacy.load("en_core_web_sm")
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=self.device)
        # Name the zero-shot model explicitly rather than relying on the
        # library default, so results do not shift if that default changes.
        self.classifier = pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli",
            device=self.device,
        )

    def extract_attendees(self, text: str) -> List[str]:
        doc = self.nlp(text)
        attendees = []
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                next_text = doc[ent.end:ent.end + 10].text if ent.end + 10 < len(doc) else doc[ent.end:].text
                if '(' in next_text and ')' in next_text:
                    role = next_text[next_text.find('(')+1:next_text.find(')')]
                    attendees.append(f"{ent.text} ({role})")
                else:
                    attendees.append(ent.text)
        return list(set(attendees))

    def extract_key_points(self, text: str) -> List[str]:
        doc = self.nlp(text)
        key_points = []
        importance_indicators = [
            "highlighted", "emphasized", "discussed", "presented",
            "reported", "mentioned", "suggested", "proposed",
            "identified", "noted", "shared", "explained", "provided",
            "update", "raised", "reviewed"
        ]
        for sent in doc.sents:
            if any(indicator in sent.text.lower() for indicator in importance_indicators):
                key_points.append(sent.text.strip())
            if sent.text.strip().startswith(('-', '•', '*')) or \
               any(sent.text.strip().startswith(f"{i}.") for i in range(1, 10)):
                key_points.append(sent.text.strip())
        return key_points

    def extract_decisions(self, text: str) -> List[str]:
        doc = self.nlp(text)
        decisions = []
        decision_indicators = [
            "decided", "agreed", "approved", "concluded",
            "resolved", "confirmed", "finalized", "determined"
        ]
        for sent in doc.sents:
            if any(indicator in sent.text.lower() for indicator in decision_indicators):
                decisions.append(sent.text.strip())
        return decisions

    def extract_action_items(self, text: str) -> List[Dict[str, str]]:
        doc = self.nlp(text)
        action_items = []
        action_indicators = ["will", "should", "needs to", "must", "to do", "assigned to"]
        for sent in doc.sents:
            for indicator in action_indicators:
                if indicator in sent.text.lower():
                    item = {
                        "action": sent.text.strip(),
                        "assignee": None,
                        "deadline": None
                    }
                    for ent in sent.ents:
                        if ent.label_ == "PERSON":
                            item["assignee"] = ent.text
                            break
                    for ent in sent.ents:
                        if ent.label_ == "DATE":
                            item["deadline"] = ent.text
                            break
                    action_items.append(item)
                    break
        return action_items

    def identify_topics(self, text: str) -> List[Dict[str, str]]:
        sentences = [sent.text.strip() for sent in self.nlp(text).sents]
        topic_categories = [
            "project status", "technical discussion", "planning",
            "risk assessment", "resource allocation", "timeline",
            "decisions", "action items", "next steps", "updates",
            "design", "development", "marketing", "review"
        ]
        topics = []
        current_topic = ""
        current_content = []
        for sentence in sentences:
            result = self.classifier(
                sentence,
                candidate_labels=topic_categories,
                multi_label=False
            )
            if result["scores"][0] > 0.6:
                if current_topic and current_content:
                    topics.append({
                        "topic": current_topic,
                        "content": " ".join(current_content)
                    })
                current_topic = result["labels"][0]
                current_content = [sentence]
            else:
                current_content.append(sentence)
        if current_topic and current_content:
            topics.append({
                "topic": current_topic,
                "content": " ".join(current_content)
            })
        if not topics:
            topics.append({
                "topic": "general discussion",
                "content": text
            })
        return topics

    def generate_summary(self, text: str) -> str:
        max_chunk_length = 1024
        chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
        summaries = []
        for chunk in chunks:
            summary = self.summarizer(chunk, max_length=150, min_length=50, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        return " ".join(summaries)

    def generate_minutes(self, text: str, title: str, date: Optional[datetime.datetime] = None) -> MeetingMinutes:
        """Build structured minutes from raw meeting text.

        Extracts attendees, splits the text into topic segments, summarizes
        the whole text and, for every segment, collects its key points,
        decisions, action items and participants.

        Args:
            text: Raw meeting notes or transcript.
            title: Title to record on the minutes.
            date: Meeting date; the current local time is used when omitted.

        Returns:
            A ``MeetingMinutes`` instance. Participants and key decisions are
            de-duplicated and listed in order of first appearance.
        """
        if date is None:
            date = datetime.datetime.now()
        attendees = self.extract_attendees(text)
        topics = self.identify_topics(text)
        summary = self.generate_summary(text)
        discussion_points = []
        key_decisions = []
        for topic in topics:
            content = topic["content"]
            key_points = self.extract_key_points(content)
            decisions = self.extract_decisions(content)
            action_items = self.extract_action_items(content)
            key_decisions.extend(decisions)
            participants = [
                ent.text for ent in self.nlp(content).ents if ent.label_ == "PERSON"
            ]
            discussion_points.append(DiscussionPoint(
                topic=topic["topic"],
                key_points=key_points,
                decisions=decisions,
                action_items=action_items,
                # dict.fromkeys de-duplicates while keeping first-seen order;
                # a set would make the order differ from run to run.
                participants=list(dict.fromkeys(participants))
            ))
        return MeetingMinutes(
            title=title,
            date=date,
            attendees=attendees,
            discussion_points=discussion_points,
            key_decisions=list(dict.fromkeys(key_decisions)),
            summary=summary
        )

    def optimize_memory(self):
        import gc
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

def format_text(text, width=80, initial_indent='', subsequent_indent='    '):
    """Wrap ``text`` to ``width`` columns and return it as one string.

    Args:
        text: Text to wrap.
        width: Maximum line length, indentation included.
        initial_indent: Prefix for the first output line.
        subsequent_indent: Prefix for every following output line.

    Returns:
        The wrapped lines joined with newlines. Whitespace characters already
        in ``text`` are not converted to spaces, and words longer than a line
        are broken.
    """
    wrapper = textwrap.TextWrapper(
        width=width,
        initial_indent=initial_indent,
        subsequent_indent=subsequent_indent,
        replace_whitespace=False,
        break_long_words=True,
    )
    return wrapper.fill(text)


In [3]:
# Sample meeting notes used to demonstrate the generator end to end.
meeting_text = """
The meeting began at 10:00 AM, and the minutes from the previous meeting were reviewed and approved without any changes.

Action items from the last meeting were discussed: Sarah completed the project timeline revision, James worked on the feature integration, and Emma delivered the mockups for the next iteration.

Sarah provided an update on the current project, noting that the project is on track despite a small delay with the third-party API integration, which is not expected to affect the overall timeline. Sarah will follow up with the third-party team for an updated integration timeline.

James shared that back-end development for new features has been completed, and front-end work is set to begin next week. He will coordinate with Emma on the UI changes for the front-end development.

Emma presented the new design mockups for the user dashboard, and feedback was received. She will implement the necessary changes and finalize the designs by the end of the week, after which she will send them to the development team.

Michael provided an update on the marketing strategy for the product launch. The social media campaigns are scheduled to start in two weeks, and the draft for the blog post, focusing on the key features of the product, has been prepared. Michael will finalize the marketing copy and share it with the team for feedback.

John raised the issue of potentially extending the project timeline by an additional week due to the delay with the third-party API. The team agreed to reassess the timeline after the next meeting.

Action items for the upcoming week include Sarah following up with the third-party API team, James coordinating with Emma on front-end development, Emma finalizing the design adjustments, and Michael finalizing the marketing copy.

The next meeting will be held on March 13, 2025, at 10:00 AM. The meeting was adjourned at 11:00 AM.
"""

# Constructing the generator loads the spaCy pipeline and both transformers
# pipelines, so this line is the slow part of the cell.
generator = MeetingMinutesGenerator()
# No date is passed, so the minutes are stamped with the current time.
minutes = generator.generate_minutes(
    text=meeting_text,
    title="Project Status Meeting"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [4]:
# Print the generated minutes to stdout: header, attendees, summary, then each
# discussion point with its key points, decisions and action items.

# Wrapping options shared by every bulleted entry.
bullet_style = {"width": 75, "initial_indent": '- ', "subsequent_indent": '  '}

print(f"Meeting: {minutes.title}")
print(f"Date: {minutes.date}")

print("\nAttendees:")
for name in minutes.attendees:
    print(f"- {name}")

print("\nSummary:")
print(format_text(minutes.summary, width=80))

print("\nDiscussion Points:")
for section in minutes.discussion_points:
    print(f"\nTopic: {section.topic}")
    print("Key Points:")
    for key_point in section.key_points:
        print(format_text(key_point, **bullet_style))

    if section.decisions:
        print("Decisions:")
        for section_decision in section.decisions:
            print(format_text(section_decision, **bullet_style))

    if section.action_items:
        print("Action Items:")
        for item in section.action_items:
            # Fall back to placeholder text when nothing was detected.
            owner = item["assignee"] or "Unassigned"
            due = item["deadline"] or "No deadline"
            entry = f"{item['action']} (Assignee: {owner}, Deadline: {due})"
            print(format_text(entry, **bullet_style))

print("\nKey Decisions:")
for key_decision in minutes.key_decisions:
    print(format_text(key_decision, **bullet_style))


Meeting: Project Status Meeting
Date: 2025-05-08 06:46:21.987446

Attendees:
- Sarah
- John
- James
- Michael
- Emma

Summary:
The meeting began at 10:00 AM, and the minutes from the previous meeting were
    reviewed and approved without any changes. Action items from the last
    meeting were discussed: Sarah completed the project timeline revision, James
    worked on the feature integration, and Emma delivered the mockups for the
    next iteration. Michael provided an update on the marketing strategy for the
    product launch. John raised the issue of potentially extending the project
    timeline by an additional week due to the delay with the third-party API.
    The next meeting will be held on March 13, 2025, at 10:00 AM.

Discussion Points:

Topic: general discussion
Key Points:
- The meeting began at 10:00 AM, and the minutes from the previous meeting
  were reviewed and approved without any changes.
- Action items from the last meeting were discussed: Sarah completed the
 

In [5]:
# Write the generated minutes to 'meeting_minutes.txt' using the same layout
# as the on-screen report.

# Wrapping options shared by every bulleted entry.
bullet_style = {"width": 75, "initial_indent": '- ', "subsequent_indent": '  '}

# An explicit UTF-8 encoding keeps non-ASCII characters (bullets, accented
# names) from failing or being mangled under a different platform default.
with open('meeting_minutes.txt', 'w', encoding='utf-8') as f:
    f.write(f"Meeting: {minutes.title}\n")
    f.write(f"Date: {minutes.date}\n")
    f.write("\nAttendees:\n")
    for attendee in minutes.attendees:
        f.write(f"- {attendee}\n")

    f.write("\nSummary:\n")
    f.write(format_text(minutes.summary, width=80) + "\n")

    f.write("\nDiscussion Points:\n")
    for point in minutes.discussion_points:
        f.write(f"\nTopic: {point.topic}\n")
        f.write("Key Points:\n")
        for kp in point.key_points:
            f.write(format_text(kp, **bullet_style) + "\n")

        if point.decisions:
            f.write("Decisions:\n")
            for decision in point.decisions:
                f.write(format_text(decision, **bullet_style) + "\n")

        if point.action_items:
            f.write("Action Items:\n")
            for action in point.action_items:
                # Fall back to placeholder text when nothing was detected.
                assignee = action["assignee"] or "Unassigned"
                deadline = action["deadline"] or "No deadline"
                action_text = f"{action['action']} (Assignee: {assignee}, Deadline: {deadline})"
                f.write(format_text(action_text, **bullet_style) + "\n")

    f.write("\nKey Decisions:\n")
    for decision in minutes.key_decisions:
        f.write(format_text(decision, **bullet_style) + "\n")

print("Meeting minutes saved to 'meeting_minutes.txt'")

Meeting minutes saved to 'meeting_minutes.txt'
