# Corporate Synergy Bot 7B - Training Notebook

This notebook fine-tunes a LoRA adapter on a 4-bit quantized Mistral-7B-Instruct-v0.3 model so that it can translate between casual language and corporate speak.

**Important**: Make sure to select a GPU runtime before running any cells (Runtime → Change runtime type → T4 GPU).

## 1. Install Dependencies

In [None]:
# Environment setup: report the GPU/CUDA toolchain, install the dependencies,
# then confirm from Python that PyTorch can see a CUDA device.
# Lines starting with "!" are notebook shell escapes, so this cell has to be
# run from Jupyter/Colab rather than as a plain Python script.

# Check GPU availability and CUDA version first
!nvidia-smi
!nvcc --version

# Install CUDA dependencies for Google Colab
# The index URL selects the CUDA 11.8 (cu118) PyTorch wheels.
# NOTE(review): confirm cu118 matches the CUDA version reported above.
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Install required packages with specific versions
# (tensorboard and huggingface-hub are intentionally or accidentally left
# unpinned; everything else is pinned to an exact release.)
!pip install -q transformers==4.36.2
!pip install -q datasets==2.14.7
!pip install -q peft==0.7.1
!pip install -q accelerate==0.25.0
!pip install -q bitsandbytes==0.41.3
!pip install -q tensorboard
!pip install -q huggingface-hub

# Verify CUDA installation
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
# Falls back to the literal string 'CPU' when no CUDA device is available.
print(f"Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Fri_Jun_14_16:44:19_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.6, V12.6.20
Build cuda_12.6.r12.6/compiler.34431801_0


## 2. Check GPU Availability

In [None]:
# Check GPU
!nvidia-smi

import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

## 3. Generate Corporate Speak Dataset

In [None]:
import random
from enum import Enum
from typing import List, Dict, Tuple
import json

class Domain(Enum):
    """Business domains that have their own vocabulary in ``DOMAIN_VOCAB``.

    The string values are turned into human-readable instruction text by the
    dataset generator (underscores replaced with spaces) and stored verbatim
    in each example's ``context``.
    """

    TECH_STARTUP = "tech_startup"
    CONSULTING = "consulting"
    FINANCE = "finance"
    HEALTHCARE = "healthcare"
    RETAIL = "retail"
    MANUFACTURING = "manufacturing"

class SeniorityLevel(Enum):
    """Speaker seniority tiers used as keys of ``SENIORITY_PATTERNS``.

    The member name is embedded in the generated instruction text and the
    integer value is stored in the example's ``context``.
    """

    JUNIOR = 1
    MID = 2
    SENIOR = 3
    EXECUTIVE = 4

# Corporate vocabulary by domain
# Maps every Domain member to three word lists ("nouns", "verbs", "phrases").
# CorporateDatasetGenerator._generate_domain_specific draws one item from each
# list to fill its response templates.
DOMAIN_VOCAB = {
    Domain.TECH_STARTUP: {
        "nouns": ["MVP", "pivot", "scale", "disruption", "iteration", "velocity", "sprint", "backlog"],
        "verbs": ["iterate", "pivot", "scale", "disrupt", "innovate", "optimize", "refactor"],
        "phrases": ["move fast and break things", "fail fast", "growth hack", "10x engineer"]
    },
    Domain.CONSULTING: {
        "nouns": ["deliverables", "framework", "methodology", "stakeholders", "engagement", "deck"],
        "verbs": ["leverage", "align", "synthesize", "cascade", "socialize", "operationalize"],
        "phrases": ["circle back", "touch base", "deep dive", "low-hanging fruit", "move the needle"]
    },
    Domain.FINANCE: {
        "nouns": ["ROI", "EBITDA", "runway", "burn rate", "valuation", "portfolio", "assets"],
        # NOTE: "optimize returns" is the only multi-word entry among the verbs.
        "verbs": ["monetize", "capitalize", "hedge", "diversify", "liquidate", "optimize returns"],
        "phrases": ["top line growth", "bottom line impact", "risk-adjusted returns", "market dynamics"]
    },
    Domain.HEALTHCARE: {
        "nouns": ["outcomes", "protocols", "compliance", "wellness", "interventions", "metrics"],
        "verbs": ["implement", "standardize", "coordinate", "integrate", "monitor", "assess"],
        "phrases": ["patient-centered care", "evidence-based practice", "quality metrics", "care coordination"]
    },
    Domain.RETAIL: {
        "nouns": ["touchpoints", "omnichannel", "conversion", "basket size", "footfall", "inventory"],
        "verbs": ["engage", "convert", "upsell", "personalize", "optimize", "streamline"],
        "phrases": ["customer journey", "seamless experience", "drive traffic", "enhance engagement"]
    },
    Domain.MANUFACTURING: {
        "nouns": ["throughput", "efficiency", "lean", "six sigma", "capacity", "yield", "supply chain"],
        "verbs": ["optimize", "streamline", "scale", "automate", "standardize", "implement"],
        "phrases": ["continuous improvement", "operational excellence", "just-in-time", "quality control"]
    }
}

# Seniority-based language patterns
# Maps every SeniorityLevel member to sentence "starters" plus two numeric
# knobs. Only "starters" is read by the code visible in this notebook;
# "confidence" and "jargon_density" are not referenced by any visible cell.
SENIORITY_PATTERNS = {
    SeniorityLevel.JUNIOR: {
        "starters": ["I think", "Maybe we could", "I was wondering if", "Would it be possible to"],
        "confidence": 0.6,
        "jargon_density": 0.3
    },
    SeniorityLevel.MID: {
        "starters": ["I recommend", "We should consider", "Based on my analysis", "I suggest"],
        "confidence": 0.8,
        "jargon_density": 0.5
    },
    SeniorityLevel.SENIOR: {
        "starters": ["We need to", "Let's", "I'm implementing", "We're driving"],
        "confidence": 0.9,
        "jargon_density": 0.7
    },
    SeniorityLevel.EXECUTIVE: {
        "starters": ["We will", "I'm directing", "Our strategy", "The vision is"],
        "confidence": 1.0,
        "jargon_density": 0.9
    }
}

print("✅ Corporate vocabulary and patterns defined")

## 4. Create Dataset Generator

In [None]:
class CorporateDatasetGenerator:
    """Build synthetic instruction-tuning examples for corporate-speak translation.

    Every builder returns a dict with ``instruction``, ``input`` and ``output``
    keys; some also attach a ``context`` dict. ``generate_examples`` adds a
    ``text`` key holding the prompt-formatted training string.
    """

    # Keys of ``casual_to_corporate`` that are complete utterances or nouns
    # rather than bare verbs. Dropping them into verb-slot templates produced
    # text such as "Let's let's meet about this" or "I think we should good
    # job", so they are emitted as direct phrase pairs instead.
    _NON_VERB_KEYS = frozenset({
        "let's meet",
        "good job",
        "i need help",
        "let's talk",
        "i'm busy",
        "works for me",
        "problem",
    })

    def __init__(self):
        """Create the phrase tables used by the example builders."""
        self.casual_to_corporate = {
            # Basic transformations
            "let's meet": ["let's sync up", "let's align", "let's circle back", "let's touch base"],
            "good job": ["excellent execution", "great deliverables", "strong performance", "impactful work"],
            "i need help": ["i require support", "i need assistance", "seeking guidance", "require collaboration"],
            "let's talk": ["let's discuss", "let's have a dialogue", "let's connect", "let's interface"],
            "i'm busy": ["my bandwidth is limited", "i'm at capacity", "my plate is full", "i'm resource-constrained"],
            "works for me": ["that aligns with my schedule", "i can accommodate that", "that's feasible", "i'm aligned"],
            "problem": ["challenge", "opportunity for improvement", "pain point", "area of concern"],
            "fix": ["resolve", "address", "remediate", "optimize"],
            "use": ["leverage", "utilize", "employ", "harness"],
            "think about": ["consider", "evaluate", "assess", "analyze"],
            "work together": ["collaborate", "synergize", "partner", "align efforts"],
            "improve": ["optimize", "enhance", "elevate", "drive improvement"],
            "start": ["initiate", "commence", "kick off", "launch"],
            "end": ["conclude", "finalize", "wrap up", "bring to closure"],
            "make": ["create", "develop", "produce", "generate"],
            "check": ["validate", "verify", "assess", "review"],
            "send": ["distribute", "disseminate", "cascade", "share"],
            "get": ["obtain", "acquire", "secure", "procure"],
            "show": ["demonstrate", "illustrate", "present", "showcase"],
            "tell": ["communicate", "inform", "advise", "brief"]
        }

        # Reverse mappings for bidirectional translation.
        # A corporate term listed under several casual keys (e.g. "optimize",
        # "assess") keeps the casual key that appears last in the table.
        self.corporate_to_casual = {
            corporate: casual
            for casual, corporate_list in self.casual_to_corporate.items()
            for corporate in corporate_list
        }

        # Additional corporate phrases (not referenced by any builder in this
        # class; kept as a public attribute).
        self.corporate_phrases = [
            "moving forward", "going forward", "at the end of the day", "net net",
            "circle back", "touch base", "reach out", "loop in", "take this offline",
            "bandwidth", "capacity", "deliverables", "action items", "key takeaways",
            "synergies", "best practices", "lessons learned", "pain points", "win-win",
            "low-hanging fruit", "move the needle", "paradigm shift", "game changer",
            "think outside the box", "30,000 foot view", "drill down", "deep dive",
            "leverage our learnings", "operationalize", "productize", "incentivize"
        ]

    @staticmethod
    def _to_gerund(verb: str) -> str:
        """Return the "-ing" form of *verb*, inflecting only its first word.

        Handles the silent-"e" drop ("scale" -> "scaling"), "-ie" verbs
        ("tie" -> "tying") and multi-word verbs ("optimize returns" ->
        "optimizing returns"). Consonant doubling is not attempted.
        """
        head, sep, tail = verb.partition(" ")
        if head.endswith("ie"):
            head = head[:-2] + "y"
        elif len(head) > 2 and head.endswith("e") and not head.endswith(("ee", "oe", "ye")):
            head = head[:-1]
        return f"{head}ing{sep}{tail}"

    def generate_examples(self, num_examples: int = 1000) -> List[Dict]:
        """Generate *num_examples* examples, each from a randomly chosen builder.

        Args:
            num_examples: Number of examples to produce; values <= 0 yield [].

        Returns:
            A list of example dicts, each extended with a ``text`` key in the
            ``### Instruction / ### Input / ### Response`` prompt format.
        """
        # Dispatch table: one uniformly random builder per example.
        builders = [
            self._generate_casual_to_corporate,
            self._generate_corporate_to_casual,
            self._generate_domain_specific,
            self._generate_seniority_based,
            self._generate_conversation,
            self._generate_email,
            self._generate_meeting,
        ]

        examples = []
        for _ in range(num_examples):
            example = random.choice(builders)()
            # Add the formatted text field
            example['text'] = f"### Instruction: {example['instruction']}\n### Input: {example['input']}\n### Response: {example['output']}"
            examples.append(example)

        return examples

    def _generate_casual_to_corporate(self) -> Dict:
        """Return one casual -> corporate example."""
        casual_phrase = random.choice(list(self.casual_to_corporate.keys()))
        corporate_phrase = random.choice(self.casual_to_corporate[casual_phrase])

        if casual_phrase in self._NON_VERB_KEYS:
            # Whole utterances and nouns do not fit the verb-slot templates.
            input_text, output_text = casual_phrase, corporate_phrase
        else:
            templates = [
                (f"I think we should {casual_phrase}", f"I believe we should {corporate_phrase}"),
                (f"Can we {casual_phrase}?", f"Could we {corporate_phrase}?"),
                (f"Let's {casual_phrase} about this", f"Let's {corporate_phrase} regarding this matter"),
                (f"We need to {casual_phrase}", f"We need to {corporate_phrase}")
            ]
            input_text, output_text = random.choice(templates)

        return {
            "instruction": "Transform to corporate speak",
            "input": input_text,
            "output": output_text
        }

    def _generate_corporate_to_casual(self) -> Dict:
        """Return one corporate -> casual example."""
        corporate_phrase = random.choice(list(self.corporate_to_casual.keys()))
        casual_phrase = self.corporate_to_casual[corporate_phrase]

        if casual_phrase in self._NON_VERB_KEYS:
            # Same reasoning as the casual -> corporate direction: these pairs
            # are not verbs, so they are emitted without a sentence frame.
            input_text, output_text = corporate_phrase, casual_phrase
        else:
            templates = [
                (f"We need to {corporate_phrase} on this initiative", f"We need to {casual_phrase} on this"),
                (f"Let's {corporate_phrase} to ensure alignment", f"Let's {casual_phrase}"),
                (f"I'll {corporate_phrase} with the team", f"I'll {casual_phrase} with the team")
            ]
            input_text, output_text = random.choice(templates)

        return {
            "instruction": "Translate corporate speak to plain English",
            "input": input_text,
            "output": output_text
        }

    def _generate_domain_specific(self) -> Dict:
        """Return one example whose response uses a random domain's vocabulary."""
        domain = random.choice(list(Domain))
        vocab = DOMAIN_VOCAB[domain]

        noun = random.choice(vocab["nouns"])
        verb = random.choice(vocab["verbs"])
        phrase = random.choice(vocab["phrases"])
        # Naively appending "ing" gave "scaleing" / "optimize returnsing".
        gerund = self._to_gerund(verb)

        templates = [
            ("how's the project?", f"We're {gerund} our {noun} to {phrase}"),
            ("what's the plan?", f"The plan is to {verb} the {noun} and {phrase}"),
            ("any updates?", f"Yes, we've been able to {phrase} by {gerund} our {noun}")
        ]

        input_text, output_text = random.choice(templates)

        return {
            "instruction": f"Transform to {domain.value.replace('_', ' ')} corporate speak",
            "input": input_text,
            "output": output_text,
            "context": {"domain": domain.value}
        }

    def _generate_seniority_based(self) -> Dict:
        """Return one example phrased for a random seniority level.

        The response is a seniority-specific starter followed by a fixed
        continuation; it does not depend on which casual input was drawn.
        """
        seniority = random.choice(list(SeniorityLevel))
        patterns = SENIORITY_PATTERNS[seniority]
        starter = random.choice(patterns["starters"])

        casual_inputs = [
            "we should try something new",
            "this isn't working",
            "i have an idea",
            "let's change this"
        ]

        input_text = random.choice(casual_inputs)

        # NOTE(review): some starter/continuation pairs read ungrammatically
        # (e.g. "We need to" + "implementing ..."); fixing that needs
        # per-starter continuations and is left for a follow-up.
        continuations = {
            SeniorityLevel.JUNIOR: "we could explore alternative approaches",
            SeniorityLevel.MID: "pivoting our strategy to drive better outcomes",
            SeniorityLevel.SENIOR: "implementing a strategic pivot to optimize results",
            SeniorityLevel.EXECUTIVE: "to transform our approach and maximize value creation",
        }
        output_text = f"{starter} {continuations[seniority]}"

        return {
            "instruction": f"Transform to corporate speak (seniority: {seniority.name})",
            "input": input_text,
            "output": output_text,
            "context": {"seniority": seniority.value}
        }

    def _generate_conversation(self) -> Dict:
        """Return one fixed casual-remark -> professional-reply pair."""
        scenarios = [
            {
                "input": "thanks for your help",
                "output": "I appreciate your collaboration on this initiative. Your contributions have been invaluable."
            },
            {
                "input": "sorry i'm late",
                "output": "Apologies for the delay. I was addressing another priority that required immediate attention."
            },
            {
                "input": "can you explain this?",
                "output": "I'd be happy to provide additional context and clarification on this matter."
            },
            {
                "input": "i don't understand",
                "output": "Let me break this down further to ensure we're aligned on the key concepts."
            }
        ]

        scenario = random.choice(scenarios)

        return {
            "instruction": "Generate a professional response",
            "input": scenario["input"],
            "output": scenario["output"],
            "context": {"type": "conversation"}
        }

    def _generate_email(self) -> Dict:
        """Return one fixed casual-note -> professional-email pair."""
        email_types = [
            {
                "input": "need this done today",
                "output": "This deliverable has been identified as a critical priority with an EOD deadline. Please advise on your capacity to accommodate this urgent request."
            },
            {
                "input": "following up on my last email",
                "output": "I wanted to circle back on my previous communication to ensure alignment and address any outstanding questions or concerns."
            },
            {
                "input": "fyi",
                "output": "Please find below information for your awareness and consideration."
            }
        ]

        email_type = random.choice(email_types)

        return {
            "instruction": "Transform to professional email language",
            "input": email_type["input"],
            "output": email_type["output"],
            "context": {"type": "email"}
        }

    def _generate_meeting(self) -> Dict:
        """Return one fixed casual-remark -> meeting-language pair."""
        meeting_phrases = [
            {
                "input": "let's start",
                "output": "Let's kick off today's session. Thank you all for joining."
            },
            {
                "input": "any questions?",
                "output": "Are there any questions, concerns, or additional perspectives to consider?"
            },
            {
                "input": "that's all",
                "output": "That concludes our agenda items. Thank you for your valuable contributions and engagement."
            }
        ]

        phrase = random.choice(meeting_phrases)

        return {
            "instruction": "Transform to professional meeting language",
            "input": phrase["input"],
            "output": phrase["output"],
            "context": {"type": "meeting"}
        }

# Create generator instance
# Module-level singleton reused by the dataset-building cell below.
generator = CorporateDatasetGenerator()
print("✅ Dataset generator created")

## 5. Generate Training Dataset

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Build the synthetic corpus with the generator created earlier.
print("Generating dataset...")
num_examples = 8000  # Adjust based on your needs
all_examples = generator.generate_examples(num_examples)

# A DataFrame gives simple positional slicing for the split below.
df = pd.DataFrame(all_examples)

# 80% train, 10% validation, and whatever remains goes to test.
total_rows = len(df)
train_size = int(0.8 * total_rows)
val_size = int(0.1 * total_rows)
val_end = train_size + val_size

train_df = df[:train_size]
val_df = df[train_size:val_end]
test_df = df[val_end:]

# Wrap each frame as a HuggingFace Dataset under its split name.
split_frames = (
    ("train", train_df),
    ("validation", val_df),
    ("test", test_df),
)
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in split_frames
})

print(f"\n✅ Dataset generated!")
print(f"Train: {len(dataset['train'])} examples")
print(f"Validation: {len(dataset['validation'])} examples")
print(f"Test: {len(dataset['test'])} examples")

# Print the first three training rows as a quick sanity check.
print("\n📝 Sample examples:")
for position in range(3):
    sample = dataset['train'][position]
    print(f"\nExample {position+1}:")
    print(f"Instruction: {sample['instruction']}")
    print(f"Input: {sample['input']}")
    print(f"Output: {sample['output']}")

## 6. Login to Hugging Face

In [None]:
from huggingface_hub import login

# Login to Hugging Face - you'll need to enter your token
# login() is called without arguments, so the token is requested
# interactively. Later cells push to the Hub, which presumably requires a
# token with write access — verify for your account.
print("Please enter your Hugging Face token:")
login()

## 7. Initialize Model and Tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Model name
# Hub identifier reused by the model-loading cell.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably needed because the tokenizer ships without a pad
# token — confirm against the tokenizer config.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"

print(f"✅ Tokenizer loaded: {model_name}")

## 8. Load Model with 4-bit Quantization

In [None]:
# Configure 4-bit quantization: NF4 weights with double quantization, and
# float16 as the dtype used for the actual matrix multiplications.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load model
# trust_remote_code is deliberately NOT enabled: the Mistral architecture is
# implemented inside transformers itself, so there is no reason to permit
# execution of Python code downloaded from the Hub repository.
print("Loading model with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# The KV cache is not useful while training and conflicts with gradient
# checkpointing, so it is switched off on the config.
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a Llama-style config option; confirm
# it has any effect for this architecture.
model.config.pretraining_tp = 1

print("✅ Model loaded successfully!")

## 9. Configure LoRA

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Make the quantized base model trainable before attaching adapters.
print("Preparing model for LoRA training...")
model = prepare_model_for_kbit_training(model)

# Adapters are attached to the four projection modules named below.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]

# LoRA hyper-parameters: rank 16, alpha 32, 10% adapter dropout, no bias terms.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=attention_projections,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the model with the adapters and report how many weights will train.
model = get_peft_model(model, peft_config)
print("\n✅ LoRA configuration applied!")
model.print_trainable_parameters()

## 10. Tokenize Dataset

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of formatted training strings.

    Args:
        examples: A batch from ``Dataset.map(batched=True)``; only its
            ``"text"`` column is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per sequence and left unpadded.
    """
    # No padding here on purpose: the generated texts are short, so padding
    # every row to 512 tokens wasted most of each forward/backward pass and
    # made all rows the same length (defeating length-grouped batching).
    # The data collator used by the Trainer pads each batch to its own
    # longest sequence instead.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
    )

# Run the tokenizer over every split, dropping the raw columns so that only
# the tokenizer's outputs remain.
print("Tokenizing dataset...")
raw_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=raw_columns,
    batched=True,
)

print("✅ Dataset tokenized successfully!")
train_count = len(tokenized_dataset['train'])
validation_count = len(tokenized_dataset['validation'])
print(f"Training examples: {train_count}")
print(f"Validation examples: {validation_count}")

## 11. Setup Training Arguments

In [None]:
from transformers import TrainingArguments

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints go and where the result is published.
    output_dir="./corporate-synergy-bot-7b",
    push_to_hub=True,
    hub_model_id="phxdev/corporate-synergy-bot-7b",
    # Schedule length (max_steps=-1 leaves the epoch count in charge).
    num_train_epochs=3,
    max_steps=-1,
    # Batching: 4 sequences x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # Memory and precision settings.
    gradient_checkpointing=True,
    fp16=True,
    bf16=False,
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # Logging, evaluation and checkpoint cadence.
    report_to="tensorboard",
    logging_steps=25,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=3,
    # Keep the checkpoint with the lowest validation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

print("✅ Training arguments configured!")

## 12. Create Trainer

In [None]:
from transformers import Trainer, DataCollatorForLanguageModeling

# Collator for causal language modelling (masked-LM objective disabled).
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

# Wire the model, configuration, data splits and collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

print("✅ Trainer created and ready!")

## 13. Start Training

In [None]:
# Train the model
# The duration printed below is the author's estimate, not a measured value.
print("🚀 Starting training...")
print("This will take approximately 2-3 hours on a T4 GPU")
print("-" * 50)

# Runs the full training loop configured by training_args.
trainer.train()

print("\n✅ Training complete!")

## 14. Save and Push Model

In [None]:
# Save the model locally
# save_model() is called without a path; presumably it writes to the
# output_dir given in training_args — verify against the Trainer docs.
print("Saving model...")
trainer.save_model()
print("✅ Model saved locally!")

# Push to Hugging Face Hub
# The model goes up through the trainer; the tokenizer is pushed separately
# to the same repository id.
print("\nPushing to Hugging Face Hub...")
trainer.push_to_hub()
tokenizer.push_to_hub("phxdev/corporate-synergy-bot-7b")

print("\n🎉 Model successfully pushed to: https://huggingface.co/phxdev/corporate-synergy-bot-7b")

## 15. Test the Trained Model

In [None]:
def generate_response(prompt, max_length=150):
    """Sample a completion for *prompt* from the fine-tuned model.

    Args:
        prompt: Full prompt text, normally ending in ``"### Response:"``.
        max_length: Maximum number of NEW tokens to generate (passed as
            ``max_new_tokens``, so the prompt length is not counted).

    Returns:
        The decoded text after the last ``"### Response:"`` marker, stripped
        of surrounding whitespace; if the marker is absent, the full decoded
        output (prompt included).
    """
    # Switch off dropout (including LoRA dropout) for inference; the model
    # may still be in training mode after trainer.train().
    model.eval()

    # Follow the model's own device instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract just the response part
    marker = "### Response:"
    if marker in response:
        response = response.split(marker)[-1].strip()
    return response

# Test examples
# Each prompt uses an instruction string that the dataset generator actually
# emits. The last one previously read "Transform to tech corporate speak
# (seniority: senior)", a format that never occurs in the training data.
test_cases = [
    "### Instruction: Transform to corporate speak\n### Input: let's meet tomorrow\n### Response:",
    "### Instruction: Transform to corporate speak\n### Input: I need help\n### Response:",
    "### Instruction: Translate corporate speak to plain English\n### Input: We need to leverage our synergies\n### Response:",
    "### Instruction: Transform to corporate speak (seniority: SENIOR)\n### Input: good job on the project\n### Response:"
]

print("🧪 Testing the model...\n")
for test in test_cases:
    # Recover the raw input text from between the Input and Response markers.
    print(f"Input: {test.split('### Input: ')[1].split('### Response:')[0].strip()}")
    print(f"Output: {generate_response(test)}")
    print("-" * 50)

## 🎉 Congratulations!

Your Corporate Synergy Bot 7B has been trained and uploaded to Hugging Face!

**Next Steps:**
1. Check your model at: https://huggingface.co/phxdev/corporate-synergy-bot-7b
2. Create a demo Space using the `app.py` file
3. Share your bot with the community!

Remember: To maximize stakeholder value, we must leverage our synergies through collaborative paradigm shifts! 😄