# E-2 Visa Business Plan Generator Training

This notebook fine-tunes a text-generation model on your own business plan templates (DOCX files). Run the cells in order and follow these steps:

1. Upload your DOCX files
2. Preprocess the data
3. Train the model
4. Test the results
5. Save the model

In [None]:
# Install required dependencies
!pip install -q transformers==4.30.0 torch==2.0.1 peft==0.4.0 datasets==2.12.0
!pip install -q accelerate==0.20.0 bitsandbytes==0.41.0 sentencepiece==0.1.99
!pip install -q python-docx==0.8.11

In [None]:
# Create directories
!mkdir -p templates/docx_files data model/fine_tuned_model

In [None]:
# Mount Google Drive to access your files
# The mount point /content/drive is also the prefix of the destination path
# used by the final cell that copies the model to Drive.
from google.colab import drive
drive.mount('/content/drive')

## Step 1: Upload Your Business Plan Templates
Run the cell below and select your DOCX files; every uploaded `.docx` file is moved into the `templates/docx_files` directory.

In [None]:
import os
import shutil

from google.colab import files

# Opens the Colab upload dialog; the keys of the result are the uploaded filenames.
uploaded = files.upload()

# Move uploaded files to templates directory.
# shutil.move is used instead of `!mv "{filename}" ...` because interpolating a
# filename into a shell command breaks (or runs unintended commands) when the
# name contains quotes, `$` or backticks.
templates_dir = "templates/docx_files"
os.makedirs(templates_dir, exist_ok=True)  # do not depend on the mkdir cell having run
for filename in uploaded.keys():
    if filename.lower().endswith('.docx'):
        shutil.move(filename, os.path.join(templates_dir, filename))
        print(f"Moved {filename} -> {templates_dir}/")

## Step 2: Preprocess Business Plan Templates

In [None]:
# Copy the process_templates.py content here
%%writefile process_templates.py

import os
import json
from docx import Document
import re

def extract_sections_from_docx(doc_path):
    """Extract structured sections from a business plan DOCX file.

    A non-empty paragraph is treated as a section header when it is entirely
    upper-case or starts with a numbered prefix followed by a capital letter
    (e.g. "1. Overview", "2.1 Market"). Any other non-empty paragraph is
    attached to the most recently seen header.

    Args:
        doc_path: Path of the DOCX file; passed straight to ``Document``.

    Returns:
        A ``(sections, full_text)`` tuple:
          * ``sections`` maps each header text to the list of paragraph texts
            found under it, in document order. If the same header text occurs
            more than once, the paragraphs of all occurrences are accumulated
            under that single key instead of the later one wiping the earlier.
          * ``full_text`` is every non-empty paragraph (headers included)
            joined with newlines. Paragraphs that appear before the first
            header are present only here, not in ``sections``.
    """
    doc = Document(doc_path)
    header_pattern = re.compile(r'^\d+\.[\.\d]*\s+[A-Z]')

    sections = {}
    current_section = None
    content = []

    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        if not text:
            continue

        # Check if this is a section header (all caps or numbered)
        if text.isupper() or header_pattern.match(text):
            current_section = text
            # setdefault (not assignment): a repeated header must not discard
            # the paragraphs already collected under the same title.
            sections.setdefault(current_section, [])
        elif current_section:
            sections[current_section].append(text)

        content.append(text)

    return sections, "\n".join(content)

def process_business_plan(doc_path):
    """Turn one business plan document into a single training example.

    Args:
        doc_path: Path of the DOCX file to process.

    Returns:
        A dict with two keys:
          * ``"input"``  - a prompt of the form
            "Generate a business plan for a <investment> <business type> E-2 Visa startup".
          * ``"output"`` - the document's sections, each rendered as the header
            line followed by its paragraphs and a blank line.

    The investment is the first dollar amount found in the document text
    (falls back to "$500K" when there is none). The business type is the part
    of the file name after the last '-', with the extension removed.
    """
    sections, full_text = extract_sections_from_docx(doc_path)

    # First dollar amount in the text. Accepts thousands separators, any number
    # of decimals and a K/M suffix, so "$1.5M" is captured whole rather than
    # being cut down to "$1". The suffix only counts when it ends the token.
    investment_match = re.search(r'\$\d+(?:,\d{3})*(?:\.\d+)?(?:[KkMm]\b)?', full_text)
    investment = investment_match.group(0) if investment_match else "$500K"

    # Business type from the file name. splitext strips only the final
    # extension, so dotted names ("Plan - Dr. Smith Dental.docx") are not
    # truncated at their first '.'.
    stem = os.path.splitext(os.path.basename(doc_path))[0]
    business_type = stem.split('-')[-1].strip()

    # Format the training example with sections
    formatted_text = "".join(
        f"{section}\n" + "\n".join(paragraphs) + "\n\n"
        for section, paragraphs in sections.items()
    )

    return {
        "input": f"Generate a business plan for a {investment} {business_type} E-2 Visa startup",
        "output": formatted_text
    }

def process_docx_files(docx_dir="templates/docx_files", output_file="data/training_data.json"):
    """Process all DOCX files in a directory and save them as JSON training data.

    Args:
        docx_dir: Directory scanned (non-recursively) for ``.docx`` files.
            Files whose name contains "Questionnaire" are skipped, as are
            "~$..." files (temporary lock files Word keeps next to an open
            document, which are not readable documents).
        output_file: Path of the JSON file to write. Its parent directory is
            created when missing.

    Returns:
        The list of training examples (one dict per successfully processed
        file), which is also what gets written to ``output_file``.

    A file that fails to process is reported on stdout and skipped; it does
    not abort the run.
    """
    training_data = []

    # sorted(): os.listdir returns entries in arbitrary order; sorting keeps
    # the generated dataset identical from run to run.
    for file in sorted(os.listdir(docx_dir)):
        if not file.lower().endswith('.docx'):
            continue
        if file.startswith('~$') or 'Questionnaire' in file:
            continue

        doc_path = os.path.join(docx_dir, file)
        try:
            example = process_business_plan(doc_path)
            training_data.append(example)
            print(f"Processed: {file}")
        except Exception as e:
            # Deliberate best-effort: one unreadable template must not stop the batch.
            print(f"Error processing {file}: {str(e)}")

    # Save training data (make sure the target directory exists first)
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(training_data, f, indent=2, ensure_ascii=False)

    print(f"\nProcessed {len(training_data)} business plans")
    return training_data

# Script entry point: when the file is executed directly (the notebook runs it
# with `!python process_templates.py`), convert the templates using the default
# input directory and output file of process_docx_files.
if __name__ == "__main__":
    process_docx_files()

In [None]:
# Run preprocessing
# Executes the script written by the %%writefile cell in a separate Python
# process. With the script's defaults it reads templates/docx_files and writes
# data/training_data.json.
!python process_templates.py

## Step 3: Train the Model

In [None]:
# Copy the fine_tune.py content and run training
# NOTE(review): no cell visible in this notebook creates src/fine_tune.py, so it
# has to be uploaded or written (e.g. with a %%writefile cell) before this
# command can run. TODO confirm where the script comes from and that it saves
# the model to model/fine_tuned_model, which the following cells read.
!python src/fine_tune.py

## Step 4: Test the Model

In [None]:
from transformers import pipeline

def test_model(prompt):
    model_path = "model/fine_tuned_model"
    generator = pipeline("text-generation", model=model_path)
    return generator(prompt, max_length=2000)

# Test with a sample prompt
# The wording mirrors the "input" prompts that process_templates.py generates
# for the training data.
test_prompt = "Generate a business plan for a $500K restaurant E-2 Visa startup"
response = test_model(test_prompt)
# Prints the raw value returned by test_model.
print(response)

## Step 5: Save the Model

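After training is complete, you can download the model or save it to your Google Drive. The cell below copies it to Drive, which must already be mounted (see the setup cells at the top of the notebook).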

In [None]:
# Copy model to Google Drive
!cp -r model/fine_tuned_model "/content/drive/My Drive/fine_tuned_model"