In [7]:
# Cell 1: imports + utilities
import os
import re
import numpy as np
import pandas as pd
from pptx import Presentation
from pptx.util import Inches
import joblib

# ML imports (SBERT is imported in a later cell, right before the embedding model is loaded)
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Ensure the working directories exist (no error if they are already there)
for required_dir in ("data/pitches", "models"):
    os.makedirs(required_dir, exist_ok=True)

print("✅ Basic imports ready, folders ok.")


✅ Basic imports ready, folders ok.


In [8]:
from pptx import Presentation
import os

def create_dummy_ppt(path, project_name, problem, solution, tech, demo, future):
    """Build a six-slide dummy pitch deck and save it to ``path``.

    The deck consists of a title slide followed by five "heading + body"
    slides: Problem, Solution, Tech Stack, Demo / Prototype, Future Scope.

    Args:
        path: Destination passed to ``Presentation.save``.
        project_name: Title text of the first slide.
        problem: Body text of the "Problem" slide.
        solution: Body text of the "Solution" slide.
        tech: Body text of the "Tech Stack" slide.
        demo: Body text of the "Demo / Prototype" slide.
        future: Body text of the "Future Scope" slide.

    Returns:
        None. The deck is written to ``path`` and a confirmation is printed.
    """
    prs = Presentation()

    # Title slide (layout index 0)
    slide = prs.slides.add_slide(prs.slide_layouts[0])
    slide.shapes.title.text = project_name
    slide.placeholders[1].text = "Hackathon Pitch Deck"

    # Every remaining slide uses layout index 1 and differs only in its
    # heading and body, so they are generated from one table instead of five
    # copy-pasted stanzas. List order is slide order.
    sections = [
        ("Problem", problem),
        ("Solution", solution),
        ("Tech Stack", tech),
        ("Demo / Prototype", demo),
        ("Future Scope", future),
    ]
    for heading, body in sections:
        slide = prs.slides.add_slide(prs.slide_layouts[1])
        slide.shapes.title.text = heading
        slide.placeholders[1].text = body

    prs.save(path)
    print(f"✅ Created: {path}")


# 📁 Folder where PPTs will be stored.
# Defined once so the directory that is created and the directory the decks
# are saved into cannot drift apart.
# NOTE(review): this is a machine-specific absolute path; the dataset cell
# reads from the same location, so change both together if it is moved.
PITCH_DIR = r"C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\data\pitches"
os.makedirs(PITCH_DIR, exist_ok=True)

# 🎯 Create 10 sample projects
# Each tuple is (name, problem, solution, tech, demo, future).
projects = [
    ("HackLens", "Hackathon judging is slow", "AI-based automatic judging", "Python, FastAPI, LightGBM", "Basic working prototype", "Add AI for code evaluation"),
    ("SmartApp", "People forget meds", "Reminder app with AI schedule", "Python, Flutter, MongoDB", "Prototype tested", "Integrate with smartwatch"),
    ("HealthTech", "Rural patients lack doctors", "Telemedicine + AI triage", "React, Node, TensorFlow", "Working demo", "Add multilingual support"),
    ("AgriAI", "Low crop yield due to poor soil", "Soil health prediction model", "Python, Scikit-learn, IoT sensors", "Demo tested on farms", "Scale to nationwide data"),
    ("EduAI", "Students need adaptive learning", "Personalized course engine", "FastAPI, PyTorch, NLP", "Prototype with course data", "Add real-time progress tracking"),
    ("FinTechX", "SMEs lack credit access", "AI-based credit scoring", "Python, Pandas, LightGBM", "Prototype with financial data", "Expand dataset"),
    ("GreenTech", "Waste management inefficiency", "IoT-based smart bins", "Python, IoT sensors", "Route demo ready", "Pilot deployment in cities"),
    ("MediAssist", "Doctors lack automated notes", "Voice-to-text assistant", "Python, Whisper, NLP", "Functional prototype", "Add hospital integrations"),
    ("RoboHelper", "Home chores automation", "Mini robot for household help", "ROS, Python", "Demo robot built", "Add speech commands"),
    ("CyberSafe", "Phishing attacks on SMBs", "Email threat detection system", "Python, ML, Flask", "Detection demo ready", "Improve accuracy & add dashboard"),
]

# Generate the PPTs: one "<name>.pptx" per project inside PITCH_DIR
for name, problem, solution, tech, demo, future in projects:
    path = os.path.join(PITCH_DIR, f"{name}.pptx")
    create_dummy_ppt(path, name, problem, solution, tech, demo, future)

print("🎉 All dummy PPTs recreated successfully!")


✅ Created: C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\data\pitches\HackLens.pptx
✅ Created: C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\data\pitches\SmartApp.pptx
✅ Created: C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\data\pitches\HealthTech.pptx
✅ Created: C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\data\pitches\AgriAI.pptx
✅ Created: C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\data\pitches\EduAI.pptx
✅ Created: C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\data\pitches\FinTechX.pptx
✅ Created: C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\data\pitches\GreenTech.pptx
✅ Created: C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\data\pitches\MediAssist.pptx
✅ Created: C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\data\pitches\RoboHelper.pptx
✅ Created: C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\data\pitches\CyberSafe.pptx
🎉 All dummy PPTs recreated successfully!


In [9]:
import os
import re
import numpy as np
import pandas as pd
from pptx import Presentation
from sentence_transformers import SentenceTransformer
import textstat

# --- Load the sentence transformer model (kept in `sbert`, which the feature extractor reads) ---
sbert_model_name = "all-MiniLM-L6-v2"
print("⏳ Loading SBERT model (this may take a few seconds)...")
sbert = SentenceTransformer(sbert_model_name)
print("✅ SBERT model loaded!")

# --- Define feature extractor ---
def extract_features_from_ppt(path):
    """Turn one pitch deck into a flat feature list.

    The non-blank text of every shape exposing a ``text`` attribute is
    stripped and joined with single spaces. From that string the function
    derives, in this order:

    1. slide count, word count, average words per slide,
    2. ``textstat.flesch_reading_ease`` of the text,
    3. five 0/1 flags from case-insensitive keyword searches
       (problem, solution, tech, future, demo),
    4. the values of ``sbert.encode`` for the text (``sbert`` is the
       module-level model).

    Args:
        path: Path of the ``.pptx`` file, passed to ``Presentation``.

    Returns:
        list: the nine scalar features followed by the embedding values.
    """
    deck = Presentation(path)
    slide_count = len(deck.slides)

    # Stripped text of every text-bearing shape; blanks are dropped on join.
    stripped_texts = (
        shape.text.strip()
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
    joined_text = " ".join(chunk for chunk in stripped_texts if chunk)

    # --- Basic text stats ---
    word_count = len(joined_text.split())
    if slide_count:
        avg_words_per_slide = word_count / slide_count
    else:
        avg_words_per_slide = 0
    readability = textstat.flesch_reading_ease(joined_text)

    # --- Keyword flags ---
    # List order is the order of the flags in the returned feature list:
    # has_problem, has_solution, has_tech, has_future, has_demo.
    keyword_patterns = [
        r"problem",
        r"solution",
        r"tech|technology|stack",
        r"future|scope|next",
        r"demo|prototype|working",
    ]
    keyword_flags = [
        int(re.search(pattern, joined_text, re.I) is not None)
        for pattern in keyword_patterns
    ]

    # --- Embeddings ---
    embedding = sbert.encode(joined_text)

    # --- Combine all features into one list ---
    scalar_features = [slide_count, word_count, avg_words_per_slide, readability]
    return scalar_features + keyword_flags + embedding.tolist()

# --- Build dataset ---
folder = r"C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\data\pitches"

# Judge score for each deck, keyed by filename. Only decks listed here can
# become training rows.
labels = {
    "HackLens.pptx": 8.5,
    "SmartApp.pptx": 7.0,
    "HealthTech.pptx": 9.0,
    "AgriAI.pptx": 6.5,
    "EduAI.pptx": 7.5,
    "FinTechX.pptx": 8.0,
    "GreenTech.pptx": 6.0,
    "MediAssist.pptx": 9.5,
    "RoboHelper.pptx": 5.5,
    "CyberSafe.pptx": 8.8
}

# Names of the scalar features, in the order extract_features_from_ppt emits them.
base_feature_cols = ["slide_count", "word_count", "avg_words_per_slide", "readability",
                     "has_problem", "has_solution", "has_tech", "has_future", "has_demo"]

rows = []
# os.listdir() gives entries in arbitrary order; sorting makes the row order
# (and any seeded split computed from df afterwards) reproducible.
for fname in sorted(os.listdir(folder)):
    if not fname.endswith(".pptx"):
        continue
    if fname not in labels:
        # A deck without a judge score has no target value; skip it visibly
        # instead of failing the whole cell with a KeyError.
        print(f"⚠️ Skipping {fname}: no judge score in labels")
        continue
    path = os.path.join(folder, fname)
    feats = extract_features_from_ppt(path)
    rows.append([fname] + feats + [labels[fname]])

# Embedding width = row length minus filename, scalar features and label.
# Falls back to 384 (the width previously hard-coded here) when no deck was read.
emb_dim = len(rows[0]) - len(base_feature_cols) - 2 if rows else 384

cols = ["filename"] + base_feature_cols + \
       [f"emb_{i}" for i in range(emb_dim)] + ["judge_score"]

df = pd.DataFrame(rows, columns=cols)

print("✅ Dataset created successfully!")
print("Shape:", df.shape)
print(df[["filename", "judge_score"]])


⏳ Loading SBERT model (this may take a few seconds)...
✅ SBERT model loaded!
✅ Dataset created successfully!
Shape: (10, 395)
          filename  judge_score
0      AgriAI.pptx          6.5
1   CyberSafe.pptx          8.8
2       EduAI.pptx          7.5
3    FinTechX.pptx          8.0
4   GreenTech.pptx          6.0
5    HackLens.pptx          8.5
6  HealthTech.pptx          9.0
7  MediAssist.pptx          9.5
8  RoboHelper.pptx          5.5
9    SmartApp.pptx          7.0


In [10]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import joblib
import os

# --- Step 1: Prepare data ---
# Features are every column except the identifier and the target.
X = df.drop(columns=["filename", "judge_score"])
y = df["judge_score"]

print(f"✅ Training data ready — Shape: {X.shape}")

# --- Step 2: Handle NaN values (if any) ---
nan_count = X.isna().sum().sum()
if nan_count > 0:
    print(f"⚠️ Found NaN values: {nan_count} — replacing with 0")
    X = X.fillna(0)

# --- Step 3: Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
print(f"📊 Split: {len(X_train)} train / {len(X_test)} test samples")

# --- Step 4: Create LightGBM datasets ---
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test)

# --- Step 5: Define Model Parameters ---
params = {
    "objective": "regression",
    "metric": "rmse",
    "min_data_in_leaf": 1,
    "min_data_in_bin": 1,
    "verbosity": -1
}

# --- Step 6: Train the Model ---
# NOTE(review): valid_data is built from the test split, so early stopping
# picks the iteration using the same samples the RMSE below is reported on.
# Treat that RMSE as optimistic; a separate validation split would be needed
# for an unbiased estimate.
print("🚀 Training LightGBM model...")
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=200,
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),
        lgb.log_evaluation(period=50)
    ]
)
print("✅ Model training complete!")

# --- Step 7: Evaluate the Model ---
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"📈 RMSE on test set: {rmse:.3f}")

# Compare predicted vs actual (X_test keeps df's index, so filenames can be looked up)
results = list(zip(df.loc[X_test.index, "filename"], preds, y_test))
print("\n🔍 Predicted vs Actual:")
for fname, pred, actual in results:
    print(f"{fname:20s} → Predicted: {pred:.2f} | Actual: {actual:.2f}")

# --- Step 8: Save the Trained Model ---
model_path = r"C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\models\pitch_model.pkl"
# Create the directory the model is actually written to. A cwd-relative
# "models" folder is not necessarily the same place as model_path's parent.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)
print(f"\n💾 Model saved successfully at: {model_path}")


✅ Training data ready — Shape: (10, 393)
📊 Split: 7 train / 3 test samples
🚀 Training LightGBM model...
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	training's rmse: 1.06799	valid_1's rmse: 1.54507
✅ Model training complete!
📈 RMSE on test set: 1.545

🔍 Predicted vs Actual:
RoboHelper.pptx      → Predicted: 7.68 | Actual: 5.50
CyberSafe.pptx       → Predicted: 7.63 | Actual: 8.80
HackLens.pptx        → Predicted: 7.48 | Actual: 8.50

💾 Model saved successfully at: C:\Users\Rachit Upadhyay\OneDrive\Desktop\Mini_Project\models\pitch_model.pkl
