In [2]:
import os
import sys
import numpy as np
import random
import pickle

# Add src/ to path
sys.path.append('C:/Users/acer/OneDrive/Desktop/ML/ML_Hackathon_Hackman/src')
from hmm_model import HMM

ALPHABET = 'abcdefghijklmnopqrstuvwxyz'
INDEX = {c: i for i, c in enumerate(ALPHABET)}
MAX_LIVES = 6

In [7]:
# --------------------------------------------------------------
# SECTION 2 – LOAD DATA & HMM (WITH DEBUG)
# --------------------------------------------------------------

import os

# --- 1. Load corpus.txt (with debug) ---
corpus_path = '../data/corpus.txt'
print(f"Looking for corpus at: {os.path.abspath(corpus_path)}")

if not os.path.exists(corpus_path):
    raise FileNotFoundError(f"corpus.txt not found at {corpus_path}")

raw_lines = []
with open(corpus_path, 'r', encoding='utf-8') as f:
    raw_lines = f.readlines()

print(f"Total lines in file: {len(raw_lines)}")

# Filter valid words
words = []
for line in raw_lines:
    w = line.strip().lower()
    if w and w.isalpha():
        words.append(w)

print(f"Valid words after filtering: {len(words)}")

if len(words) == 0:
    raise ValueError("No valid words found in corpus.txt! "
                     "It must contain English words (letters only), one per line.")

# --- 2. Load or create HMM ---
hmm_path = '../models/hmm.pkl'
if os.path.exists(hmm_path):
    print("Loading HMM from file...")
    with open(hmm_path, 'rb') as f:
        hmm = pickle.load(f)
else:
    print("No HMM found – training from scratch...")
    from src.hmm_model import HMM
    hmm = HMM(words)
    hmm.save(hmm_path)
    print("HMM trained and saved!")

# --- 3. Final setup ---
MAX_LEN = max(len(w) for w in words)
print(f"Max word length: {MAX_LEN}")
print(f"Loaded {len(words):,} words and HMM.")

Looking for corpus at: C:\Users\acer\OneDrive\Desktop\ML\ML_Hackathon_Hackman\data\corpus.txt
Total lines in file: 50000
Valid words after filtering: 49979
Loading HMM from file...
Max word length: 24
Loaded 49,979 words and HMM.


In [5]:
# --------------------------------------------------------------
# TRAIN & SAVE REAL HMM (Run ONCE) - FIXED PATHS
# --------------------------------------------------------------

# Fix for any working directory
import sys
import os

# Add src to path (absolute)
project_root = r'C:/Users/acer/OneDrive/Desktop/ML/ML_Hackathon_Hackman'
sys.path.insert(0, os.path.join(project_root, 'src'))

from hmm_model import HMM

# Ensure models folder exists
models_dir = os.path.join(project_root, 'models')
os.makedirs(models_dir, exist_ok=True)

# Load corpus
corpus_path = os.path.join(project_root, 'data', 'corpus.txt')
if not os.path.exists(corpus_path):
    raise FileNotFoundError(f"Missing {corpus_path}")

print("Loading corpus...")
with open(corpus_path, 'r', encoding='utf-8') as f:
    words = [line.strip().lower() for line in f if line.strip() and line.isalpha()]

print(f"Loaded {len(words):,} words. Training HMM...")

# Train & save
hmm = HMM(words)
model_path = os.path.join(models_dir, 'hmm.pkl')
hmm.save(model_path)

print(f"HMM trained & saved to: {model_path}")
print(f"File size: {os.path.getsize(model_path):,} bytes")

Loading corpus...
Loaded 0 words. Training HMM...
✅ HMM trained & saved to: C:/Users/acer/OneDrive/Desktop/ML/ML_Hackathon_Hackman\models\hmm.pkl
File size: 5,839 bytes
