In [None]:
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# BERT Fine-tuning PoC (Synthetic Data)\n",
        "\n",
        "This notebook:\n",
        "1) installs dependencies\n",
        "2) generates `data/train.csv` and `data/valid.csv`\n",
        "3) fine-tunes a small BERT\n",
        "4) saves model to `outputs/final_model`\n",
        "5) reloads the saved model and runs a few quick predictions\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Environment report: interpreter version, OS, and (if importable) PyTorch/CUDA status.\n",
        "import os\n",
        "import platform\n",
        "import sys\n",
        "\n",
        "print('Python:', sys.version)\n",
        "print('Platform:', platform.platform())\n",
        "\n",
        "# Best-effort probe: any failure while importing or querying torch is printed, not raised,\n",
        "# so this cell still completes before the dependencies have been installed.\n",
        "try:\n",
        "    import torch\n",
        "\n",
        "    print('Torch:', torch.__version__)\n",
        "    print('CUDA available:', torch.cuda.is_available())\n",
        "except Exception as exc:\n",
        "    print('Torch not installed yet:', exc)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Install dependencies\n",
        "In JupyterLab or a VS Code notebook, `%pip` installs into the environment of the running notebook kernel."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Install everything listed in requirements.txt via the %pip magic.\n",
        "%pip install -r requirements.txt"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Generate synthetic train/valid CSVs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import sys\n",
        "\n",
        "import pandas as pd\n",
        "\n",
        "# Run the generator script with the kernel's own interpreter (sys.executable) rather than\n",
        "# whichever `python` is first on PATH, so it sees the packages `%pip` installed above.\n",
        "# The quotes keep interpreter paths that contain spaces intact.\n",
        "!\"{sys.executable}\" make_data.py\n",
        "\n",
        "# Preview the first rows of each CSV read back from the data/ directory.\n",
        "print('\\nTrain head:')\n",
        "display(pd.read_csv('data/train.csv').head())\n",
        "print('\\nValid head:')\n",
        "display(pd.read_csv('data/valid.csv').head())\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Fine-tune BERT (PoC)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import sys\n",
        "\n",
        "# Launch training with the kernel's own interpreter (sys.executable) instead of the first\n",
        "# `python` on PATH, so the script runs against the packages installed by `%pip`.\n",
        "# The quotes keep interpreter paths that contain spaces intact.\n",
        "!\"{sys.executable}\" train.py"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Load saved model and run quick predictions"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import torch\n",
        "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
        "\n",
        "# Where the tokenizer and classifier are loaded from, and which device to run on.\n",
        "model_dir = 'outputs/final_model'\n",
        "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
        "\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_dir)\n",
        "model = AutoModelForSequenceClassification.from_pretrained(model_dir)\n",
        "model.to(device)\n",
        "model.eval()  # switch to evaluation mode for inference\n",
        "\n",
        "# A handful of hand-written sample sentences to classify.\n",
        "texts = [\n",
        "    'I loved this product. It was excellent and reliable.',\n",
        "    'This service was awful and frustrating.',\n",
        "    'Better than expected — fantastic experience.',\n",
        "    'I regret using this feature. It was confusing and messy.'\n",
        "]\n",
        "\n",
        "# Tokenize as one padded, truncated batch, then move every tensor to the chosen device.\n",
        "enc = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)\n",
        "enc = {name: tensor.to(device) for name, tensor in enc.items()}\n",
        "\n",
        "# Forward pass without gradient tracking; softmax over the last axis gives per-class probabilities.\n",
        "with torch.no_grad():\n",
        "    logits = model(**enc).logits\n",
        "    probs = torch.softmax(logits, dim=-1).cpu().numpy()\n",
        "\n",
        "# NOTE(review): the printed label assumes class index 0 = negative and 1 = positive;\n",
        "# confirm against the label encoding used during training.\n",
        "for text, prob in zip(texts, probs):\n",
        "    print('\\nText:', text)\n",
        "    print('Prob(neg,pos):', (float(prob[0]), float(prob[1])))\n"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.x"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
