# Sentiment Analysis Model Training

Run this notebook to generate synthetic data, train the logistic regression model, and download the `.pkl` artifacts for your local application.

In [None]:
# Install dependencies
!pip install pandas scikit-learn joblib



In [2]:
import pandas as pd
import random
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Output file names (bare names, so they resolve against the current working
# directory).
# Fitted LogisticRegression model, written by train_model().
MODEL_FILE = "sentiment_v1.pkl"
# Fitted TfidfVectorizer, written by train_model().
VECTORIZER_FILE = "vectorizer_v1.pkl"
# Synthetic labelled dataset, written by generate_training_data().
DATA_FILE = "sentiment_dataset.csv"

## 1. Generate Synthetic Data

In [4]:
def generate_training_data(n_per_class=1000, seed=None, output_path=None):
    """Build a balanced synthetic sentiment dataset and save it as CSV.

    Every sample is one of a fixed set of template sentences, with a randomly
    chosen noun substituted where the template has a ``{noun}`` placeholder.
    Positive samples are labelled 1 and negative samples 0. The rows are
    shuffled before being written to disk.

    Note that each class has only 10 templates and 10 nouns, so the dataset
    holds at most 100 distinct sentences per class, repeated many times.

    Args:
        n_per_class: Number of samples to generate for each class
            (default 1000, i.e. 2000 rows in total).
        seed: Optional integer seed. When given, the template/noun choices and
            the shuffle are reproducible. When None (default), the global
            ``random`` state and pandas' default random state are used.
        output_path: Destination of the CSV file. Defaults to ``DATA_FILE``.

    Returns:
        pandas.DataFrame with columns ``text`` and ``sentiment``.

    Raises:
        ValueError: If ``n_per_class`` is negative.
    """
    if n_per_class < 0:
        raise ValueError("n_per_class must be non-negative")

    print("Generating synthetic training data...")

    positive_templates = [
        "I love this {noun}!", "The {noun} is amazing.", "Great {noun}.", "Best {noun} ever.",
        "Highly recommend this {noun}.", "So happy with the {noun}.", "Excellent quality.",
        "Works perfectly.", "Five stars!", "Incredible experience."
    ]

    negative_templates = [
        "I hate this {noun}.", "The {noun} is terrible.", "Worst {noun} ever.", "Do not buy this {noun}.",
        "Very disappointed with the {noun}.", "Waste of money.", "Broken on arrival.",
        "Poor quality.", "Does not work.", "One star."
    ]

    nouns = ["product", "service", "app", "experience", "item", "purchase", "support", "quality", "interface", "performance"]

    # A private generator keeps a seeded run from disturbing the global
    # `random` state; without a seed, fall back to the module-level functions.
    rng = random.Random(seed) if seed is not None else random

    data = []
    # Positive samples (label 1) first, then negative samples (label 0).
    for label, templates in ((1, positive_templates), (0, negative_templates)):
        for _ in range(n_per_class):
            template = rng.choice(templates)
            noun = rng.choice(nouns)
            # Templates without a {noun} placeholder simply ignore the argument.
            data.append({"text": template.format(noun=noun), "sentiment": label})

    # Explicit columns keep the schema stable even when no rows were generated.
    df = pd.DataFrame(data, columns=["text", "sentiment"])
    # Shuffle so the two classes are interleaved.
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    path = DATA_FILE if output_path is None else output_path
    df.to_csv(path, index=False)
    print(f"Saved {path} with {len(df)} samples.")
    return df

## 2. Train Model

In [5]:
def train_model(df, test_size=0.2, random_state=42):
    """Fit a TF-IDF + logistic regression classifier and save both artifacts.

    The vectorizer and model written to ``VECTORIZER_FILE`` and ``MODEL_FILE``
    are fitted on every row of ``df``. Because accuracy measured on the rows a
    model was fitted on says little about generalisation, a separate
    vectorizer/model pair is first fitted on a stratified training split and
    scored on the held-out remainder; that pair is used only for reporting.

    Args:
        df: DataFrame with a ``text`` column and a ``sentiment`` label column.
        test_size: Fraction (or absolute number) of rows held out for the
            evaluation. A falsy value (``None`` or ``0``) skips it.
        random_state: Seed for the train/test split.

    Returns:
        None. Accuracy figures are printed and the artifacts are written with
        ``joblib.dump``.
    """
    # Local import: only this function needs the splitter.
    from sklearn.model_selection import train_test_split

    X = df['text']
    y = df['sentiment']

    print(f"Training on {len(df)} samples...")

    # Held-out evaluation on a throwaway vectorizer/model pair.
    if test_size:
        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, stratify=y, random_state=random_state
            )
        except ValueError as exc:
            # Raised e.g. when the data is too small to split per class;
            # the final model below is still trained and saved.
            print(f"Skipping held-out evaluation: {exc}")
        else:
            eval_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
            eval_model = LogisticRegression(solver='liblinear')
            # The vectorizer is fitted on the training split only, so the
            # held-out rows do not influence the vocabulary.
            eval_model.fit(eval_vectorizer.fit_transform(X_train), y_train)
            holdout_score = eval_model.score(eval_vectorizer.transform(X_test), y_test)
            print(f"Model Accuracy (Held-out): {holdout_score:.4f}")

    # Vectorizer
    # NOTE(review): the built-in 'english' stop list appears to drop negations
    # such as "not" - confirm this is acceptable for sentiment text.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X_vec = vectorizer.fit_transform(X)

    # Model (fitted on all rows; this is the one that gets saved)
    model = LogisticRegression(solver='liblinear')
    model.fit(X_vec, y)

    # Accuracy on the data the model was fitted on; an upper bound, not an
    # estimate of real-world performance.
    score = model.score(X_vec, y)
    print(f"Model Accuracy (Training): {score:.4f}")

    # Save
    print("Saving model artifacts...")
    joblib.dump(model, MODEL_FILE)
    joblib.dump(vectorizer, VECTORIZER_FILE)
    print(f"Saved {MODEL_FILE} and {VECTORIZER_FILE}")

# Execute Pipeline: build and save the synthetic dataset, then fit the
# vectorizer/model on it and save both artifacts.
df = generate_training_data()
train_model(df)

Generating synthetic training data...
Saved sentiment_dataset.csv with 2000 samples.
Training on 2000 samples...
Model Accuracy (Training): 1.0000
Saving model artifacts...
Saved sentiment_v1.pkl and vectorizer_v1.pkl


## 3. Download Artifacts
Run this cell to download the files to your local machine (if using Google Colab).

In [6]:
# `google.colab` only exists inside Colab, so the import itself has to be
# guarded; otherwise the ImportError is raised before the handler is in place.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab, skipping download.")
else:
    # Trigger a browser download for each saved artifact.
    files.download(MODEL_FILE)
    files.download(VECTORIZER_FILE)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>