# Job Title Classification Training
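This notebook trains a job title classifier: it loads the processed job descriptions, fits a TF-IDF + random forest pipeline, evaluates it on a held-out test set, and saves the fitted model.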

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import joblib

## 1. Load Processed Data

In [None]:
# Read the processed job-description CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/processed_job_descriptions.csv')

# Sanity check: report the dimensions, then preview the leading rows
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
df.head(n=5)

## 2. Prepare Data for Training

In [None]:
# Split data into training and testing sets
#
# An empty `cleaned_text` cell is read back from CSV as NaN, which
# TfidfVectorizer rejects as a document, so missing text is replaced with an
# empty string. Rows without a job title cannot serve as training targets (or
# be stratified on) and are dropped. When nothing is missing this is a no-op.
labelled = df.dropna(subset=['Job Title'])
X = labelled['cleaned_text'].fillna('').astype(str)
y = labelled['Job Title']

# Hold out 20% of the rows. stratify=y keeps the job-title proportions the
# same in both splits; random_state pins the shuffle so the split is
# reproducible.
# NOTE: stratified splitting needs at least two samples of every job title.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

## 3. Create and Train Model

In [None]:
# Assemble the classification pipeline: TF-IDF features feeding a random forest
pipeline = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            ngram_range=(1, 2),   # unigrams and bigrams
            min_df=2,             # skip terms found in fewer than 2 documents
            max_df=0.95,          # skip terms found in more than 95% of documents
            max_features=5000,    # cap the vocabulary size
        ),
    ),
    (
        'clf',
        RandomForestClassifier(
            random_state=42,      # fixed seed for reproducible forests
            n_estimators=200,
            max_depth=50,
        ),
    ),
])

In [None]:
# Fit the vectorizer and the classifier on the training split
pipeline.fit(X=X_train, y=y_train)
print("Model training completed!")

## 4. Evaluate Model

In [None]:
# Predict job titles for the held-out descriptions
y_pred = pipeline.predict(X_test)

# Build the per-class precision / recall / F1 summary, then print it
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:")
print(report)

In [None]:
# Plot confusion matrix
# Use one explicit, sorted label list for both the matrix and the axes so that
# every row/column is annotated with its job title instead of a bare index.
# The list covers every title that occurs in the true or predicted labels.
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,  # columns of cm: predicted titles
    yticklabels=class_labels,  # rows of cm: actual titles
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()  # keep long job-title tick labels inside the figure
plt.show()

## 5. Test with Sample Descriptions

In [None]:
# Sample test cases
# NOTE(review): the model was fit on df['cleaned_text'], while these raw
# sentences are passed in as-is — confirm whether the same cleaning step
# should be applied before predicting.
test_cases = [
    "Looking for a software engineer with experience in Python and machine learning",
    "Seeking a digital marketing specialist with social media experience",
    "Need a financial advisor with investment banking background"
]

# Score every sample in one batch instead of re-running the whole pipeline
# twice per sample inside the loop.
predictions = pipeline.predict(test_cases)
all_probabilities = pipeline.predict_proba(test_cases)
class_names = pipeline.classes_  # column order of predict_proba

for test_case, prediction, probabilities in zip(test_cases, predictions, all_probabilities):
    print(f"\nTest case: {test_case}")
    print(f"Predicted job title: {prediction}")

    # Get top 3 predictions: highest probability first. A stable sort on the
    # negated values keeps tied classes in class order, so the listing is
    # deterministic and ties are broken the same way argmax breaks them.
    top_3_indices = np.argsort(-probabilities, kind='stable')[:3]
    print("Top 3 predictions with probabilities:")
    for idx in top_3_indices:
        print(f"- {class_names[idx]}: {probabilities[idx]:.4f}")

## 6. Save Model

In [None]:
# Save the model
import os

model_path = '../models/job_classifier.joblib'

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Persist the whole fitted pipeline (vectorizer + classifier) as one artifact
joblib.dump(pipeline, model_path)
print("Model saved successfully!")