In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Train, tune, evaluate and persist a spam classifier (TF-IDF features fed
# into a random forest). Runs top to bottom as a script / notebook cell and
# leaves every intermediate object (data, pipeline, grid_search, best_model,
# ...) available as a module-level name.

# Load dataset
file_path = 'data/spam_data.csv'
data = pd.read_csv(file_path)

# Normalize column names so the lookups below ('text', 'is_spam') do not
# depend on header casing or stray whitespace in the CSV.
data.columns = [col.strip().lower() for col in data.columns]

# Inspect dataset
print(data.head())

# Missing message bodies become empty strings so the vectorizer always
# receives str input.
data['text'] = data['text'].fillna('').astype(str)

# Remove exact duplicate (text, label) rows BEFORE splitting. The raw data
# contains repeated messages; if copies of the same message land on both
# sides of the train/test split (or in different CV folds), the model is
# scored on text it has already seen and the reported metrics are inflated.
# Rows with the same text but different labels are intentionally kept.
n_rows_before = len(data)
data = data.drop_duplicates(subset=['text', 'is_spam'])
print(f'Dropped {n_rows_before - len(data)} duplicate rows')

# Split dataset (stratified so both splits keep the same spam/ham ratio)
X = data['text']
y = data['is_spam']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Build pipeline with RandomForest.
# Keeping the vectorizer inside the pipeline means it is re-fitted on the
# training portion of every CV fold, so no vocabulary/IDF statistics leak
# from validation data into training.
pipeline = Pipeline([
    # Unigrams + bigrams; drop terms that appear in more than 90% of the
    # documents or in fewer than 5 documents.
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.9, min_df=5, ngram_range=(1, 2))),
    ('clf', RandomForestClassifier(random_state=42))
])

# Hyperparameter tuning: 2 * 3 * 2 = 12 candidates, 3-fold CV each (36 fits).
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Best model (the winning pipeline as exposed by GridSearchCV)
best_model = grid_search.best_estimator_
print(f'Best Params: {grid_search.best_params_}')

# Evaluate model on the held-out test split, which the search never saw
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(report)

# Save model (the whole pipeline, so raw text can be passed to predict()
# after loading).
model_path = 'spam_classifier_model.pkl'
joblib.dump(best_model, model_path)
print(f'Model saved to {model_path}')


   is_spam                                               text
0        1  Save up to 70% on Life Insurance.\nWhy Spend M...
1        1  1) Fight The Risk of Cancer!\nhttp://www.adcli...
2        1  1) Fight The Risk of Cancer!\nhttp://www.adcli...
3        1  ##############################################...
4        1  I thought you might like these:\n1) Slim Down ...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
