In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Train, tune, evaluate and persist a TF-IDF + RandomForest spam classifier.
# The script expects a CSV with a free-text `text` column and an `is_spam`
# label column (column names are matched case/whitespace-insensitively).

# Load dataset
file_path = 'data/spam_data.csv'
data = pd.read_csv(file_path)

# Inspect dataset
# Normalize column names; str() guards against non-string headers.
data.columns = [str(col).strip().lower() for col in data.columns]
print(data.head())

# Fail early with a readable message if an expected column is absent
# (same exception type the bare column lookup below would raise).
required_columns = ('text', 'is_spam')
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(
        f'{file_path} is missing required column(s) {missing_columns}; '
        f'found {list(data.columns)}'
    )

# Rows without a label cannot be used for supervised training, and NaN labels
# break the stratified split and the classifier fit, so drop them up front.
missing_labels = int(data['is_spam'].isna().sum())
if missing_labels:
    print(f'Dropping {missing_labels} row(s) with a missing is_spam label')
    data = data.dropna(subset=['is_spam'])

# TfidfVectorizer rejects NaN documents ("np.nan is an invalid document"),
# so missing messages become empty strings and every value is coerced to str.
data['text'] = data['text'].fillna('').astype(str)

# Split dataset (stratified so both splits keep the spam/ham ratio)
X = data['text']
y = data['is_spam']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Build pipeline with RandomForest
# Keeping the vectorizer inside the pipeline means it is re-fitted on each
# cross-validation training fold rather than on the full training set.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.9, min_df=5, ngram_range=(1, 2))),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Hyperparameter tuning
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5],
}
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    # Surface fit/score failures immediately instead of the default behaviour
    # of recording NaN scores with a warning and continuing the search.
    error_score='raise',
)
grid_search.fit(X_train, y_train)

# Best model (refitted on the whole training split by GridSearchCV)
best_model = grid_search.best_estimator_
print(f'Best Params: {grid_search.best_params_}')

# Evaluate model on the held-out test split
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(report)

# Save model
# NOTE: joblib artifacts are pickle-based; only load this file from a trusted
# location.
model_path = 'spam_classifier_model.pkl'
joblib.dump(best_model, model_path)
print(f'Model saved to {model_path}')


   is_spam                                               text
0        1  Congratulations! You've been selected for a lu...
1        1  URGENT: Your account has been compromised. Cli...
2        1  You've won a free iPhone! Claim your prize by ...
3        1  Act now and receive a 50% discount on all purc...
4        1  Important notice: Your subscription will expir...


Training Progress:   0%|          | 0/12 [01:59<?, ?it/s]
Traceback (most recent call last):
  File "/home/raagav/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/raagav/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 444, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "/home/raagav/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 718, in score
    Xt = transform.transform(Xt)
  File "/home/raagav/.local/lib/python3.10/site-packages/sklearn/feature_extraction/text.py", line 2157, in transform
    X = super().transform(raw_documents)
  File "/home/raagav/.local/lib/python3.10/site-packages/sklearn/feature_extraction/text.py", line 1433, in transform
    _, X = self._count_vocab(raw_documents, fixed_vocab=True)
  File "/home/raagav/.local/lib/python3.10/site-packages/sklearn/feature_extraction/text.py", line 1275, in _

ValueError: np.nan is an invalid document, expected byte or unicode string.