In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Load the spam dataset; the CSV is decoded as Latin-1.
df = pd.read_csv("data/spam.csv", encoding="latin-1")

# Only the last expression of a cell is rendered automatically, so the preview
# has to go through display() or it is silently discarded.
display(df.head())
df.describe()

In [None]:
# Show the column index explicitly: a bare `df.columns` that is not the last
# expression of the cell produces no output.
display(df.columns)

# Number of rows per class in the 'label' column.
df['label'].value_counts()


In [None]:
# Class balance: one bar per distinct value of the 'label' column.
fig, ax = plt.subplots()
sns.countplot(data=df, x='label', ax=ax)
ax.set_title("Spam vs Ham Distribution")
plt.show()


In [None]:
# Length of each message. `.str.len()` is the vectorized string accessor and
# yields NaN for a missing message, whereas `apply(len)` raises TypeError on it.
df['msg_length'] = df['message'].str.len()

# Overlay the length distributions of the two classes on a single axis.
fig, ax = plt.subplots()
for label_value, colour in (('spam', 'red'), ('ham', 'blue')):
    sns.histplot(
        df.loc[df['label'] == label_value, 'msg_length'],
        color=colour,
        label=label_value.capitalize(),
        kde=True,
        ax=ax,
    )
ax.set_xlabel("Message length (characters)")
ax.legend()
ax.set_title("Message Length Distribution")
plt.show()



In [None]:
# Encode the target as integers: ham -> 0, spam -> 1.
label_to_num = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(label_to_num)

# `.map` turns every value missing from the dict into NaN without complaint.
# Fail here with the offending values rather than later inside model fitting.
unmapped_labels = df.loc[df['label_num'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'label' column: {list(unmapped_labels)}"
    )

# Features are the raw message text; the target is the numeric label.
X = df['message']
y = df['label_num']


In [None]:
# Split the raw text BEFORE vectorizing. Fitting TF-IDF on every message lets the
# vocabulary and IDF weights see the test set (data leakage), which inflates the
# reported scores. The row assignment depends only on the number of rows and
# random_state, so the same messages land in each side as when the split is
# applied to the already-vectorized matrix.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn vocabulary and IDF weights from the training messages only, then apply
# the fitted transform unchanged to the held-out messages.
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

print(X_train.shape, X_test.shape)

In [None]:
# Fit a logistic-regression classifier on the training features and predict
# the held-out set.
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

print("Accuracy:", accuracy_score(y_test, lr_pred))
print(classification_report(y_test, lr_pred))

# confusion_matrix puts true classes on rows and predicted classes on columns.
# labels=[0, 1] pins the order to the ham=0 / spam=1 encoding so the tick
# labels below are guaranteed to line up with the cells.
class_names = ['ham', 'spam']
cm = confusion_matrix(y_test, lr_pred, labels=[0, 1])

fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Logistic Regression Confusion Matrix")
plt.show()


# Conclusion and Insights
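- TF-IDF vectorization (English stop words removed, vocabulary capped at 3,000 terms) combined with logistic regression effectively captured the text patterns that separate spam from ham.
- Spam messages are generally longer and more repetitive than ham messages; see the message length distribution plot above.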

In [None]:
# Persist BOTH fitted artifacts. The inference cell loads
# "models/spam_model.pkl" as well as the vectorizer, so saving only the
# vectorizer makes a fresh Restart & Run All fail with FileNotFoundError.
# (`joblib` and `Path` are imported in the imports cell at the top.)
Path("models").mkdir(parents=True, exist_ok=True)  # joblib.dump does not create directories

joblib.dump(lr, "models/spam_model.pkl")
joblib.dump(tfidf, "models/tfidf_vectorizer.pkl")

In [None]:
# Smoke test of the persisted pipeline: classify one hand-written message.
sample = ["Congratulations! You won a free prize"]

# joblib.load unpickles arbitrary objects, so only load files you trust.
loaded_model = joblib.load("models/spam_model.pkl")
loaded_vectorizer = joblib.load("models/tfidf_vectorizer.pkl")

# The text must go through the same vectorizer before the model can score it.
sample_vec = loaded_vectorizer.transform(sample)

# Displays the predicted class array; presumably 1 = spam per the label_num
# mapping. TODO confirm the saved model was trained with that encoding.
loaded_model.predict(sample_vec)
