In [2]:
import nltk
from nltk.corpus import stopwords

# Download the stopwords corpus only when it is missing locally, so that
# re-running this cell does not need network access once the data is installed.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

# English stopwords, held in a set for fast membership checks
stop_words = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\agnih\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.svm import SVC
import joblib


In [1]:
import re  # For regular expressions
import pandas as pd  # For handling dataframes
import numpy as np  # For numerical operations
import pickle  # For saving/loading models

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Location of the fraud-call dataset, relative to the current working directory
fraud_call_path = 'data/fraud_call.file'

# Read every raw line up front; splitting into (label, text) happens afterwards.
try:
    with open(fraud_call_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to quit, which discards the session. Raising stops this cell with a clear
    # traceback and leaves the kernel usable.
    raise FileNotFoundError(f'Error: {fraud_call_path} file not found.') from err

# Convert the raw lines into parallel lists of message text and binary label
texts = []
labels = []

for line in lines:
    record = line.strip()
    if not record:  # Ignore empty lines
        continue
    # Split on the FIRST separator only (tab, comma, or a run of 2+ whitespace).
    # Without maxsplit=1 a message that itself contains a comma or a double
    # space is cut into several parts and everything after parts[1] is lost.
    parts = re.split(r'\t|,|\s{2,}', record, maxsplit=1)
    if len(parts) >= 2:  # Skip lines with no separator between label and text
        # Label 1 for 'fraud' (case-insensitive); any other label maps to 0
        labels.append(1 if parts[0].strip().lower() == 'fraud' else 0)
        texts.append(parts[1].strip())

spam_df = pd.DataFrame({'text': texts, 'label': labels})

# Check dataset
print(spam_df.head())
print('Total rows:', len(spam_df))

# Preprocess Text Data
stop_words = set(stopwords.words('english'))  # Stopwords consulted by clean_text

def clean_text(text, stopword_set=None):
    """Normalise a message for vectorisation.

    The text is lower-cased, every run of non-word characters is replaced by a
    single space, and tokens found in the stopword collection are dropped.

    Args:
        text: The raw message string.
        stopword_set: Optional collection of lower-case words to remove. When
            omitted, the notebook-level ``stop_words`` is used, so existing
            single-argument calls behave as before.

    Returns:
        The remaining tokens joined by single spaces (possibly an empty string).
    """
    # Resolve the stopword source once per call rather than once per token
    excluded = stop_words if stopword_set is None else stopword_set
    normalized = re.sub(r'\W+', ' ', text.lower())  # Remove special characters
    return ' '.join(word for word in normalized.split() if word not in excluded)

spam_df['clean_text'] = spam_df['text'].apply(clean_text)

# Split the cleaned text BEFORE vectorising. Fitting TF-IDF on the full corpus
# lets the held-out documents shape the vocabulary and IDF weights, which
# leaks test information into training and inflates the reported scores.
y_spam = spam_df['label']
train_text, test_text, y_train_spam, y_test_spam = train_test_split(
    spam_df['clean_text'], y_spam, test_size=0.2, random_state=42
)

# Initialize TfidfVectorizer and learn vocabulary/IDF from the training rows only
vectorizer = TfidfVectorizer(max_features=500)
X_train_spam = vectorizer.fit_transform(train_text).toarray()
X_test_spam = vectorizer.transform(test_text).toarray()

# Full feature matrix under the training-fitted vectorizer; kept so that any
# later cell that still refers to X_spam continues to work.
X_spam = vectorizer.transform(spam_df['clean_text']).toarray()

# Train an SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_spam, y_train_spam)

# Save the trained model & vectorizer
joblib.dump(svm_model, 'spam_text_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Evaluate the model on the held-out rows
y_pred_spam = svm_model.predict(X_test_spam)
print('Spam Detection Model Accuracy:', accuracy_score(y_test_spam, y_pred_spam))
print(classification_report(y_test_spam, y_pred_spam))
