In [12]:
import kagglehub
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, roc_auc_score
import nltk

# Загрузка необходимых ресурсов NLTK
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bozal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Bozal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
# Шаг 1: Загрузка данных через kagglehub
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")
df = pd.read_csv(f"{path}/spam.csv", encoding="latin-1")

# Проверка столбцов
print("Столбцы в датасете:", df.columns)

# Оставляем только первые два столбца
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

# Преобразуем метки в числовой формат: ham → 0, spam → 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

Столбцы в датасете: Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [15]:
# Шаг 2: Предобработка текста
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

df['cleaned_message'] = df['message'].apply(preprocess_text)





In [16]:
# Шаг 3: Векторизация текста
bow_vectorizer = CountVectorizer(max_features=5000)
X_bow = bow_vectorizer.fit_transform(df['cleaned_message'])

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_message'])

y = df['label']

In [17]:
# Шаг 4: Разделение на обучающую и тестовую выборки
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X_bow, y, test_size=0.3, random_state=42)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)


In [18]:
# Шаг 5: Построение моделей и их оценка
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": MultinomialNB()
}

for name, model in models.items():
    print(f"\nМодель: {name}")
    
    # Bag of Words
    model.fit(X_train_bow, y_train)
    y_pred_bow = model.predict(X_test_bow)
    print("\nBoW:")
    print(classification_report(y_test, y_pred_bow))
    print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_test_bow)[:, 1]))
    
    # TF-IDF
    model.fit(X_train_tfidf, y_train)
    y_pred_tfidf = model.predict(X_test_tfidf)
    print("\nTF-IDF:")
    print(classification_report(y_test, y_pred_tfidf))
    print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_test_tfidf)[:, 1]))



Модель: Logistic Regression

BoW:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1453
           1       0.99      0.84      0.91       219

    accuracy                           0.98      1672
   macro avg       0.99      0.92      0.95      1672
weighted avg       0.98      0.98      0.98      1672

ROC-AUC: 0.986830899383106

TF-IDF:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1453
           1       0.96      0.68      0.79       219

    accuracy                           0.95      1672
   macro avg       0.96      0.84      0.88      1672
weighted avg       0.95      0.95      0.95      1672

ROC-AUC: 0.9876794036586247

Модель: Naive Bayes

BoW:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1453
           1       0.90      0.93      0.92       219

    accuracy                           0.98      1672
  