#Дудник М. В. ИУ5-22М РК2 ММО

Необходимо сформировать два варианта векторизации признаков - на основе CountVectorizer и на основе TfidfVectorizer.
В качестве классификаторов необходимо использовать RandomForestClassifier и LogisticRegression. Для каждого метода необходимо оценить качество классификации. Сделайте вывод о том, какой вариант векторизации признаков в паре с каким классификатором показал лучшее качество.

In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
import zipfile
import requests
from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download and extract the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
response = requests.get(url)
zip_file = zipfile.ZipFile(BytesIO(response.content))
file_name = zip_file.namelist()[0]
data = zip_file.read('SMSSpamCollection').decode('utf-8')

# Load the dataset into a DataFrame
df = pd.read_csv(BytesIO(data.encode('utf-8')), sep='\t', names=['label', 'message'])

# Encode the labels
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)

# Define a function to perform the classification and evaluate the results
def evaluate_vectorizer(vectorizer, classifier, X_train, X_test, y_train, y_test):
    # Transform the text data using the provided vectorizer
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train the classifier
    classifier.fit(X_train_vec, y_train)

    # Predict on the test set
    y_pred = classifier.predict(X_test_vec)

    # Evaluate the accuracy
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Define the vectorizers
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# Define the classifiers
rf_classifier = RandomForestClassifier(random_state=42)
lr_classifier = LogisticRegression(max_iter=1000, random_state=42)

# Evaluate CountVectorizer + RandomForestClassifier
count_rf_accuracy = evaluate_vectorizer(count_vectorizer, rf_classifier, X_train, X_test, y_train, y_test)
print(f"CountVectorizer + RandomForestClassifier Accuracy: {count_rf_accuracy}")

# Evaluate TfidfVectorizer + RandomForestClassifier
tfidf_rf_accuracy = evaluate_vectorizer(tfidf_vectorizer, rf_classifier, X_train, X_test, y_train, y_test)
print(f"TfidfVectorizer + RandomForestClassifier Accuracy: {tfidf_rf_accuracy}")

# Evaluate CountVectorizer + LogisticRegression
count_lr_accuracy = evaluate_vectorizer(count_vectorizer, lr_classifier, X_train, X_test, y_train, y_test)
print(f"CountVectorizer + LogisticRegression Accuracy: {count_lr_accuracy}")

# Evaluate TfidfVectorizer + LogisticRegression
tfidf_lr_accuracy = evaluate_vectorizer(tfidf_vectorizer, lr_classifier, X_train, X_test, y_train, y_test)
print(f"TfidfVectorizer + LogisticRegression Accuracy: {tfidf_lr_accuracy}")

# Print the results
results = {
    "CountVectorizer + RandomForestClassifier": count_rf_accuracy,
    "TfidfVectorizer + RandomForestClassifier": tfidf_rf_accuracy,
    "CountVectorizer + LogisticRegression": count_lr_accuracy,
    "TfidfVectorizer + LogisticRegression": tfidf_lr_accuracy
}

best_combination = max(results, key=results.get)
print(f"The best combination is {best_combination} with an accuracy of {results[best_combination]}")


CountVectorizer + RandomForestClassifier Accuracy: 0.9847533632286996
TfidfVectorizer + RandomForestClassifier Accuracy: 0.9838565022421525
CountVectorizer + LogisticRegression Accuracy: 0.9883408071748879
TfidfVectorizer + LogisticRegression Accuracy: 0.9766816143497757
The best combination is CountVectorizer + LogisticRegression with an accuracy of 0.9883408071748879
