In [12]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [13]:
# Выбор новостных категорий из датасета
categories = ['sci.med','sci.electronics', 'sci.space', 'rec.sport.baseball', 'soc.religion.christian']

news_data = fetch_20newsgroups(subset='all', categories=categories)

# Создание обучающей и тестовой выборок
X, y = news_data.data, news_data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.45, random_state=42)

# Количество документов в наборе данных
print(len(news_data.data))

4952


In [14]:
# Пример документа
print(news_data.data[0])

Organization: University of Illinois at Chicago, academic Computer Center
From: <U49839@uicvm.uic.edu>
Subject: Re: Harry Caray
Distribution: na
Lines: 17


last night bill veeck cam to me in my dreams and this is what he said:

cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck
cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck cubs scuk
cubs suck cubs suck cubs suck cubs cuck cubs suck cubs suck cubs suck
cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck
cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck

oh yeah, he aqlso added that harry is a drunken idiot who shoulda
stayed in st louis where his heart is, but also added that fair weathered
fans all like to be together.  i guess this is the reason harry is now
a cub fan, bud man.  note he never really left st, louis.

jim walker

go sox, cubs suck!



In [15]:
# Проверим наличие и структуру метаданных в документах

import random

def metadata_demo(data, num_indices = 3):
    max_index = len(data)
    document_num = [random.randint(0, max_index - 1) for _ in range(num_indices)]
    for i in document_num:
        print(data[i][:200])
        print("-" * 50)
metadata_demo(X)

From: sbishop@desire.wright.edu
Subject: Re: Hismanal, et. al.--side effects
Organization:  Wright State University 
Lines: 22

In article <1993Apr21.024103.29880@spdcc.com>, dyer@spdcc.com (Steve Dye
--------------------------------------------------
From: hudson@athena.cs.uga.edu (Paul Hudson Jr)
Subject: Re: Hell_2:  Black Sabbath
Organization: University of Georgia, Athens
Lines: 8

In article <Apr.22.00.57.03.1993.2118@geneva.rutgers.edu> jprz
--------------------------------------------------
From: geb@cs.pitt.edu (Gordon Banks)
Subject: Re: CAN'T BREATHE
Article-I.D.: pitt.19438
Reply-To: geb@cs.pitt.edu (Gordon Banks)
Organization: Univ. of Pittsburgh Computer Science
Lines: 33

In artic
--------------------------------------------------


In [16]:
# Удаление метаданных, символов и пунктуации

import re

def clean_text(text):
    # Удаление метаданных (начинающихся с ключевых слов)
    text = re.sub(r'^(From|Subject|Organization|Lines):.*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'\b\w*postinghost\w*\b', '', text, flags=re.IGNORECASE)
    # Удаление цитат (строки, начинающиеся с ">")
    text = re.sub(r'^>.*$', '', text, flags=re.MULTILINE)
    # Удаление пустых строк
    text = re.sub(r'\n+', '\n', text)
    # Удаление адресов электронной почты
    text = re.sub(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})|()', '', text)
    # Удаление символов и пунктуации
    text = re.sub(r'[!"#$%&\'()*+,./:;<=>?@\[\]^_`{|}~«»—*\-\—]', '', text)
    # Удаление чисел
    text = re.sub(r'\d+', '', text)
    return text.strip()

X_train = [clean_text(document) for document in X_train]
X_test = [clean_text(document) for document in X_test]

metadata_demo(X_train)

Keywords WHY
In article   William Pollak writes
Deletions
Geez Dal must have slipped something into Teds drink sometime  Comparing
Prince to Pagnozzi offensively is laughable  Prince has never hit wel
--------------------------------------------------
NntpPostingHost aisunaiugaedu
In article   Peter Tryndoch writes
Up to  microamperes     on hook
Over something like  mA   off hook
In between  defective line and the phone company comes looking
     
--------------------------------------------------
Keywords hearing loss vitamin A
ArticleID bananaApr
Distribution sci
i heard a news report indicating research showing improved         
hearing in people taking vitamin A the research showed that new
--------------------------------------------------


In [17]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qqqq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\qqqq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\qqqq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\qqqq\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# Токенизация по словам
tokenized_news_data = [word_tokenize(document.lower()) for document in X_train]
tokenized_test_news_data = [word_tokenize(document.lower()) for document in X_test]
# Удаление стоп-слов
stop_words = sorted(stopwords.words('english'))
nostopword_tokenized_news_data = [
    [w for w in document if w.strip() and w not in stop_words]
    for document in tokenized_news_data
]
nostopword_tokenized_test_news_data = [
    [w for w in document if w.strip() and w not in stop_words]
    for document in tokenized_test_news_data
]


In [19]:
from nltk.stem import WordNetLemmatizer

nltk_lemmatizer = WordNetLemmatizer()
# Лемматизация обработанного двумерного массива слов
lemmatized_tokens = [
    [nltk_lemmatizer.lemmatize(w) for w in document] 
    for document in nostopword_tokenized_news_data
]
lemmatized_test_tokens = [
    [nltk_lemmatizer.lemmatize(w) for w in document] 
    for document in nostopword_tokenized_test_news_data
]

# Преобразование вложенных массивов в строки
lemmatized_documents = [' '.join(tokens) for tokens in lemmatized_tokens]
lemmatized_test_documents = [' '.join(tokens) for tokens in lemmatized_test_tokens]

print(lemmatized_tokens[0])

['distribution', 'world', 'nntppostinghost', 'dolphinzoocsyaleedu', 'recently', 'ive', 'come', 'upon', 'body', 'literature', 'promotes', 'colon', 'cleansing', 'vital', 'aid', 'preventive', 'medicine', 'nutrition', 'particular', 'dr', 'bernard', 'jenssen', 'book', 'colon', 'cleansing', 'health', 'longevity', 'title', 'actually', 'escape', 'similar', 'claim', 'regular', 'selfadministered', 'colonic', 'along', 'certain', 'orally', 'ingested', 'debrisloosening', 'agent', 'boost', 'immune', 'system', 'significant', 'degree', 'also', 'plug', 'unique', 'appliance', 'called', 'colema', 'board', 'facilitates', 'selfadministration', 'colonic', 'sell', 'californiabased', 'company', 'also', 'plug', 'vitratox', 'product', 'chemical', 'agent', 'choice', 'include', 'volcanic', 'ash', 'supposedly', 'electrical', 'charge', 'psyllium', 'powder', 'bulkiness', 'anyone', 'know', 'anything', 'colon', 'cleansing', 'theory', 'particular', 'colema', 'board', 'related', 'product', 'id', 'interested', 'hear', 'r

In [20]:
import time
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [21]:
# Инициализация моделей для классификации

# Векторизация по униграммам
model1_LR = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('logistic regression', LogisticRegressionCV(cv=3, n_jobs=-1))
])

# Векторизация по униграммам и биграммам 
model2_LR = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('logistic regression', LogisticRegressionCV(cv=3, n_jobs=-1))
])

# Векторизация по униграммам и биграммам, параметр кросс-валидации увеличен до cv=5
model3_LR = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('logistic regression', LogisticRegressionCV(cv=5, n_jobs=-1))
])

# Векторизация по униграммам и триграммам
model4_LR = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english', ngram_range=(2, 3))),
    ('logistic regression', LogisticRegressionCV(cv=3, n_jobs=-1))
])

# Метод случайного леса
model5_RFC = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('random_forest', RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1))
])

In [11]:
start_time = time.time()
model1_LR.fit(lemmatized_documents, y_train)
train_time = time.time() - start_time

train_accuracy = accuracy_score(y_train, model1_LR.predict(lemmatized_documents))
test_accuracy = accuracy_score(y_test, model1_LR.predict(lemmatized_test_documents))
print(f'Accuracy on train for model1_LR: {train_accuracy:.4f}\nAccuracy on test for model1_LR: {test_accuracy:.4f}')
print(f'Train time for model1_LR: {train_time}')

start_time = time.time()
model2_LR.fit(lemmatized_documents, y_train)
train_time = time.time() - start_time

train_accuracy = accuracy_score(y_train, model2_LR.predict(lemmatized_documents))
test_accuracy = accuracy_score(y_test, model2_LR.predict(lemmatized_test_documents))
print(f'Accuracy on train for model2_LR: {train_accuracy:.4f}\nAccuracy on test for model2_LR: {test_accuracy:.4f}')
print(f'Train time for model2_LR: {train_time}')

start_time = time.time()
model3_LR.fit(lemmatized_documents, y_train)
train_time = time.time() - start_time

train_accuracy = accuracy_score(y_train, model3_LR.predict(lemmatized_documents))
test_accuracy = accuracy_score(y_test, model3_LR.predict(lemmatized_test_documents))
print(f'Accuracy on train for model3_LR: {train_accuracy:.4f}\nAccuracy on test for model3_LR: {test_accuracy:.4f}')
print(f'Train time for model3_LR: {train_time}')

start_time = time.time()
model4_LR.fit(lemmatized_documents, y_train)
train_time = time.time() - start_time

train_accuracy = accuracy_score(y_train, model4_LR.predict(lemmatized_documents))
test_accuracy = accuracy_score(y_test, model4_LR.predict(lemmatized_test_documents))
print(f'Accuracy on train for model4_LR: {train_accuracy:.4f}\nAccuracy on test for model4_LR: {test_accuracy:.4f}')
print(f'Train time for model4_LR: {train_time}')

Accuracy on train for model1_LR: 1.0000
Accuracy on test for model1_LR: 0.9480
Train time for model1_LR: 4.732477903366089
Accuracy on train for model2_LR: 1.0000
Accuracy on test for model2_LR: 0.9556
Train time for model2_LR: 20.87546396255493
Accuracy on train for model3_LR: 1.0000
Accuracy on test for model3_LR: 0.9547
Train time for model3_LR: 33.637930393218994
Accuracy on train for model4_LR: 1.0000
Accuracy on test for model4_LR: 0.8618
Train time for model4_LR: 30.345017671585083


In [22]:
start_time = time.time()
model5_RFC.fit(lemmatized_documents, y_train)
train_time = time.time() - start_time

train_accuracy = accuracy_score(y_train, model5_RFC.predict(lemmatized_documents))
test_accuracy = accuracy_score(y_test, model5_RFC.predict(lemmatized_test_documents))
print(f'Accuracy on train for model4_RFC: {train_accuracy:.4f}\nAccuracy on test for model4_RFC: {test_accuracy:.4f}')
print(f'Train time for model4_RFC: {train_time}')

Accuracy on train for model4_RFC: 1.0000
Accuracy on test for model4_RFC: 0.8816
Train time for model4_RFC: 10.096070051193237
