# Построить классификатор для тем: 

'Россия'
'Мир'
'Экономика'
'Спорт'
'Культура' 

In [None]:
import pandas as pd
import numpy as np
import re
import joblib as pickle

#['Россия', 'Мир', 'Экономика', 'Спорт', 'Культура']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import tree
import nltk

import pymorphy2
import pymorphy3
#from nltk.corpus import stopwords

In [None]:
%%time
df = pd.read_csv('lenta-ru-news.csv')

In [None]:
df.sample(2, random_state=9)

In [None]:
df[df['topic'] == 'Культура'].shape

In [None]:
df.shape

In [None]:
len(pd.unique(df['topic']))

In [None]:
df.topic.value_counts()

In [None]:
df.topic.value_counts(normalize=True)

In [None]:
df.isna().sum()

In [None]:
# Drop every row that has a missing value in at least one column
# (no `subset` is given, so all columns are considered); done in place.
df.dropna( inplace=True)

In [None]:
df.isna().sum()

In [None]:
df.shape

In [None]:
RANDOM_STATE = 9

In [None]:
# Обработка, обучение и т. п. идут очень долго или вообще не завершаются,
# поэтому попробую уменьшить объём данных.

In [None]:
# The five target classes of the classifier.
needed_topics = ['Россия', 'Мир', 'Экономика', 'Спорт', 'Культура']
# Columns kept for the rest of the notebook; all other columns are discarded.
usefull_columns = ['title','text','topic','tags']
# Boolean mask: True for rows whose topic is one of the target classes.
topic_filter = df['topic'].isin(needed_topics)
# Rebind `df` to the reduced frame (selected rows and columns only).
df = df.loc[topic_filter,usefull_columns]

In [None]:
df.sample(2, random_state=RANDOM_STATE)

In [None]:
df.isna().sum()

In [None]:
df.topic.value_counts()

In [None]:
df.shape

### Базовая обработка

In [None]:
%%time
text_transformer = TfidfVectorizer()

In [None]:
%%time
# Learn the vocabulary and IDF weights and build the sparse TF-IDF matrix
# for all documents in df['text'].
# NOTE(review): the vectorizer is fit on the whole corpus before the
# train/test split performed later, so held-out documents influence the
# learned vocabulary and IDF weights — consider splitting first.
text = text_transformer.fit_transform(df['text'])
#CPU times: total: 1min 25s
#Wall time: 1min 25s
#

In [None]:
%%time
# Hold out 20% of the rows for evaluation; the fixed seed makes the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    text, 
    df['topic'], 
    test_size=0.20, 
    random_state=RANDOM_STATE)

#CPU times: total: 375 ms
#Wall time: 380 ms

In [None]:
df['text']

In [None]:
text.shape

In [None]:
%%time
print(X_train.shape, X_test.shape)

In [None]:
%%time
clf=tree.DecisionTreeClassifier()

In [None]:
%%time
clf.fit(X_train, y_train)

# CPU times: total: 1h 49min 53s
# Wall time: 1h 50min 11s

In [None]:
%%time
# Predict topics for the held-out rows and report the F1 score;
# average='weighted' weights each class's F1 by its number of true samples.
predict = clf.predict(X_test)
print('Качество модели F1', f1_score(y_test,predict,average='weighted'))

# Качество модели F1 0.7874701153209642
# CPU times: total: 859 ms
# Wall time: 863 ms

# Изменим Токенизатор

In [None]:
import nltk
import ssl

# NOTE(security): this replaces the factory that the standard library uses to
# build default HTTPS contexts with the *unverified* one, i.e. certificate
# checks are turned off for the whole process, not only for nltk downloads.
# The override is applied only when this Python build exposes the helper.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

#nltk.download()

In [None]:
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

In [None]:
stop_russian = stopwords.words('russian')

In [None]:
print(stop_russian)

In [None]:
%%time
# Split the raw documents *before* vectorizing. Fitting TF-IDF on the whole
# corpus lets the held-out documents shape the vocabulary and the IDF
# weights, which leaks test information into training.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], df['topic'], test_size=0.20, random_state=RANDOM_STATE)

# Unigrams only, lower-cased, Russian stop words removed, vocabulary capped
# at the 10 000 most frequent terms.
text_transformer = TfidfVectorizer(stop_words=stop_russian,
                                   ngram_range=(1, 1),
                                   lowercase=True,
                                   max_features=10_000)

# Vocabulary/IDF come from the training documents only; the test documents
# are merely projected onto that vocabulary. The full-corpus matrix `text`
# is intentionally no longer rebuilt here.
X_train = text_transformer.fit_transform(text_train)
X_test = text_transformer.transform(text_test)

#
#CPU times: total: 1min 39s
#Wall time: 1min 39s
#

In [None]:
X_train.shape

In [None]:
%%time
# Decision tree limited to depth 3, fitted on the training split in one
# expression (`fit` returns the estimator itself).
clf2 = tree.DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)

#CPU times: total: 23.2 s
#Wall time: 23.3 s
#

In [None]:
#Качество модели по метрике F1 0.3221061595134945
#CPU times: total: 4.11 s
#Wall time: 4.12 s
#
%%time
print('Качество модели по метрике F1', f1_score(y_train,clf2.predict(X_train), average='weighted'))

#Качество модели по метрике F1 0.3221061595134945
#CPU times: total: 4.11 s
#Wall time: 4.12 s

# Очистка текста

In [None]:
# Matches every run of characters that are not Russian letters.
# 'Ё' (U+0401) and 'ё' (U+0451) lie outside the contiguous А-я range and
# therefore have to be listed explicitly.
pattern = r'[^А-Яа-яЁё]+'

In [None]:
def remove_trash(list):
    """Replace each run of non-Russian-letter characters with one space.

    Args:
        list: Iterable of strings (for example a pandas Series of article
            texts). The name shadows the builtin; it is kept unchanged so
            that existing callers keep working, and it is never rebound
            inside the function.

    Returns:
        A new Python list holding one cleaned string per input item. If an
        item is not a string, the error message is printed and the input
        object is returned unchanged (best-effort behaviour).
    """
    # 'Ё'/'ё' are outside the contiguous А-я code-point range; without them
    # in the class, words such as "ёлка" would be split into pieces.
    non_cyrillic = re.compile(r'[^А-Яа-яЁё]+')
    try:
        return [non_cyrillic.sub(' ', item) for item in list]
    except TypeError as e:
        # `re` raises TypeError for non-string items (e.g. None or NaN).
        print(e)
        return list

In [None]:
df['text_clean'] = remove_trash(df['text'])

In [None]:
df[['text', 'text_clean']].head(2)

In [None]:
# Second row, first column (raw text), addressed purely by position.
# `.iloc[1][0]` would apply an integer key to a Series labelled with column
# names, which relies on a positional fallback deprecated in pandas 2.x.
df[['text', 'text_clean']].head(2).iloc[1, 0]
#df['text'][1]

In [None]:
# Second row, second column (cleaned text), addressed purely by position.
# `.iloc[1][1]` would apply an integer key to a Series labelled with column
# names, which relies on a positional fallback deprecated in pandas 2.x.
df[['text', 'text_clean']].head(2).iloc[1, 1]
#df['text_clean'][1]

In [None]:
df.isna().sum()

In [None]:
df.head(2)

# Лемматизация

In [None]:
%%time
morph = pymorphy3.MorphAnalyzer()
#CPU times: total: 172 ms
#Wall time: 210 ms

In [None]:
from functools import lru_cache


@lru_cache(maxsize=500_000)
def _normal_form(word):
    """Return the normal form of the first parse that `morph` gives for *word*.

    News texts repeat the same word forms constantly, so each distinct form
    is analysed once and then served from the (bounded) cache.
    """
    return morph.parse(word)[0].normal_form


def lemmatize(row):
    """Lemmatize the cleaned text of one data-frame row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose 'text_clean'
            entry is a whitespace-separated string.

    Returns:
        The normal forms of all words longer than one character, joined by
        single spaces; an empty string when no such word exists.
    """
    return " ".join(
        _normal_form(word)
        for word in row['text_clean'].split()
        if len(word) > 1  # single-character tokens are dropped
    )

In [None]:
%%time
# axis=1 hands every row to `lemmatize`, which reads its 'text_clean' entry;
# the returned strings become the new 'text_clean_normal' column.
df['text_clean_normal'] = df.apply(lemmatize, axis=1)
#CPU times: total: 2h 29min 7s
#Wall time: 2h 29min 41s

# качество модели после лемматизации

In [None]:
%%time
# Split the lemmatized documents *before* vectorizing. Fitting TF-IDF on the
# whole corpus lets the held-out documents shape the vocabulary and the IDF
# weights, which leaks test information into training.
text_train, text_test, y_train, y_test = train_test_split(
    df['text_clean_normal'],
    df['topic'],
    test_size=0.20,
    random_state=RANDOM_STATE)

# Unigrams only, lower-cased, Russian stop words removed, vocabulary capped
# at the 10 000 most frequent terms.
text_transformer = TfidfVectorizer(stop_words=stop_russian,
                                   ngram_range=(1, 1),
                                   lowercase=True,
                                   max_features=10000)

# Vocabulary/IDF come from the training documents only; the test documents
# are merely projected onto that vocabulary. The full-corpus matrix
# `text_norm` is intentionally no longer built.
X_train = text_transformer.fit_transform(text_train)
X_test = text_transformer.transform(text_test)

#CPU times: total: 1min 10s
#Wall time: 1min 10s

In [None]:
%%time
# Decision tree with default hyper-parameters, trained on the features built
# from the lemmatized text.
clf_norm = tree.DecisionTreeClassifier()
clf_norm.fit(X_train,y_train)
# Predictions for the held-out rows; they are scored in the next cell.
pred_norm = clf_norm.predict(X_test)

#CPU times: total: 44min 27s
#Wall time: 44min 28s

In [None]:
%%time
print('Качество модели по метрике F1 после лемматизации', f1_score(y_test,pred_norm,average='weighted'))
#Качество модели по метрике F1 после лемматизации 0.8115521029047505
#CPU times: total: 672 ms
#Wall time: 661 ms

In [None]:
# A notebook cell echoes only its last expression, so the preview has to be
# printed explicitly; otherwise its result is silently discarded.
print(df['text_clean_normal'].head(2))
df.columns