In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow import keras
from tensorflow.keras.layers import Dense
import warnings
warnings.filterwarnings('ignore')


# загрузка датасета
https://www.kaggle.com/hgultekin/bbcnewsarchive

In [3]:
data_load = pd.read_csv('bbc-news-data.csv', sep='\t')
data_load

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


# кол-во классов целевой переменной

In [4]:
number_team = len(data_load['category'].unique())
number_team

5

# функция очистки данных(входит в предобработку)

In [5]:
def clean_text(text):
    if not isinstance(text, str):
        text = str(text)
    
#     text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))
    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    return text



# функция предобработки данных

In [6]:
def transform(data_load, column, test_size=0.2):
    data = data_load.copy()
    data.replace({'category':{'business':0, 'entertainment':1, 'politics':2, 'sport':3, 'tech':4}}, inplace=True)
    target = data['category']
    data = data[column]
    data = data.apply(lambda x: clean_text(x))
    tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
    data = tfidf.fit_transform(data).toarray()
    if test_size == 0:
        return data, target
    x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=test_size, random_state=42)
    return x_train, x_test, y_train, y_test
    


In [7]:
# x_train, x_test, y_train, y_test = transform(data_load, 'title')
# x_train

# функция тренировки модели(keras)

In [8]:
def train_model(x_train, y_train):
    model = keras.Sequential([Dense(32, activation='sigmoid'),
#                           Dense(16, activation='relu'),
                          Dense(number_team, activation='softmax')])    
    model.compile(loss='MeanSquaredError', metrics=['CategoricalAccuracy'], optimizer='Adam')
    y_train_cat = keras.utils.to_categorical(y_train, number_team)
    model.fit(x_train, y_train_cat, batch_size=5, epochs=25, verbose=False)
    return model

# тренировка только на столбце 'content'

In [9]:
%%time
x_train, x_test, y_train, y_test = transform(data_load, 'content')
model_content = train_model(x_train, y_train)
y = model_content.predict(x_test)
y_pred_content = np.argmax(y, axis=1)
print(f'accuracy = {sum(y_pred_content == y_test) / len(y_pred_content)}')

accuracy = 0.9752808988764045
Wall time: 13.7 s


# проверка модели на всех данных 

In [10]:
data, target = transform(data_load, 'content', 0)
y = model_content.predict(data)
y_pred_content = np.argmax(y, axis=1)
print(f'accuracy = {sum(y_pred_content == target) / len(y_pred_content)}')

accuracy = 0.9950561797752809


# объеденим  'title' и 'content' и обучим модель снова

In [11]:
%%time
data_load['title_content'] = data_load['title'] + ' ' + data_load['content']
x_train, x_test, y_train, y_test = transform(data_load, 'title_content')

model_dual = train_model(x_train, y_train)

Wall time: 12.5 s


In [12]:
y = model_dual.predict(x_test)
y_pred = np.argmax(y, axis=1)
print(f'accuracy = {sum(y_pred == y_test) / len(y_pred)}')

accuracy = 0.9775280898876404


# метрика не значительно выросла от использования объединения 'title' и 'content'

# посмотрим матрицу ошибок

In [13]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[111,   1,   1,   0,   2],
       [  0,  72,   0,   0,   0],
       [  2,   0,  73,   0,   1],
       [  1,   0,   0, 101,   0],
       [  0,   2,   0,   0,  78]], dtype=int64)

#  лематизация

In [14]:
from pywsd.utils import lemmatize_sentence
from nltk.corpus import stopwords


Warming up PyWSD (takes ~10 secs)... took 3.6274592876434326 secs.


In [15]:
# x = lemmatize_sentence(data_load['content'][0])
# text = ' '.join(x)
# stopWords = set(stopwords.words('english'))
# len(' '.join(word_tokenize(text)))

## перепишем функцию очистки

In [16]:
def transform_lem(data_load, column, test_size=0.2):
    data = data_load.copy()
    data.replace({'category':{'business':0, 'entertainment':1, 'politics':2, 'sport':3, 'tech':4}}, inplace=True)
    target = data['category']
    data = data[column]
    data = data.apply(lambda x: clean_text(x))
    stopWords = set(stopwords.words('english'))
    for string in range(len(data)):                         # приведение в нормальную форму
        x = lemmatize_sentence(data[string])
        list_words = []
        for word in x:                                      # добавим удаление стоп слов 
            word = word.strip()
            if not word in stopWords and len(word) > 2:
                list_words.append(word)
        data[string] = ' '.join(list_words)
    tfidf = TfidfVectorizer(max_features=10000)
    data = tfidf.fit_transform(data).toarray()
    if test_size == 0:
        return data, target
    x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=test_size, random_state=42)
    return x_train, x_test, y_train, y_test

In [17]:
# %%time
# a = transform_lem(data_load[:3], 'content', 0)

In [18]:
# a[0][0], len(a[0][0])

# обучим ту же модель по лематизированным данным

## без объединения 'title' и 'content'

In [19]:
%%time
x_train, x_test, y_train, y_test = transform_lem(data_load, 'content')
model_content_lem = train_model(x_train, y_train)

Wall time: 2min 32s


In [20]:
y = model_content_lem.predict(x_test)
y_pred = np.argmax(y, axis=1)
print(f'accuracy = {sum(y_pred == y_test) / len(y_pred)}')

accuracy = 0.9775280898876404


In [21]:
confusion_matrix(y_test, y_pred)

array([[113,   1,   1,   0,   0],
       [  0,  72,   0,   0,   0],
       [  3,   0,  71,   1,   1],
       [  1,   0,   0, 101,   0],
       [  0,   2,   0,   0,  78]], dtype=int64)

## объединенные 'title' и 'content'

In [22]:
%%time
# data_load['title_content'] = data_load['title'] + ' ' + data_load['content']
x_train, x_test, y_train, y_test = transform_lem(data_load, 'title_content')
model_dual_lem = train_model(x_train, y_train)

Wall time: 2min 34s


In [23]:
y = model_dual_lem.predict(x_test)
y_pred = np.argmax(y, axis=1)
print(f'accuracy = {sum(y_pred == y_test) / len(y_pred)}')

accuracy = 0.9752808988764045


## объединение заголовка с телом поста не приносит плодов, исключим эту функцию на конечную модель. лематизация приносит так же незначительное улучшение метрик. в финальной модели оставим лематизацию.

### сохраняем модель

In [31]:
# наилучшая модель model_content_lem

name_model = 'finaly_model_kurs.h5'

model_content_lem.save(name_model)
# model_content.save_weights('weights.h5')

### загрузим и проверим модель

In [32]:
from tensorflow.keras.models import load_model
model_load = load_model(name_model)

# model_load.load_weights('weights.h5')

In [34]:
%%time
x_train, x_test, y_train, y_test = transform_lem(data_load, 'content')
y = model_load.predict(x_test)
y_pred = np.argmax(y, axis=1)
print(f'accuracy = {sum(y_pred == y_test) / len(y_pred)}')

accuracy = 0.9775280898876404
Wall time: 67.1 ms
