In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow import keras
from tensorflow.keras.layers import Dense


In [2]:
data_load = pd.read_csv('bbc-news-data.csv', sep='\t')
data_load

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


In [3]:
number_team = len(data_load['category'].unique())
number_team

5

In [4]:
def clean_text(text):
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))
    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    return text



In [5]:
def transform(data_load, column, test_size=0.2):
    data = data_load.copy()
    data.replace({'category':{'business':0, 'entertainment':1, 'politics':2, 'sport':3, 'tech':4}}, inplace=True)
    target = data['category']
    data = data[column]
    data = data.apply(lambda x: clean_text(x))
    tfidf = TfidfVectorizer(max_features=3000, stop_words='english')
    data = tfidf.fit_transform(data).toarray()
    if test_size == 0:
        return data, target
    x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=test_size, random_state=42)
    return x_train, x_test, y_train, y_test
    


In [6]:
# x_train, x_test, y_train, y_test = transform(data_load, 'title')
# x_train

In [7]:
model = keras.Sequential([Dense(100, activation='relu'),
                          Dense(50, activation='relu'),
                          Dense(number_team, activation='softmax')])    
model.compile(loss='MeanSquaredError', metrics=['MeanSquaredError', 'AUC'], optimizer='adam')


In [8]:
def train_model(x_train, y_train):
    y_train_cat = keras.utils.to_categorical(y_train, number_team)
    model.fit(x_train, y_train_cat, batch_size=5, epochs=100, verbose=False)
    return model

In [9]:
x_train, x_test, y_train, y_test = transform(data_load, 'title')
y = train_model(x_train, y_train).predict(x_test)
y_pred = np.argmax(y, axis=1)
print(f'accuracy = {sum(y_pred == y_test) / len(y_pred)}')

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


accuracy = 0.8089887640449438


In [10]:
x_train, x_test, y_train, y_test = transform(data_load, 'content')
y = train_model(x_train, y_train).predict(x_test)
y_pred_content = np.argmax(y, axis=1)
print(f'accuracy = {sum(y_pred_content == y_test) / len(y_pred_content)}')

accuracy = 0.9640449438202248


In [11]:
out = pd.DataFrame()
out['title'] = y_pred
out['content'] = y_pred_content
out

Unnamed: 0,title,content
0,0,0
1,0,0
2,3,3
3,0,0
4,2,2
...,...,...
440,3,1
441,0,0
442,2,2
443,1,1
