In [0]:
#importing important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
url = 'https://raw.githubusercontent.com/mdsohaibuddin/BBC-News-Classification/master/bbc-text.csv'
df = pd.read_csv(url, error_bad_lines=False)

In [3]:
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
category    2225 non-null object
text        2225 non-null object
dtypes: object(2)
memory usage: 34.9+ KB


In [5]:
df['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [6]:
df.columns

Index(['category', 'text'], dtype='object')

In [7]:
#cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus=[]
for i in range(0, 2225):
    text = re.sub('[^a-zA-Z]',' ', df['text'][i])
    text = text.lower()
    text = text.split()
    ps= PorterStemmer()
    text=[ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
    text=' '.join(text)
    corpus.append(text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
corpus[1]

'worldcom boss left book alon former worldcom boss berni ebber accus overse bn bn fraud never made account decis wit told juror david myer made comment question defenc lawyer argu mr ebber respons worldcom problem phone compani collaps prosecutor claim loss hidden protect firm share mr myer alreadi plead guilti fraud assist prosecutor monday defenc lawyer reid weingarten tri distanc client alleg cross examin ask mr myer ever knew mr ebber make account decis awar mr myer repli ever know mr ebber make account entri worldcom book mr weingarten press repli wit mr myer admit order fals account entri request former worldcom chief financi offic scott sullivan defenc lawyer tri paint mr sullivan admit fraud testifi later trial mastermind behind worldcom account hous card mr ebber team meanwhil look portray affabl boss admiss pe graduat economist whatev abil mr ebber transform worldcom rel unknown bn telecom giant investor darl late worldcom problem mount howev competit increas telecom boom pet

In [0]:
#keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)

In [10]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)

Using TensorFlow backend.


In [0]:
tokenizer.fit_on_texts(corpus) 
X = tokenizer.texts_to_sequences(corpus)

In [12]:
X[1]

[1475,
 583,
 322,
 364,
 1329,
 174,
 1475,
 583,
 4264,
 1491,
 562,
 3262,
 35,
 35,
 838,
 361,
 45,
 332,
 183,
 1987,
 39,
 3898,
 416,
 4486,
 45,
 417,
 372,
 857,
 917,
 544,
 3,
 1491,
 328,
 1475,
 166,
 94,
 28,
 1607,
 1620,
 96,
 752,
 2474,
 399,
 32,
 88,
 3,
 4486,
 160,
 1903,
 1154,
 838,
 1262,
 1620,
 507,
 857,
 917,
 3749,
 6805,
 101,
 1673,
 1764,
 771,
 811,
 1904,
 236,
 3,
 4486,
 452,
 1374,
 3,
 1491,
 15,
 332,
 183,
 1136,
 3,
 4486,
 2475,
 452,
 181,
 3,
 1491,
 15,
 332,
 1343,
 1475,
 364,
 3,
 6805,
 677,
 2475,
 1987,
 3,
 4486,
 648,
 369,
 1765,
 332,
 1343,
 1645,
 174,
 1475,
 155,
 362,
 140,
 1540,
 1251,
 857,
 917,
 101,
 2539,
 3,
 1251,
 648,
 838,
 6806,
 326,
 503,
 6208,
 453,
 1475,
 332,
 233,
 356,
 3,
 1491,
 165,
 831,
 60,
 2476,
 12867,
 583,
 3163,
 7520,
 3488,
 926,
 1955,
 1025,
 3,
 1491,
 2333,
 1475,
 1421,
 3389,
 35,
 1178,
 608,
 737,
 5017,
 643,
 1475,
 166,
 2608,
 131,
 373,
 123,
 1178,
 1525,
 1018,
 32,
 126,
 1

In [13]:
#padding
from keras.preprocessing.sequence import pad_sequences

X = pad_sequences(X)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (2225, 2203)


In [14]:
X[1].shape

(2203,)

In [0]:
mapping = {i: n for n, i in enumerate(list(set(df.category)))}

In [16]:
mapping

{'business': 0, 'entertainment': 1, 'politics': 4, 'sport': 3, 'tech': 2}

In [17]:
list(set(df.category))

['business', 'entertainment', 'tech', 'sport', 'politics']

In [0]:
df['target'] = df.category.map(mapping)

In [19]:
df.head()

Unnamed: 0,category,text,target
0,tech,tv future in the hands of viewers with home th...,2
1,business,worldcom boss left books alone former worldc...,0
2,sport,tigers wary of farrell gamble leicester say ...,3
3,sport,yeading face newcastle in fa cup premiership s...,3
4,entertainment,ocean s twelve raids box office ocean s twelve...,1


In [0]:
# one hot encode
from keras.utils import to_categorical
Y = to_categorical(df['target'])

In [21]:
Y

array([[0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.]], dtype=float32)

In [0]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

In [0]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

 

In [0]:
N_words = len(tokenizer.word_counts)

In [26]:
embed_dim = 128
lstm_out = 200
batch_size = 32

model = Sequential()
model.add(Embedding(N_words + 1, embed_dim, input_length = X.shape[1], dropout = 0.2, name='emb'))
model.add(LSTM(lstm_out))
model.add(Dense(5,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())






  




Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
emb (Embedding)              (None, 2203, 128)         2431872   
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               263200    
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 1005      
Total params: 2,696,077
Trainable params: 2,696,077
Non-trainable params: 0
_________________________________________________________________
None


In [42]:
#Here we train the Network.

train = model.fit( X_train, y_train, batch_size = 128, epochs=10,  verbose = 5)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [43]:
result = model.evaluate(X_test, y_test)



In [44]:
print("SCORE, ACCURACY: ", result)

SCORE, ACCURACY:  [0.14192678839847278, 0.9730337078651685]
