In [10]:
import pandas as pd
import numpy as np
import torch
import torchvision                  
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import re,string
import torch.optim as optim
from sklearn.model_selection import train_test_split

from torchtext.legacy.data import field, TabularDataset, BucketIterator
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence


In [3]:
# text preprocessing

df = pd.read_csv('bbc-text.csv')

# print(df.head())

print(df.category.unique())

text = np.array(df.text)

# print(text)

['tech' 'business' 'sport' 'entertainment' 'politics']


In [4]:
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [5]:
def Cleanup(data):
    data = data.lower()#converting string to lowercase
    res2 = re.sub(f'[{string.punctuation}]+',' ',data)#removing non english and special characters
    res3 = re.sub(r'[^a-z0-9A-Z\s]+',' ',res2)#removing anyother that is not consider in above
    res4 = re.sub(r'(\n)+',' ',res3)#removing all new line characters
    res = re.sub(r'\s{2,}',' ',res4)#remove all the one or more consecutive occurance of sapce
    res = res.strip()
    return res 


df['text'] = df['text'].apply(Cleanup)
df.head()


Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldcom...
2,sport,tigers wary of farrell gamble leicester say th...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [6]:
map_classes = {j:i for i,j in list(enumerate(df['category'].unique()))}
map_classes

{'tech': 0, 'business': 1, 'sport': 2, 'entertainment': 3, 'politics': 4}

In [7]:
df['category'] = df['category'].apply(lambda x: map_classes[x])
df.head()

Unnamed: 0,category,text
0,0,tv future in the hands of viewers with home th...
1,1,worldcom boss left books alone former worldcom...
2,2,tigers wary of farrell gamble leicester say th...
3,2,yeading face newcastle in fa cup premiership s...
4,3,ocean s twelve raids box office ocean s twelve...


In [8]:
df.shape

(2225, 2)

In [9]:
df.isna().sum()

category    0
text        0
dtype: int64

In [11]:
np.unique(np.array(df['category']),return_counts=True)

(array([0, 1, 2, 3, 4], dtype=int64),
 array([401, 510, 511, 386, 417], dtype=int64))

In [14]:
train,test = train_test_split(df,test_size=0.1)
print(train.shape)
print(test.shape)


(2002, 2)
(223, 2)


In [15]:
train_data,validation = train_test_split(train,test_size=0.2)
print(train_data.shape)
print(validation.shape)

(1601, 2)
(401, 2)


In [17]:
train_data = train_data.reset_index(drop=True)
test = test.reset_index(drop=True)
validation = validation.reset_index(drop=True)

print(train_data.shape)
print(validation.shape)
print(test.shape)

(1601, 2)
(401, 2)
(223, 2)


In [22]:
print(np.unique(train_data.category,return_counts=True))
print(np.unique(validation.category,return_counts=True))
print(np.unique(test.category,return_counts=True))

(array([0, 1, 2, 3, 4], dtype=int64), array([286, 360, 358, 288, 309], dtype=int64))
(array([0, 1, 2, 3, 4], dtype=int64), array([ 80,  88, 104,  59,  70], dtype=int64))
(array([0, 1, 2, 3, 4], dtype=int64), array([35, 62, 49, 39, 38], dtype=int64))


In [23]:
train_data.to_csv('train.csv',index=False)
print("train_data done")
validation.to_csv('validation.csv',index=False)
print("validation data done")
test.to_csv('test.csv',index=False)
print('test data done')

train_data done
validation data done
test data done
