In [1]:
from __future__ import print_function
import pandas as pd
import argparse
import torch
import json
from torch import nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from __future__ import unicode_literals
from hazm import *

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
device

device(type='cuda')

In [4]:
df_train = pd.read_csv("./News/train.csv", sep='\t', error_bad_lines= False , encoding= 'utf-8')
df_test = pd.read_csv("./News/test.csv", sep='\t', error_bad_lines= False , encoding= 'utf-8')

In [11]:
from torch.nn.utils.rnn import pad_sequence
class DataPreprocessor:
    
    def __init__(self, train_data="./News/train.csv", test_data="./News/test.csv", mode="train"):
        self.read_data(train_data, test_data)
#         self.plot_distribution(mode=mode)
        self.clean_text(mode=mode)
        self.count_words()
        self.map_word_index()
        setattr(self,f"{mode}", self.tokenizer())
        
        
    def first_item(self):
        return self.train_data[0]
    
    def read_data(self ,train_data, test_data):
        self.train_data = pd.read_csv(train_data, sep="\t", error_bad_lines=False, encoding="utf-8")
        self.train_data = self.train_data.dropna(subset=['category'])
        self.train_data = self.train_data.loc[self.train_data['category']!='category']
        self.test_data = pd.read_csv(test_data, sep="\t", error_bad_lines=False, encoding="utf-8")
        self.test_data = self.test_data.dropna(subset=['category'])
        self.test_data = self.test_data.loc[self.test_data['category']!='category']
    
#     def plot_distribution(self, mode="train"):
#         if mode=="train":
#             self.train_data['category'].value_counts(sort=False).plot(kind='bar')
#         else:
#             self.test_data['category'].value_counts(sort=False).plot(kind='bar')
        
    
    def clean_text(self, mode="train"):
        ws = []
        source = self.train_data.text
        if mode=="test":
            source = self.test_data.text
        for a in source.astype(str):
            w = [tk for tk in word_tokenize(a) if tk.isalpha()] + ['<eos>']
            ws.append(w)
        self.cleaned_data = ws
        www = []
        for w in self.cleaned_data:
            ww = []
            for item in w:
                if "." in item:
                    if item.split(".")[0]!='':
                        ww.extend([item.split(".")[0]])
                elif not item.encode().isalpha():
                    ww.append(item)
            www.append(ww)
            
        self.tokenized_data = www
        print(www[:1])
            
    
    def count_words(self):
        www = self.tokenized_data
        di = dict()
        for ww in www:
            for i in ww:
                if i not in di:
                    di[i] = 1
                else:
                    di[i]+=1
        new_di = dict()
        for key in di:
            if di[key] < 5000:
                if "unk" not in new_di:
                    new_di["unk"] = 1
                else:
                    new_di["unk"] += 1
        
        for key in di:
            if di[key] >= 2000:
                new_di[key] = di[key]
#         new_di["unk"] = di["unk"]
#         srt = sorted(di.items(), key=lambda item: item[1],reverse=True)
#         asrt = srt
#         self.tokens = asrt
#         with open("‫‪frequencies‬‬.txt","w") as file:
#             file.write(str(srt[:200]))
        dictionary = new_di
#         print(f"tokens={sum(di.values())}, and unique tokens are={len(srt)}")
        self.frequencies_dict = dictionary
        self.n_tokens = sum(new_di.values())
        self.n_unique_tokens = len(new_di)

    def map_word_index(self):
        vocabs = self.frequencies_dict.keys()
        self.word_to_index = {}
        self.index_to_word = {}
        for index , vocab in enumerate(vocabs):
            self.index_to_word[index+1] = vocab
            self.word_to_index[vocab] = index+1
        print("Saving dictionary...")
#         with open("‫‪indeces‬‬.json","w") as file:
#             json.dump(self.word_to_index, file, ensure_ascii=False)
        print("Saving dictionary finished")
        
    
    def tokenizer(self):
        www = []
        cnts = []
        padding_value = 0
        size_lists = []
        for w in self.tokenized_data:
            ww = []
            cc  = []
            for item in w:
                if item in self.frequencies_dict:
                    ww.append(self.word_to_index[item])
                    cc.append(self.frequencies_dict[item])
#                 else:
#                     ww.append(self.word_to_index["unk"])
            www.append(torch.tensor(ww).type(torch.int64))
            cnts.append(torch.tensor(cc).type(torch.float))
            size_lists.append(www[-1].size())
        self.max_length_sequence = torch.tensor(size_lists).max().item()
#         www = []
# #         size_lists = []
#         for w in self.tokenized_data:
#             ww = []
#             for item in w:
#                 if item in self.frequencies_dict:
#                     ww.append(self.word_to_index[item])
#             while len(ww)<self.max_length_sequence:
#                 ww.insert(len(ww), padding_value)
#             www.append(torch.tensor(ww).type(torch.int64))
#         ret = pad_sequence(www,batch_first=True, padding_value=padding_value)
        print(www[0])
        print(f"MAX length was={self.max_length_sequence}")
        self.tokenized_tokens = www
        self.cnts = cnts
        return www
    

In [12]:
data_train = DataPreprocessor()
data_test = DataPreprocessor(mode="test")

[['به', 'گزارش', 'خبرنگار', 'حوزه', 'میراث', 'و', 'فرهنگی', 'باشگاه', 'خبرنگاران', 'جوان', 'محمد', 'حسن', 'خان', 'اعتماد', 'السلطنه', 'به', 'نقل', 'از', 'برخی', 'منابع', 'اسفراین', 'چمن', 'کالپوش', 'چمن', 'کالپوش', 'بر', 'اساس', 'آنچه', 'در', 'اعتقادات', 'اهالی', 'منطقه', 'مشهود', 'است', 'آخرین', 'منزلگاه', 'داریوش', 'سوم', 'است', 'بنابراین', 'باید', 'دشت', 'اسفراین', 'را', 'جولانگاه', 'مقدونیان', 'دانست', 'این', 'دشت', 'از', 'دستبرد', 'و', 'سم', 'ستوران', 'یونانی', 'در', 'امان', 'بنا', 'بر', 'این', 'گزارش', 'شهر', 'اسفراین', 'امروزی', 'در', 'شمال', 'غربی', 'استان', 'خراسان', 'شمالی', 'قرار', 'این', 'شهر', 'دربرگیرنده', 'بیش', 'از', 'بقعه', 'از', 'بزرگان', 'و', 'امامزادگان', 'است', 'شایان', 'ذکر', 'است', 'شهر', 'فعلی', 'اسفراین', 'از', 'محله', 'تشکیل', 'شده', 'و', 'زیستگاه', 'حیواناتی', 'چون', 'آهو', 'گرگ', 'گورکن', 'خرگوش', 'روباه', 'گراز', 'پلنگ', 'کفتار', 'و', 'بز', 'کوهی', 'است', 'منطقه', 'ساری', 'گل', 'در', 'شمال', 'شرقی', 'اسفراین', 'قرار', 'گفتنی', 'است', 'ابوعبدلله', 'حمد', 'بن

In [7]:
pd.Categorical(data_train.train_data.category)

[فرهنگی هنری, فرهنگی هنری, فرهنگی هنری, فرهنگی هنری, فرهنگی هنری, ..., ورزشی, ورزشی, ورزشی, ورزشی, ورزشی]
Length: 117153
Categories (10, object): [اجتماعی, اقتصادی, بین‌الملل, سیاسی, ..., فضای مجازی, فیلم و صوت, وب‌گردی, ورزشی]

In [None]:
import torch.nn as nn
class CustomDataset(Dataset):
    
    def __init__(self, text, label):
        self.text = text
        self.label = label
        
    def __len__(self):
        assert len(self.text)==len(self.label)
        return len(self.text)
    
    def __getitem__(self, idx):
#         if len(self.text[idx])>1000:
#             probes = nn.Softmax(dim=0)(self.frequency[idx])
#             indices = self.text[idx].multinomial(num_samples=1000, replacement=False)
#             ret = self.text[indices]
#         else:
#             ret = self.text[idx]
        return self.text[idx], self.label[idx]

In [59]:
def prune(tokens, frequency):
    ret = []
    for token, freq in zip(tokens, frequency):
        if len(token)> 1000:
            probes = nn.Softmax(dim=0)(freq)
            indices = probes.multinomial(num_samples=1000, replacement=False)
            ret.append(token[indices])
        else:
            ret.append(token)
    return pad_sequence(ret,batch_first=True,padding_value=0)
        

In [60]:
train_pruned = prune(data_train.tokenized_tokens, data_train.cnts)

In [90]:
test_pruned = prune(data_test.tokenized_tokens, data_test.cnts)

In [96]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()


In [97]:
pd.Categorical(data_train.train_data.category).codes

array([5, 5, 5, ..., 9, 9, 9], dtype=int8)

In [72]:
train_pruned = CustomDataset(train_pruned, pd.Categorical(data_train.train_data.category).codes)

In [92]:
test_pruned_dataset = CustomDataset(test_pruned, pd.Categorical(data_test.test_data.category).codes)

In [94]:
len(test_pruned_dataset)

21091

In [74]:
from torch.nn.utils.rnn import pad_sequence
def collate_batch(batch):
#     label_list, text_list, offsets = [], [], [0]
# #     for (_text, _label) in batch:
#     for _item in batch:
#          _label, _text = _item["label"], _item["text"]
#          label_list.append(_label)
#          processed_text = _text
#          text_list.append(processed_text)
# #          print("&&&&&&&", processed_text, _text)
#          offsets.append(processed_text.size(0))
#     label_list = torch.tensor(label_list, dtype=torch.int64)
#     offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
#     text_list = torch.cat(text_list)
    (xx, yy) = zip(*batch)
    pad_x = pad_sequence(xx, batch_first=True, padding_value=0)
#     pad_y = pad_sequence(yy, batch_first=True, padding_value=-1)
    return pad_x, yy


In [60]:
dataloader = DataLoader(train_dataset, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [113]:
import math
from torch import nn

class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class, seq_len):
        super(TextClassificationModel, self).__init__()
        self.seq_len = seq_len
#         self.seq_len = embed_dim
        self.kernel_1 = 2
        self.kernel_2 = 3
        self.kernel_3 = 4
        self.kernel_4 = 5
        self.out_size = 32
        self.stride = 2
        self.embedding_size = embed_dim
        self.embedding = nn.Embedding(vocab_size + 1, embed_dim, padding_idx=0)
#         self.fc = nn.Linear(embed_dim, num_class)
        self.conv_1 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_1, self.stride)
        self.conv_2 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_2, self.stride)
        self.conv_3 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_3, self.stride)
        self.conv_4 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_4, self.stride)
        self.pool_1 = nn.MaxPool1d(self.kernel_1, self.stride)
        self.pool_2 = nn.MaxPool1d(self.kernel_2, self.stride)
        self.pool_3 = nn.MaxPool1d(self.kernel_3, self.stride)
        self.pool_4 = nn.MaxPool1d(self.kernel_4, self.stride)
        self.fc = nn.Linear(self.in_features_fc(), num_class)
        

#     def init_weights(self):
#         initrange = 0.5
#         self.embedding.weight.data.uniform_(-initrange, initrange)
#         self.fc.weight.data.uniform_(-initrange, initrange)
#         self.fc.bias.data.zero_()

        
    def in_features_fc(self):
        '''Calculates the number of output features after Convolution + Max pooling
        
        Convolved_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1
        Pooled_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1
        
        source: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
        '''
        # Calcualte size of convolved/pooled features for convolution_1/max_pooling_1 features
        out_conv_1 = ((self.embedding_size - 1 * (self.kernel_1 - 1) - 1) / self.stride) + 1
        out_conv_1 = math.floor(out_conv_1)
        out_pool_1 = ((out_conv_1 - 1 * (self.kernel_1 - 1) - 1) / self.stride) + 1
        out_pool_1 = math.floor(out_pool_1)
        
        # Calcualte size of convolved/pooled features for convolution_2/max_pooling_2 features
        out_conv_2 = ((self.embedding_size - 1 * (self.kernel_2 - 1) - 1) / self.stride) + 1
        out_conv_2 = math.floor(out_conv_2)
        out_pool_2 = ((out_conv_2 - 1 * (self.kernel_2 - 1) - 1) / self.stride) + 1
        out_pool_2 = math.floor(out_pool_2)
        
        # Calcualte size of convolved/pooled features for convolution_3/max_pooling_3 features
        out_conv_3 = ((self.embedding_size - 1 * (self.kernel_3 - 1) - 1) / self.stride) + 1
        out_conv_3 = math.floor(out_conv_3)
        out_pool_3 = ((out_conv_3 - 1 * (self.kernel_3 - 1) - 1) / self.stride) + 1
        out_pool_3 = math.floor(out_pool_3)
        
        # Calcualte size of convolved/pooled features for convolution_4/max_pooling_4 features
        out_conv_4 = ((self.embedding_size - 1 * (self.kernel_4 - 1) - 1) / self.stride) + 1
        out_conv_4 = math.floor(out_conv_4)
        out_pool_4 = ((out_conv_4 - 1 * (self.kernel_4 - 1) - 1) / self.stride) + 1
        out_pool_4 = math.floor(out_pool_4)
        
        # Returns "flattened" vector (input for fully connected layer)
        return (out_pool_1 + out_pool_2 + out_pool_3 + out_pool_4) * self.out_size
        
    def forward(self, text):
#         embedded = self.embedding(text, offsets)
#         return self.fc(embedded)
#         print(text, "^^^^^^^^^^^^^^^^^^", text.size())
        # Sequence of tokes is filterd through an embedding layer
        x = self.embedding(text)
        
#         import pdb;pdb.set_trace()
#         x = torch.reshape(x,(self.out_size, self.seq_len,2))
#         print(x.size(),"^^^^^^^^^^^^^^^^^^^")
        # Convolution layer 1 is applied
        x1 = self.conv_1(x)
        x1 = torch.relu(x1)
        x1 = self.pool_1(x1)
        
        # Convolution layer 2 is applied
        x2 = self.conv_2(x)
        x2 = torch.relu((x2))
        x2 = self.pool_2(x2)
        
        # Convolution layer 3 is applied
        x3 = self.conv_3(x)
        x3 = torch.relu(x3)
        x3 = self.pool_3(x3)
        
        # Convolution layer 4 is applied
        x4 = self.conv_4(x)
        x4 = torch.relu(x4)
        x4 = self.pool_4(x4)
        
        # The output of each convolutional layer is concatenated into a unique vector
        union = torch.cat((x1, x2, x3, x4), 2)
        union = union.reshape(union.size(0), -1)

        # The "flattened" vector is passed through a fully connected layer
        out = self.fc(union)
        # Dropout is applied		
#         out = self.dropout(out)
        # Activation function is applied
#         out = torch.softmax(out)
        
        return out

In [114]:
vocab_size = len(data_train.word_to_index)
emsize = 64
BATCH_SIZE = 64
seq_len = len(train_dataset)//BATCH_SIZE
model = TextClassificationModel(vocab_size, emsize, 10, 1000)

In [115]:
train_dataset[1][0].size()

torch.Size([1000])

In [120]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()
    for idx, (text, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label = model(text)
#         print("^^^^^&&&&&&&&", label, predited_label)
        y_pred_softmax = torch.log_softmax(predited_label, dim = 1)
#         _, y_pred_tags = torch.max(y_pred_softmax, dim = 1)
#         import pdb;pdb.set_trace()
        loss = criterion(y_pred_softmax, label.long())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            predited_label = model(text)
            y_pred_softmax = torch.log_softmax(predited_label, dim = 1)
            loss = criterion(y_pred_softmax, label.long())
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [121]:
from torch.utils.data.dataset import random_split
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate
BATCH_SIZE = 64 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

train_dataset = train_pruned
test_dataset = test_pruned_dataset
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True)


for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   500/ 1739 batches | accuracy    0.782
| epoch   1 |  1000/ 1739 batches | accuracy    0.772
| epoch   1 |  1500/ 1739 batches | accuracy    0.770
-----------------------------------------------------------
| end of epoch   1 | time: 206.51s | valid accuracy    0.762 
-----------------------------------------------------------
| epoch   2 |   500/ 1739 batches | accuracy    0.805
| epoch   2 |  1000/ 1739 batches | accuracy    0.797
| epoch   2 |  1500/ 1739 batches | accuracy    0.792
-----------------------------------------------------------
| end of epoch   2 | time: 202.64s | valid accuracy    0.767 
-----------------------------------------------------------
| epoch   3 |   500/ 1739 batches | accuracy    0.819
| epoch   3 |  1000/ 1739 batches | accuracy    0.813
| epoch   3 |  1500/ 1739 batches | accuracy    0.808
-----------------------------------------------------------
| end of epoch   3 | time: 200.83s | valid accuracy    0.765 
----------------------------

In [122]:
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.124


In [90]:
list(test_dataloader)

[(tensor([3, 0, 1, 4, 2, 9, 0, 2, 4, 9, 2, 4, 2, 1, 1, 0, 1, 2, 2, 2, 0, 1, 1, 1,
          2, 4, 1, 0, 4, 3, 1, 1, 9, 2, 2, 9, 9, 3, 9, 9, 3, 2, 2, 4, 3, 5, 9, 2,
          5, 5, 3, 2, 4, 3, 9, 5, 9, 8, 9, 2, 4, 3, 9, 6]),
  tensor([   2,   35,   85,  ...,   35, 1585,   15]),
  tensor([    0,   179,   306,   307,   336,   516,  1463,  1701,  1918,  2161,
           2275,  2713,  2958,  3092,  3093,  3094,  3401,  3568,  4088,  4150,
           4151,  4311,  4312,  4608,  4748,  4977,  5321,  5322,  5536,  5638,
           5677,  5697,  5698,  5699,  5835,  6042,  6067,  6267,  6376,  6565,
           6566,  6567,  6698,  6779,  6958,  7154,  7297,  7345,  7478,  8070,
           8176,  8224,  8298,  8606,  8732,  9103,  9325,  9326,  9358,  9594,
          10412, 10605, 10918, 11033])),
 (tensor([2, 4, 3, 1, 5, 5, 3, 4, 0, 5, 3, 6, 0, 5, 4, 5, 0, 9, 3, 5, 1, 9, 5, 4,
          9, 2, 6, 4, 6, 1, 6, 3, 3, 4, 3, 2, 1, 5, 2, 4, 2, 9, 0, 3, 4, 8, 3, 6,
          3, 2, 4, 9, 8, 9, 9, 2, 0, 