<a href="https://colab.research.google.com/github/mmsamiei/just-practice-deep/blob/master/digi_contest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import numpy as np

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
class SentimentModel(nn.Module):
  """Transformer-encoder sequence classifier with two output classes.

  Token embeddings are multiplied by sqrt(hid_size), summed with a sinusoidal
  position table, passed through a stack of ``nn.TransformerEncoderLayer``
  modules (followed by a final LayerNorm), and the encoder output at sequence
  position 0 is projected to 2 logits.

  Args:
    hid_size: embedding / model width (``d_model``).
    vocab_size: number of rows of the token embedding table.
    n_head: attention heads per encoder layer.
    n_layers: number of encoder layers in the stack.
    pf_size: width of the feed-forward sub-layer (``dim_feedforward``).
    max_len: number of rows of the position table, i.e. the longest
      sequence that can be embedded.
    device: device on which the position table and the scale tensor are
      created.
  """

  def __init__(self, hid_size, vocab_size, n_head, n_layers, pf_size, max_len, device):
    super().__init__()

    self.device = device

    self.hid_size = hid_size
    # Stored so that append_decoder_layer can build a layer matching this model.
    self.n_head = n_head
    self.pf_size = pf_size
    self.max_len = max_len

    self.embedding = nn.Embedding(vocab_size, hid_size)

    # The position table is an nn.Embedding whose weights start as sinusoids.
    self.position_enc = nn.Embedding(self.max_len, self.hid_size)
    self.position_enc.weight.data = self.position_encoding_init(self.max_len, self.hid_size)
    self.scale = torch.sqrt(torch.FloatTensor([self.hid_size])).to(device)

    self.layer_norm = nn.LayerNorm(self.hid_size)
    # NOTE(review): nn.TransformerEncoder appears to deep-copy this layer, so
    # the instance kept here would only act as a template - confirm. It is kept
    # as an attribute so parameter names and counts stay unchanged.
    self.encoder_layer = nn.TransformerEncoderLayer(d_model=hid_size, nhead=n_head, dim_feedforward=pf_size)
    self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=n_layers, norm=self.layer_norm)
    self.fc = nn.Linear(hid_size, 2)

    self._init_weights()

  def forward(self, x):
    """Return class logits for a batch of token-id sequences.

    Args:
      x: integer tensor indexed as (sent_len, batch_size).

    Returns:
      Tensor of shape (batch_size, 2), computed from the encoder output at
      sequence position 0.
    """
    sent_len = x.shape[0]

    tok_emb = self.embedding(x)

    # Build the position ids directly on the input's device; the resulting
    # (sent_len, hid_size) embedding is broadcast over the batch dimension.
    pos_ids = torch.arange(sent_len, device=x.device)
    pos_emb = self.position_enc(pos_ids).unsqueeze(1)

    hidden = tok_emb * self.scale + pos_emb
    hidden = self.encoder(hidden)
    return self.fc(hidden[0, :])

  def _init_weights(self):
    """Xavier-initialise every weight matrix except the position table.

    The position table is skipped on purpose: it has already been filled with
    the sinusoidal encoding, which a blanket re-initialisation would discard.
    """
    for p in self.parameters():
      if p is self.position_enc.weight:
        continue
      if p.dim() > 1:
        nn.init.xavier_uniform_(p)

  def append_decoder_layer(self):
    """Append one freshly initialised encoder layer to this model's stack.

    The new layer uses this model's own hid_size / n_head / pf_size and is
    placed on the device of the existing parameters.

    NOTE(review): an optimizer created before this call was built from the
    old parameter list, so it presumably will not update the new layer -
    confirm and re-create the optimizer if needed.
    """
    target_device = self.embedding.weight.device
    appended_mod = nn.TransformerEncoderLayer(
        d_model=self.hid_size, nhead=self.n_head, dim_feedforward=self.pf_size
    ).to(target_device)
    for p in appended_mod.parameters():
      if p.dim() > 1:
        nn.init.xavier_uniform_(p)
    self.encoder.layers.append(appended_mod)
    self.encoder.num_layers += 1

  def position_encoding_init(self, n_position, d_pos_vec):
    """Build the sinusoidal position-encoding table.

    Column ``j`` of row ``pos`` holds ``sin`` (even j) or ``cos`` (odd j) of
    ``pos / 10000 ** (2 * (j // 2) / d_pos_vec)``, so each sin/cos pair shares
    one frequency. Row 0 is left as the all-zero vector.

    Args:
      n_position: number of rows (positions) in the table.
      d_pos_vec: number of columns (encoding width).

    Returns:
      Float tensor of shape (n_position, d_pos_vec) on ``self.device``.
    """
    positions = np.arange(n_position, dtype=np.float64)[:, None]
    columns = np.arange(d_pos_vec)
    # Row 0 is 0 / freq == 0 everywhere and is not passed through sin/cos below.
    table = positions / np.power(10000, 2 * (columns // 2) / d_pos_vec)

    table[1:, 0::2] = np.sin(table[1:, 0::2])  # dim 2i
    table[1:, 1::2] = np.cos(table[1:, 1::2])  # dim 2i+1
    return torch.from_numpy(table).type(torch.FloatTensor).to(self.device)

In [0]:
# Small configuration used to smoke-test the architecture on random input.
hid_size, vocab_size = 16, 60
n_head, n_layers = 4, 2
pf_size = 64
max_len = 1200
model = SentimentModel(
    hid_size=hid_size,
    vocab_size=vocab_size,
    n_head=n_head,
    n_layers=n_layers,
    pf_size=pf_size,
    max_len=max_len,
    device=device,
).to(device)

In [9]:
# Feed one random batch of token ids (drawn from [1, vocab_size)) through the
# model and show the output shape.
test_len, batch_size = 20, 64
test_input = torch.LongTensor(test_len, batch_size).random_(1, vocab_size)
test_input = test_input.to(device)
model(test_input).shape

torch.Size([64, 2])

In [10]:
def count_parameters(model):
    """Return the number of scalar elements in ``model``'s parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 30,066 trainable parameters


**Data Loading**

In [0]:
import torchtext
from torchtext.data import Field
from torchtext.data import Pipeline
import spacy

def tokenize(x):
  """Split ``x`` on runs of whitespace and return the list of tokens."""
  return x.split()

def remove_repetitive_char(x):
  """Collapse every run of identical consecutive characters in ``x`` to one.

  For example ``"coool!!"`` becomes ``"col!"``. A leading space is preserved
  (the earlier sentinel-based version silently dropped it).

  Args:
    x: input string.

  Returns:
    The string with consecutive duplicate characters removed.
  """
  kept = []
  for char in x:
    # Keep a character only when it differs from the last one kept.
    if not kept or char != kept[-1]:
      kept.append(char)
  return "".join(kept)

def replace_bad_char(x):
  """Return ``x`` with every ``"a"`` replaced by ``"xx"``.

  The result is now returned; previously it was computed and discarded, so
  the function always returned None.
  """
  string = x
  string = string.replace("a", "xx")
  return string
  

# Pre-processing pipeline that collapses repeated characters.
pre_pipe = Pipeline(remove_repetitive_char)

# Comment text: whitespace-tokenized, lower-cased, then run through pre_pipe.
TEXT = Field(
    sequential=True,
    tokenize=tokenize,
    lower=True,
    preprocessing=pre_pipe,
)
# Label and row id are single values that bypass the vocabulary.
LABEL = Field(sequential=False, use_vocab=False)
ID = Field(sequential=False, use_vocab=False)

In [0]:
from torchtext.data import TabularDataset

# Column -> field mapping for the labelled training CSV. Columns mapped to
# None (id, title, rate) are not loaded.
tv_datafields = [
    ("id", None),
    ("title", None),
    ("comment", TEXT),
    ("rate", None),
    ("verification_status", LABEL),
]

# Column -> field mapping for the unlabelled test CSV. The id is loaded here
# (through ID); title and rate are not.
tst_datafields = [
    ("id", ID),
    ("title", None),
    ("comment", TEXT),
    ("rate", None),
]



In [0]:
def my_filter_pred(example, limited_word = 50):
  """Return True when ``example.comment`` has at most ``limited_word`` items."""
  return len(example.comment) <= limited_word

# Labelled training data; examples rejected by my_filter_pred are dropped.
# skip_header=True keeps the CSV header row from being processed as data.
train_dataset = TabularDataset(
    path='train_comments.csv',
    format='csv',
    fields=tv_datafields,
    skip_header=True,
    filter_pred=my_filter_pred,
)

# Unlabelled test data; no filter is applied here.
# skip_header=True keeps the CSV header row from being processed as data.
test_dataset = TabularDataset(
    path='test_nolabel_comments.csv',
    format='csv',
    fields=tst_datafields,
    skip_header=True,
)



In [17]:
# Number of training examples kept after the length filter.
len(train_dataset)

75583

In [0]:
# Collect every distinct character that occurs in the training comments
# (tokens of each example are re-joined with single spaces first).
s = set()
for example in train_dataset:
  s.update(" ".join(example.comment))

In [28]:
# Display the collected character set.
s

{' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|',
 '~',
 '¡',
 '«',
 '°',
 '»',
 '×',
 '͢',
 'ּ',
 'פ',
 '،',
 '؛',
 '\u061c',
 '؟',
 'ء',
 'آ',
 'أ',
 'ؤ',
 'إ',
 'ئ',
 'ا',
 'ب',
 'ة',
 'ت',
 'ث',
 'ج',
 'ح',
 'خ',
 'د',
 'ذ',
 'ر',
 'ز',
 'س',
 'ش',
 'ص',
 'ض',
 'ط',
 'ظ',
 'ع',
 'غ',
 'ـ',
 'ف',
 'ق',
 'ك',
 'ل',
 'م',
 'ن',
 'ه',
 'و',
 'ى',
 'ي',
 'ً',
 'ٌ',
 'ٍ',
 'َ',
 'ُ',
 'ِ',
 'ّ',
 'ْ',
 'ٔ',
 'ٕ',
 '٠',
 '١',
 '٢',
 '٣',
 '٤',
 '٥',
 '٦',
 '٪',
 '٫',
 '٬',
 'ٰ',
 'ٱ',
 'پ',
 'چ',
 'ڋ',
 'ڑ',
 'ڔ',
 'ڗ',
 'ژ',
 'ک',
 'ڪ',
 'گ',
 'ھ',
 'ۀ',
 'ہ',
 'ۂ',
 'ی',
 'ے',
 '۔',
 'ە',
 '۰',
 '۱',
 '۲',
 '۳',
 '۴',
 '۵',
 '۶',
 '۷',
 '۸',
 '۹',
 'ܢ'

In [0]:
# Show which attributes a loaded training example carries.
train_dataset[0].__dict__.keys()

dict_keys(['comment', 'verification_status'])

In [0]:
# Label of the first training example (still a string at this stage).
train_dataset[0].verification_status

'0'

In [0]:
# Build the token vocabulary from the training comments; min_freq=3 is the
# minimum token frequency passed to torchtext.
TEXT.build_vocab(train_dataset, min_freq=3)

KeyboardInterrupt: ignored

In [0]:
# Size of the vocabulary; used below as the model's vocab_size.
len(TEXT.vocab)

In [0]:
from torchtext.data import BucketIterator, interleave_keys, Iterator

batch_size = 512

# Training batches, ordered by comment length.
# NOTE(review): with sort=True torchtext presumably ignores shuffle=True, so
# every epoch would see the same length-sorted batch order - confirm, and
# consider sort=False with sort_within_batch=True if shuffling is intended.
train_iterator = BucketIterator(
    dataset=train_dataset,
    batch_size=batch_size,
    sort_key=lambda x: len(x.comment),
    sort=True,
    shuffle=True,
    repeat=False,
    device=device,
)

# Test examples are served one at a time without sorting.
test_iter = Iterator(
    test_dataset,
    batch_size=1,
    sort=False,
    sort_within_batch=False,
    repeat=False,
    device=device,
)


In [0]:
# Sanity check: print the tensor shape of the comments in every training batch.
for batch in train_iterator:
  print(batch.comment.shape)

**Training Section**

In [0]:
# Hyper-parameters of the model that is actually trained. max_len is reused
# from the earlier smoke-test cell.
vocab_size = len(TEXT.vocab)
hid_size = 64
pf_size = 128
n_head = 8
# Previously misspelled as `n_layer`, which left the constructor reading the
# stale global n_layers (2) from the smoke-test cell.
n_layers = 1
model = SentimentModel(hid_size, vocab_size, n_head, n_layers, pf_size, max_len, device).to(device)

In [0]:
# Report the trainable-parameter count of the model built for training.
print(f'The model has {count_parameters(model):,} trainable parameters')

In [0]:
class NoamOpt:
    """Optimizer wrapper that sets the learning rate before every step.

    The rate at step ``s`` is
    ``factor * model_size ** -0.5 * min(s ** -0.5, s * warmup ** -1.5)``.

    Args:
        model_size: model width used in the ``model_size ** -0.5`` term.
        factor: constant multiplier applied to the schedule.
        warmup: step count that enters the ``warmup ** -1.5`` term.
        optimizer: wrapped optimizer; it must expose ``param_groups``,
            ``step()`` and ``zero_grad()``.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        """Advance the step counter, write the new rate into every param group, then step."""
        self._step += 1
        rate = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        """Return the learning rate for ``step`` (default: the current step).

        For ``step <= 0`` the rate is 0.0; evaluating the formula there would
        raise ZeroDivisionError from ``0 ** -0.5``.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))

    def zero_grad(self):
        """Clear the gradients held by the wrapped optimizer."""
        self.optimizer.zero_grad()

In [0]:
from tqdm import tqdm

def train_one_epoch(model, train_iter, optimizer, criterion, clip):
  """Run one optimisation pass over ``train_iter`` and return the mean batch loss.

  For every batch: clear gradients, run the model on ``batch.comment``,
  compute ``criterion`` against the flattened ``batch.verification_status``,
  back-propagate, clip the gradient norm to ``clip`` and step the optimizer.

  Returns:
    Sum of the per-batch loss values divided by ``len(train_iter)``.
  """
  model.train()
  running_loss = 0
  for batch in tqdm(train_iter):
    optimizer.zero_grad()
    logits = model(batch.comment)
    targets = batch.verification_status.view(-1)
    loss = criterion(logits, targets)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    running_loss += loss.item()
  return running_loss / len(train_iter)


In [0]:
def train(model, train_iter, optimizer, criterion, clip, N_EPOCH):
  """Train for ``N_EPOCH`` epochs, printing the mean loss after each one."""
  epoch = 0
  while epoch < N_EPOCH:
    mean_loss = train_one_epoch(model, train_iter, optimizer, criterion, clip)
    print("epoch is {} loss is {}".format(epoch, mean_loss))
    epoch += 1

In [0]:
# Adam starts at lr=0; the NoamOpt wrapper overwrites the rate on every step.
optimizer = NoamOpt(
    model_size=hid_size,
    factor=1,
    warmup=2000,
    optimizer=torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9),
)
criterion = torch.nn.CrossEntropyLoss()
# Train for 100 epochs with the gradient norm clipped to 1.
train(model, train_iterator, optimizer, criterion, clip=1, N_EPOCH=100)

In [0]:
# Predict a label for every test row (the iterator yields one example per batch).
model.eval()
test_preds = []
with torch.no_grad():
  for batch in test_iter:
    row_id = batch.id.item()
    if batch.comment.shape[0] == 0:
      # Empty comment: nothing to feed the model, so predict class 0.
      res_out = 0
    else:
      res = model(batch.comment)
      res_out = torch.argmax(res).item()
    test_preds.append([row_id, res_out])

In [0]:
# Order the [row_id, prediction] pairs by row id, in place.
test_preds.sort(key=lambda x: x[0])

In [0]:
import pandas
# Write the predictions as a two-column CSV without the DataFrame index.
df = pandas.DataFrame(
    test_preds,
    columns=['id', 'verification_status'],
)
df.to_csv("./file.csv", index=False, sep=',')

In [0]:
# Append one more Xavier-initialised encoder layer to the model's encoder stack.
# NOTE(review): the optimizer above was built from the parameter list that
# existed before this call; presumably it must be re-created to train the
# added layer - confirm.
model.append_decoder_layer()