In [1]:
import numpy as np
import pandas as pd
import os
import torch
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.nn import Linear,Softmax,CrossEntropyLoss,Module,ReLU,DataParallel,Sequential
from torch.optim import Adam, SGD
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from time import time
from sentence_transformers import SentenceTransformer
import re

import warnings
warnings.filterwarnings('ignore')

device = 'cuda:0'

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
train_df = pd.read_csv('../NADI-2020_release_1.0/NADI_release/train_labeled.tsv','\t')
dev_df = pd.read_csv('../NADI-2020_release_1.0/NADI_release/dev_labeled.tsv',sep='\t')

In [3]:
label_map = {}
y_train_original = train_df["#3 country_label"]
for u in range(len(y_train_original.unique())):
    label_map[y_train_original.unique()[u]] = int(u)
reverse_label_map = {value : key for (key, value) in label_map.items()}
def label_onehot(label):
    onehot = np.zeros((21))
    index = label_map[label]
    onehot[index] = 1
    return onehot

In [4]:
fast = Magnitude("../downloads/fasttext-arabic/fasttext-arabic.magnitude")
def fasttext(x):
    vectors = []
    for title in x:
        vectors.append(np.average(fast.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

In [67]:
class ArabicDataset(Dataset):
    def __init__(self, csv_file=None, million_csv=None, transform=None):
        fast = Magnitude("../downloads/fasttext-arabic/fasttext-arabic.magnitude")
        def transform(x):
            vectors = []
            for title in tqdm(x):
                vectors.append(np.average(fast.query(word_tokenize(str(title))), axis = 0))
            return np.array(vectors)
        bert = SentenceTransformer('distiluse-base-multilingual-cased',device=device)
        def berttext(x):
            return bert.encode(x,show_progress_bar=False,
                                          batch_size=1000)
        
        self.csv_file = csv_file
        if csv_file:         
            if million_csv:
                self.text_df = pd.concat([pd.read_csv(csv_file, sep='\t'),
                                         pd.read_csv(million_csv, sep='\t')])
            else:
                self.text_df = pd.read_csv(csv_file,sep='\t')
        else:
            self.text_df = pd.read_csv(million_csv,sep='\t')
#         self.berttext_data = np.array(berttext(self.text_df['#2 tweet_content']))
        self.fasttext_data = transform(self.text_df['#2 tweet_content'])
#         self.combined_data = np.hstack([self.fasttext_data, self.berttext_data])
    def __len__(self):
        return len(self.text_df)
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        text = self.fasttext_data[idx]
#         text = fasttext([text]).reshape((300))
        if self.csv_file:
            label = self.text_df.iloc[idx]['#3 country_label']
            label = label_onehot(label)
            sample = {'text': text, 
                      'label': torch.from_numpy(label).to(device)}
        else:
            sample = {'text':text}
        return sample

In [68]:
class TuningNet(Module):
    def __init__(self, D_in, H,D_out):
        super(TuningNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.relu = ReLU()
#         self.linear1_1 = torch.nn.Linear(H, H)
#         self.relu2 = ReLU()
        self.linear2 = torch.nn.Linear(H, D_out)
        self.softmax = Softmax(dim=1)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
#         x = self.linear1_1(x)
#         x = self.relu2(x)
        x = self.linear2(x)
        y_pred = self.softmax(x)
        return y_pred

In [75]:
train_csv_path = '../tsv/task1-lvl2-2000_train.tsv'
dev_csv_path = '../tsv/final/dev_for_pytorch.tsv'
million_csv_path = '../semi_supervised_train/tsv_iters/iter1.tsv'
num_of_epochs = 50
learning_rate = 0.001
model = TuningNet(300,512,21).to(device)
train_batch_size = 32

In [70]:
learning_rate=0.01

In [71]:
model1 = torch.load('../semi-supervised_train/models/task1-sgd-lvl2-2000.pt').to(device)
model2 = torch.load('../semi-supervised_train/models/task1-sgd.pt').to(device)

In [76]:
traindataset = ArabicDataset(train_csv_path) #,million_csv_path)

100%|██████████| 135933/135933 [11:53<00:00, 190.44it/s]


In [77]:
trainloader = DataLoader(traindataset, batch_size=32,
                        shuffle=True)
# testloader = DataLoader(traindataset, batch_size=1000)
devdataset = ArabicDataset(dev_csv_path)
devloader = DataLoader(devdataset, batch_size=1000)

100%|██████████| 4957/4957 [00:55<00:00, 88.83it/s] 


In [79]:
criterian = CrossEntropyLoss().to(device)
# optimizer = SGD(model.parameters(), lr=learning_rate,momentum=0.9,nesterov=True)
optimizer = Adam(model.parameters(), lr=learning_rate)
train_f1 = []
dev_f1 = []
y_pred = []
y_true = []
num_of_epochs=1
for epoch in range(num_of_epochs):
#     i_batch = 0
#     print("epoch:",epoch)
#     for batch in tqdm(trainloader):
#         i_batch +=1
#         output = model(batch['text'].to(device))
#         loss = criterian(output,batch['label'].argmax(dim=1))
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         y_pred += list(output.argmax(dim=1).detach().cpu().numpy())
#         y_true += list(batch['label'].argmax(dim=1).detach().cpu().numpy()) 
#     print("training acc:",accuracy_score(y_pred,y_true),end=' ')
#     f1 = f1_score(y_pred,y_true,average='macro')
#     train_f1.append(f1)
#     print("training f1_score:", f1)
    
    
    y_pred = []
    y_true = []
    for batch in devloader:
        output1 = model1(batch['text'].to(device))
        output2 = model2(batch['text'].to(device))
        print(output1.shape)
        # averaging
        output = (output1+output2)/2
        print(output.shape)
        y_pred += list(output.argmax(dim=1).detach().cpu().numpy())
        y_true += list(batch['label'].argmax(dim=1).detach().cpu().numpy())
    print("dev acc:",accuracy_score(y_pred,y_true),end=' ')
    f1 = f1_score(y_pred,y_true,average='macro')
    dev_f1.append(f1)
    print("training f1_score:", f1)

torch.Size([1000, 21])
torch.Size([1000, 21])
torch.Size([1000, 21])
torch.Size([1000, 21])
torch.Size([1000, 21])
torch.Size([1000, 21])
torch.Size([1000, 21])
torch.Size([1000, 21])
torch.Size([957, 21])
torch.Size([957, 21])
dev acc: 0.36614888037119225 training f1_score: 0.2223585279304462


In [33]:
# torch.save(model,'../semi-supervised_train/models/task1-sgd-lvl2-2000-improve.pt')

In [25]:
# model = torch.load('../semi-supervised_train/models/fastest.pt').to(device)