In [1]:
import numpy as np
import pandas as pd
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import Linear,Softmax,CrossEntropyLoss,Module,ReLU
from torch.optim import Adam
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *
from sklearn.metrics import accuracy_score, f1_score
from tqdm.notebook import tqdm

In [2]:
# Load the labelled NADI train and dev splits (tab-separated) as DataFrames.
train_df, dev_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        '../NADI-2020_release_1.0/NADI_release/train_labeled.tsv',
        '../NADI-2020_release_1.0/NADI_release/dev_labeled.tsv',
    )
)

In [3]:
# fastText Arabic word vectors, accessed through a Magnitude vector store.
fast = Magnitude("../downloads/fasttext-arabic/fasttext-arabic.magnitude")


def fasttext(x):
    """Embed each text as the mean of its token vectors.

    Args:
        x: iterable of strings; each is tokenised with nltk's word_tokenize.

    Returns:
        np.ndarray with one row per input text, each row being the
        element-wise average of the vectors returned by ``fast.query``.
    """
    vectors = []
    for text in x:
        tokens = word_tokenize(text)
        if tokens:
            vectors.append(np.average(fast.query(tokens), axis=0))
        else:
            # Nothing to average (empty / whitespace-only text): use a zero
            # vector instead of propagating NaN or failing on an empty query.
            # NOTE(review): assumes `fast.dim` is the vector width and that
            # queries come back as float32 (Magnitude's default) -- confirm.
            vectors.append(np.zeros(fast.dim, dtype=np.float32))
    return np.array(vectors)

In [4]:
# Country labels of the training split; they define the class vocabulary.
y_train_original = train_df["#3 country_label"]
# Label -> class index, numbered in order of first appearance. unique() is
# evaluated once here rather than twice per loop iteration.
label_map = {label: index for index, label in enumerate(y_train_original.unique())}
# Inverse lookup: class index -> label.
reverse_label_map = {value : key for (key, value) in label_map.items()}
def label_onehot(label, mapping=None):
    """Return the one-hot encoding of ``label``.

    Args:
        label: a key of the label -> index mapping.
        mapping: optional label -> index dict; defaults to the module-level
            ``label_map``.

    Returns:
        1-D float64 np.ndarray with one slot per class in the mapping and a
        single 1 at the label's index. (The length was previously fixed at
        21; it now follows the mapping size.)

    Raises:
        KeyError: if ``label`` is not in the mapping.
    """
    if mapping is None:
        mapping = label_map
    onehot = np.zeros(len(mapping))
    onehot[mapping[label]] = 1
    return onehot

In [5]:
class ArabicDataset(Dataset):
    """Tweets read from one or two TSV files, served as sentence embeddings.

    Each item is a dict with a 'text' tensor (the ``fasttext`` embedding of
    the '#2 tweet_content' column) and, when ``csv_file`` was given, a
    'label' tensor (one-hot of the '#3 country_label' column).
    """

    def __init__(self, csv_file=None, million_csv=None, transform=None, device=None):
        """Load the TSV file(s) into ``self.text_df``.

        Args:
            csv_file: path to a labelled TSV. When given, items carry labels.
            million_csv: path to a second TSV. Used alone it yields unlabelled
                items; together with ``csv_file`` its rows are appended.
            transform: accepted for signature compatibility; not used.
            device: where item tensors are placed. Defaults to CUDA when
                available, otherwise CPU.

        Raises:
            ValueError: if neither ``csv_file`` nor ``million_csv`` is given.
        """
        if not csv_file and not million_csv:
            raise ValueError("ArabicDataset needs csv_file and/or million_csv")
        # Truthiness of csv_file decides whether __getitem__ returns labels.
        self.csv_file = csv_file
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)
        frames = [pd.read_csv(path, sep='\t')
                  for path in (csv_file, million_csv) if path]
        self.text_df = frames[0] if len(frames) == 1 else pd.concat(frames)

    def __len__(self):
        """Number of rows across the loaded file(s)."""
        return len(self.text_df)

    def __getitem__(self, idx):
        """Return the sample dict for positional row ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Positional lookup: concatenated frames may repeat index labels.
        row = self.text_df.iloc[idx]
        # fasttext returns shape (1, dim) for a single text; flatten to (dim,).
        text = fasttext([row['#2 tweet_content']]).reshape(-1)
        sample = {'text': torch.from_numpy(text).to(self.device)}
        if self.csv_file:
            label = label_onehot(row['#3 country_label'])
            sample['label'] = torch.from_numpy(label).to(self.device)
        return sample

In [6]:
class TuningNet(Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    ``forward`` returns raw class scores (logits). CrossEntropyLoss applies
    log-softmax itself, so feeding it softmax outputs would normalise twice
    and flatten the gradients. Use ``predict_proba`` for probabilities; the
    argmax is the same either way.
    """

    def __init__(self, D_in, H,D_out):
        """
        Args:
            D_in: input feature size.
            H: hidden layer width.
            D_out: number of output classes.
        """
        super(TuningNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
        self.relu = ReLU()
        # Kept as a submodule for predict_proba; no longer applied in forward.
        self.softmax = Softmax(dim=1)

    def forward(self, x):
        """Return logits of shape (batch, D_out) for input (batch, D_in)."""
        hidden = self.relu(self.linear1(x))
        return self.linear2(hidden)

    def predict_proba(self, x):
        """Return class probabilities (softmax over dim 1 of the logits)."""
        return self.softmax(self.forward(x))

In [7]:
# Data locations: labelled train/dev splits and the pseudo-labelled output
# of the first semi-supervised iteration.
train_csv_path = '../NADI-2020_release_1.0/NADI_release/train_labeled.tsv'
dev_csv_path = '../NADI-2020_release_1.0/NADI_release/dev_labeled.tsv'
million_csv_path = '../semi_supervised_train/tsv_iters/iter1.tsv'

# Optimisation settings.
num_of_epochs, learning_rate, train_batch_size = 50, 0.01, 32

# 300 inputs -> 512 hidden units -> 21 outputs, moved to the GPU.
model = TuningNet(D_in=300, H=512, D_out=21).cuda()

In [8]:
# Training split. Pass million_csv_path as the second argument to append the
# pseudo-labelled tweets as well.
traindataset = ArabicDataset(csv_file=train_csv_path)
# Shuffled mini-batches for optimisation.
trainloader = DataLoader(dataset=traindataset, shuffle=True,
                         batch_size=train_batch_size)
# Same training data, one sample at a time and unshuffled, for scoring.
testloader = DataLoader(dataset=traindataset, batch_size=1)

# Dev split, also scored one sample at a time.
devdataset = ArabicDataset(csv_file=dev_csv_path)
devloader = DataLoader(dataset=devdataset, batch_size=1)

In [9]:
def _evaluate(net, loader):
    """Score ``net`` on every batch of ``loader``.

    Works for any batch size. Returns ``(accuracy, weighted_f1)`` computed
    from the argmax of the model output against the argmax of the one-hot
    'label' tensor.
    """
    y_true, y_pred = [], []
    # No gradients are needed for scoring; skip building the autograd graph.
    with torch.no_grad():
        for batch in loader:
            output = net(batch['text'])
            y_pred.extend(output.argmax(dim=1).cpu().tolist())
            y_true.extend(batch['label'].argmax(dim=1).cpu().tolist())
    # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
    # weighted F1 weights classes by their support in the first argument.
    return (accuracy_score(y_true, y_pred),
            f1_score(y_true, y_pred, average='weighted'))


criterian = CrossEntropyLoss().cuda()
optimizer = Adam(model.parameters(), lr=learning_rate)
# Per-epoch weighted F1, tracked separately for the train and dev splits.
train_f1 = []
dev_f1 = []
for epoch in range(num_of_epochs):
    print("epoch:",epoch)
    model.train()
    for batch in tqdm(trainloader):
        output = model(batch['text'])
        # The dataset yields one-hot labels; the loss wants class indices.
        loss = criterian(output,batch['label'].argmax(dim=1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    acc, f1 = _evaluate(model, testloader)
    train_f1.append(f1)
    print("training acc:",acc,end=' ')
    print("training f1_score:", f1)

    acc, f1 = _evaluate(model, devloader)
    dev_f1.append(f1)
    print("dev acc:",acc,end=' ')
    print("dev f1_score:", f1)

epoch: 0


HBox(children=(FloatProgress(value=0.0, max=657.0), HTML(value='')))




KeyboardInterrupt: 

In [10]:
# Persist the trained network. The module object itself is passed (not its
# state_dict), so the whole object is pickled and the TuningNet class must be
# available again when the file is loaded.
torch.save(model,'../models/fastext-step1.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [None]:
# Predict country labels for the unlabelled tweet corpus (pseudo-labelling).

In [7]:
# Restore the pickled network from disk. torch.load unpickles the file, so
# only load checkpoints from a trusted source, and make sure the TuningNet
# class is defined in this session first.
model = torch.load('../models/fastext-step1.pt')

In [8]:
# Unlabelled corpus: no csv_file is given, so each sample holds only 'text'.
milliondataset = ArabicDataset(csv_file=None,
                               million_csv='../tsv/cleaned10million.tsv')
# Served one tweet at a time, in file order.
millionloader = DataLoader(dataset=milliondataset, batch_size=1)

In [20]:
# Pseudo-label every tweet with the model's most likely country.
y_pred = []
model.eval()
# Inference only: skip autograd bookkeeping for the whole pass.
with torch.no_grad():
    for batch in tqdm(millionloader):
        output = model(batch['text'])
        # extend() over the batch keeps this correct for any batch size.
        y_pred.extend(reverse_label_map[index]
                      for index in output.argmax(dim=1).cpu().tolist())

HBox(children=(FloatProgress(value=0.0, max=9220830.0), HTML(value='')))




KeyboardInterrupt: 

In [None]:
# Re-read the unlabelled corpus, attach the predicted labels as the
# '#3 country_label' column, and write the result for the next iteration.
million_df = pd.read_csv(
    '../tsv/cleaned10million.tsv', sep='\t'
).assign(**{'#3 country_label': y_pred})
million_df.to_csv(
    '../semi_supervised_train/tsv_iters/iter1.tsv',
    sep='\t',
)