In [1]:
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev


curl: /opt/conda/lib/libcurl.so.4: no version information available (required by curl)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  6034  100  6034    0     0  45256      0 --:--:-- --:--:-- --:--:-- 45368
Updating... This may take around 2 minutes.
Updating TPU runtime to pytorch-nightly ...
Found existing installation: torch 1.7.1+cpu
Uninstalling torch-1.7.1+cpu:
  Successfully uninstalled torch-1.7.1+cpu
Found existing installation: torchvision 0.8.2+cpu
Uninstalling torchvision-0.8.2+cpu:
  Successfully uninstalled torchvision-0.8.2+cpu
Copying gs://tpu-pytorch/wheels/colab/torch-nightly-cp37-cp37m-linux_x86_64.whl...
- [1 files][116.0 MiB/116.0 MiB]                                                
Operation completed over 1 objects/116.0 MiB.                                    
Copying gs://tpu-pytorch/wheels/colab/torch_xla-nightly-cp37-cp37m-linux_x86_64.whl...
\

In [2]:
import torch.nn as nn
from transformers import AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp

# Handle to the XLA device that torch_xla assigns to this process.
device = xm.xla_device()

class Classifier(nn.Module):
    """DistilBERT sequence classifier followed by a 2 -> 1 linear head.

    Each input row is a packed vector: columns 0-511 carry the token ids and
    columns 512-1023 carry the attention mask (the layout built by
    ``TweetData.__getitem__``).
    """

    def __init__(self):
        super().__init__()

        self.bert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
        # The original note here read "Also include followers and following";
        # those features are not wired in, so the head only consumes the two
        # classification logits.
        self.layer2 = nn.Linear(2, 1, bias=True)

    def forward(self, x):
        """Score a batch of packed examples.

        Args:
            x: tensor with at least 1024 columns per row; columns 0-511 are
                token ids and columns 512-1023 the attention mask. Values are
                cast to ``long`` before being handed to the transformer.

        Returns:
            Tensor with one non-negative score per row (ReLU is applied to
            the linear head's output).
        """
        input_ids = x[:, :512].long()
        attention_mask = x[:, 512:1024].long()
        logits = self.bert(input_ids, attention_mask).to_tuple()[0]
        # Feed the logits straight into the head. Allocating a scratch tensor
        # on a module-level device (as before) fails whenever the model and
        # its input live on a different device than that global.
        return torch.relu(self.layer2(logits))
# Build the classifier, then restore its weights from the checkpoint file.
model = Classifier()

# Deserialize onto the CPU regardless of the device the checkpoint was written
# from; the model is moved to the XLA device in a later cell.
model.load_state_dict(torch.load("../input/pretrained/model.bin", map_location="cpu"))
# model = model.to(device)

  return torch.device(device)


Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

<All keys matched successfully>

# 

In [None]:
# import pandas as pd
# from transformers import AutoTokenizer
# import numpy as np
# # Let's precompute the tokenized versions and pickle it.
# df = pd.read_json("../input/tweets/merged_troll_data.json")
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# encoded = df['content'][:100000].apply(lambda x :tokenizer(x, max_length=512, padding="max_length", truncation=True)).to_numpy()

# np.save("encoded.npy", encoded)

# encoded = np.load("encoded.npy", allow_pickle=True)



In [3]:
import pandas as pd
import torch
from transformers import AutoTokenizer
from torch.utils.data import Dataset
import numpy as np

# Module-level tokenizer; TweetData.__getitem__ reads this name when encoding a tweet.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

class TweetData(Dataset):
    """Tweets read from a JSON file, tokenized lazily one item at a time.

    Every item is a ``(features, label)`` pair. ``features`` is a float tensor
    of length 1024: positions 0-511 hold the token ids and positions 512-1023
    the attention mask. ``label`` is a float tensor of shape ``(1,)`` taken
    from the ``troll`` column.
    """

    def __init__(self, file_path):
        """Read the ``troll`` labels and ``content`` text from ``file_path``."""
        df = pd.read_json(file_path)

        self.labels = torch.tensor(df['troll'].values, dtype=torch.bool)
        # Drop whatever index the JSON file carried: items are requested by
        # position (0..len-1), and a label-based lookup on a non-default index
        # would raise KeyError or return the wrong row.
        self.content = df['content'].reset_index(drop=True)
#         self.followers = torch.tensor(df['followers'].to_numpy()).float()
#         self.following = torch.tensor(df['following'].to_numpy()).float()
#         self.followers = self.followers / (self.followers.max() - self.followers.min())
#         self.following = self.following / (self.following.max() - self.following.min())

    def __len__(self):
        """Number of examples (one per label)."""
        return self.labels.shape[0]

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th tweet and pack ids and mask into one vector.

        Uses the module-level ``tokenizer`` with padding/truncation to 512
        tokens, so the packed vector always has 1024 entries.
        """
        encoded = tokenizer(self.content.iloc[idx], max_length=512, padding="max_length", truncation=True)

        res = torch.zeros((1024))
        res[:512] = torch.tensor(encoded['input_ids'])
        res[512:1024] = torch.tensor(encoded['attention_mask'])
#         res[1024] = self.followers[idx]
#         res[1025] = self.following[idx]
        return res, self.labels[idx].reshape((1)).float()
        

# Build the dataset from the merged tweet file; tokenization happens per item on access.
data = TweetData("../input/tweets/merged_troll_data.json")

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [4]:

from torch.utils.data import DataLoader

# 80/20 train/validation split. The validation size is taken as the remainder
# so the two sizes always add up to len(data); random_split raises otherwise
# (int(0.8*n) + int(0.2*n) + 1 equals n only for some values of n).
train_size = int(0.8 * len(data))
test_size = len(data) - train_size

# Seed the split so the same examples land in each subset on every run.
train_set, val_set = torch.utils.data.random_split(
    data,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

# Each replica sees its own shard of the data; with a world size of 1 the
# samplers cover the whole subset.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_set,
    num_replicas=xm.xrt_world_size(),
    rank=xm.get_ordinal(),
    shuffle=True,
)

valid_sampler = torch.utils.data.distributed.DistributedSampler(
    val_set,
    num_replicas=xm.xrt_world_size(),
    rank=xm.get_ordinal(),
    shuffle=False,
)

train_dataloader = DataLoader(
    train_set,
    num_workers=4,
    sampler=train_sampler,
    batch_size=64,
)

val_dataloader = DataLoader(
    val_set,
    num_workers=4,
    batch_size=64,
    sampler=valid_sampler,
    drop_last=False,
)

# Move the model onto the XLA device before any training or evaluation.
device = xm.xla_device()
model = model.to(device)

In [5]:
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [None]:


# for TPU
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()
# torch.set_default_tensor_type('torch.FloatTensor')
import numpy as np
import torch_xla.utils.serialization as xser
# Training configuration: device placement, hyper-parameters, loss and optimizer.
# train_func_loop / val_func_loop read `loss_func` from module scope.
device = xm.xla_device()
model = model.to(device)

# NOTE(review): max_epochs and batch_size are not read by any code visible in
# this notebook; the DataLoaders are built with batch_size=64.
max_epochs = 3
learning_rate = 0.4 * 1e-5
batch_size = 1000
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)


# NOTE(review): these two module-level names are not read by any visible code;
# the `loss` inside the loop functions is a separate local variable.
epochs, loss = 0, float("inf")


def train_func_loop(loader, mod, opt, dev):
    """Run one training pass over ``loader``.

    Args:
        loader: iterable yielding ``(batch, target)`` tensor pairs.
        mod: model to train; invoked as ``mod(batch)``.
        opt: optimizer that owns ``mod``'s parameters.
        dev: device the batch and target are moved to before the forward pass.

    The loss comes from the module-level ``loss_func``. The mean loss of each
    window of 20 batches is printed, and the weights are checkpointed to
    ``model.bin`` every 200 batches. Returns ``None``.
    """
    print("Starting batch!")

    # Hugging Face from_pretrained() hands back models in eval mode, so switch
    # to training mode explicitly.
    mod.train()

    losses = []
    for batch_idx, (batch, target) in enumerate(loader, 1):
        batch = batch.to(dev, dtype=torch.float)
        target = target.to(dev, dtype=torch.float)

        # Work on the optimizer/model passed in rather than on module globals.
        opt.zero_grad()
        out = mod(batch)
        loss = loss_func(out, target)
        loss.backward()
        # NOTE(review): when spawning more than one process, gradients must be
        # reduced across replicas (xm.optimizer_step(opt)) - confirm before
        # raising nprocs.
        opt.step()

        losses.append(loss.item())

        if batch_idx % 20 == 0:
            print(np.array(losses).mean())
            losses = []
        # batch_idx is 1-based, so test it directly (the previous
        # `(batch_idx + 1) % 200` fired on batch 199, 399, ...). The separate
        # 2000-batch save wrote the same file and was redundant.
        if batch_idx % 200 == 0:
            xm.save(mod.state_dict(), "model.bin", master_only=True)

# def val_func_loop(loader, mod, opt, dev):
#     print("Starting batch!")
    
#     losses = []
#     for batch_idx, (batch, target) in enumerate(loader,1):
#         batch = batch.to(device, dtype=torch.float)
#         target = target.to(device, dtype=torch.float)
        
#         out = mod(batch)
#         loss = loss_func(out, target.float())
#         losses.append(loss.item())
        
#         if batch_idx % 10 == 0:
#             print(np.array(losses).mean())
#             losses = []
#         if (batch_idx + 1) % 1000 == 0:
#             return

def epochLoop():
    """Train for one pass over ``train_dataloader`` on the module-level ``device``.

    Wraps the training DataLoader in a ``ParallelLoader`` and hands the
    per-device iterator to ``train_func_loop`` together with the module-level
    ``model`` and ``optimizer``. Returns ``None``.
    """
    print("Starting epoch!")
    para_loader = pl.ParallelLoader(train_dataloader, [device])
    train_func_loop(para_loader.per_device_loader(device), model, optimizer, device)

    # Validation is not run from here (the call was already commented out), so
    # no ParallelLoader is built over val_dataloader: the object was never
    # consumed and only tied up loader resources.
    #val_func_loop(para_loader.per_device_loader(device), model, device)
    
def _mp_fn(rank, flags):
    torch.set_default_tensor_type('torch.FloatTensor')
    a = epochLoop()
    
FLAGS = {}
# Launch training through torch_xla multiprocessing. The original note describes
# spreading the work across the Kaggle TPU cores; with nprocs=1 a single forked
# process is started.
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=1, start_method='fork')

print("Done!")
    
    

Starting epoch!
Starting batch!
0.10837347935885192
0.12289521135389805
0.12252630367875099
0.12007163278758526
0.11837301217019558
0.12279433496296406
0.12167486548423767
0.11727854683995247
0.11650890223681927
0.12132159098982812
0.11012225262820721
0.10946258436888456
0.10925718508660794
0.1183034785091877
0.12720774300396442
0.10973196178674698
0.11256753914058208
0.10660698376595974
0.11386845111846924
0.10076332353055477
0.10473614372313023
0.11202005296945572
0.1160534955561161
0.10805297568440438
0.11870592683553696
0.12076152116060257
0.11637314409017563
0.11953723356127739
0.10834940373897553
0.11233472265303135
0.11999274268746377
0.1227375477552414
0.11783444173634053
0.12358365952968597
0.12693694271147252
0.11989331431686878
0.10818891823291779
0.12613968104124068
0.11897119134664536
0.12291390486061574
0.12344372682273388
0.13238167352974414
0.12433116100728511
0.12450432367622852


In [18]:
import torch_xla.utils.serialization as xser
# Write the model's current state dict to model.bin (master process only).
torch_xla.core.xla_model.save(model.state_dict(), f"model.bin", master_only=True)

In [None]:


def val_func_loop(loader, mod, opt, dev):
    """Evaluate ``mod`` over ``loader`` and print running mean losses.

    Args:
        loader: iterable yielding ``(batch, target)`` tensor pairs.
        mod: model to evaluate; invoked as ``mod(batch)``. It is switched to
            eval mode and left in that mode.
        opt: unused; kept so the signature mirrors ``train_func_loop``.
        dev: device the batch and target are moved to before the forward pass.

    The loss comes from the module-level ``loss_func``. The mean loss of each
    window of 10 batches is printed. No parameters are updated and no
    gradients are recorded. Returns ``None``.
    """
    print("Starting batch!")

    # Evaluation must not use dropout or build autograd graphs.
    mod.eval()

    losses = []
    with torch.no_grad():
        for batch_idx, (batch, target) in enumerate(loader, 1):
            # Use the device passed in rather than a module-level global.
            batch = batch.to(dev, dtype=torch.float)
            target = target.to(dev, dtype=torch.float)

            loss = loss_func(mod(batch), target)
            losses.append(loss.item())

            if batch_idx % 10 == 0:
                print(np.array(losses).mean())
                losses = []
            

# Wrap the validation DataLoader for the XLA device and run the evaluation pass.
# `optimizer` is passed only to satisfy val_func_loop's signature.
val_loader = pl.ParallelLoader(val_dataloader, [device])
val_func_loop(val_loader.per_device_loader(device), model, optimizer, device)
        
        