In [1]:
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
/kaggle/input/test-csv/test.csv
/kaggle/input/train-csv/train.csv


In [2]:
import pandas as pd
import random


import torchtext
# from torchtext.legacy.data import TabularDataset 최신은 이것으로 해야 한다는데 아래 것도 되는 것 같음 
from torchtext.data import TabularDataset
from torchtext import data
import torch.nn as nn
import torch
import torch.optim as optim
import torch.nn.functional as F

import time
from sklearn.metrics import roc_auc_score,accuracy_score
import spacy


In [3]:
#gpu를 사용한다는 부분에 대한 내용이다. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
nlp = spacy.load("en")
def tokenizer(text):
    filtered = ''.join([c if c not in filters else '' for c in text])
    return [token.text for token in nlp.tokenizer(filtered) if not token.is_space]

In [40]:
# hyperparams
TEXT_LENGTH = 128
EMBEDDING_SIZE = 50
# EMBEDDING_SIZE = 200 
BATCH_SIZE = 32
VOCAB_SIZE=25000

In [6]:
#데이터를 다듬는 부분인데 어떤 식으로 다듬는가? 
TEXT = data.Field(lower=True, batch_first=True,fix_length=TEXT_LENGTH, preprocessing=None, tokenize=tokenizer)
LABEL = data.Field(sequential=False,is_target=True, use_vocab=False, pad_token=None, unk_token=None)

# print(TEXT[0])
# print(LABEL)

datafields = [('id', None),
              ('comment_text', TEXT), 
              ("toxic", LABEL), 
              ("severe_toxic", LABEL),
              ('obscene', LABEL), 
              ('threat', LABEL),
              ('insult', LABEL),
              ('identity_hate', LABEL)]


alldata = TabularDataset(
    path='/kaggle/input/train-csv/train.csv',
    format='csv',
    skip_header=True,
    fields=datafields,)

In [7]:
print("Tokenize function:", TEXT.tokenize)
print("Preprocessing function:", TEXT.preprocessing)
print("Fix length:", TEXT.fix_length)
print("Batch first:", TEXT.batch_first)


Tokenize function: <function tokenizer at 0x7ad3598679d8>
Preprocessing function: None
Fix length: 128
Batch first: True


In [8]:
random.seed(17)
train,dev = alldata.split(split_ratio=0.8, random_state=random.getstate()) #초기값은 0.9 였음 

In [41]:
#단어집을 구성하는 함수이다. 최대 사이즈는 20000이고 최소 5번 이상 나온 것으로 생성을 하는 것이다. 

TEXT.build_vocab(train, max_size=25000, min_freq=5)

In [14]:
print(len(TEXT.vocab))

20002


In [42]:
class NNet(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx, embeddings, text_length, lstm_hidden_size):
        super().__init__()
        
#         self.embeddings = nn.Embedding.from_pretrained(embeddings, freeze=False, padding_idx=pad_idx)
        self.embeddings = nn.Embedding(vocab_size,embedding_dim,pad_idx)
        self.lstm = nn.GRU(input_size=embedding_dim, hidden_size=lstm_hidden_size, num_layers=1,bidirectional = True, batch_first=True)
        self.max_pool = nn.MaxPool2d((text_length,1))
        self.fc1 = nn.Linear(lstm_hidden_size*2, 100)
        self.fc2 = nn.Linear(100, output_dim)

    def forward(self, text):
        a1 = self.embeddings(text)
        a2 = self.lstm(a1)[0]
        a3 = self.max_pool(a2).squeeze(1)
        a4 = F.relu(self.fc1(a3))
        a5 = self.fc2(a4)
        return a5

In [93]:
random.seed(1234)
train_iterator, valid_iterator = data.BucketIterator.splits((train, dev),
                                                            batch_size=BATCH_SIZE,
                                                            device=device,
                                                            shuffle=True,
                                                            sort_key=lambda x: len(x.comment_text))

In [94]:
OUTPUT_DIM = 6
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model = NNet(len(TEXT.vocab), EMBEDDING_SIZE, OUTPUT_DIM, PAD_IDX, TEXT.vocab.vectors,TEXT_LENGTH, 150).to(device)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,462,606 trainable parameters


In [25]:
def fit_epoch(iterator, model, optimizer, criterion):
    train_loss = 0
    train_acc = 0
    model.train()
    all_y = []
    all_y_hat = []
    for batch in iterator:
        optimizer.zero_grad()
        y = torch.stack([batch.toxic,
                         batch.severe_toxic,
                         batch.obscene,
                         batch.threat,
                         batch.insult,
                         batch.identity_hate],dim=1).float().to(device)
        y_hat = model(batch.comment_text.to(device))
        loss = criterion(y_hat, y)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        all_y.append(y)
        all_y_hat.append(y_hat)
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)
    roc = roc_auc_score(y.cpu(),y_hat.sigmoid().detach().cpu())
    return train_loss / len(iterator.dataset), roc

def test_epoch(iterator, model, criterion):
    train_loss = 0
    train_acc = 0
    model.eval()
    all_y = []
    all_y_hat = []
    for batch in iterator:
        y = torch.stack([batch.toxic,
                         batch.severe_toxic,
                         batch.obscene,
                         batch.threat,
                         batch.insult,
                         batch.identity_hate],dim=1).float().to(device)
        with torch.no_grad():
            y_hat = model(batch.comment_text.to(device))
        loss = criterion(y_hat, y)
        train_loss += loss.item()
        all_y.append(y)
        all_y_hat.append(y_hat)
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)
    roc = roc_auc_score(y.cpu(),y_hat.sigmoid().detach().cpu())
    return train_loss / len(iterator.dataset), roc

In [26]:
def train_n_epochs(n, lr, wd):

    criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    for epoch in range(n):
        start_time = time.time()
        train_loss, train_roc = fit_epoch(train_iterator, model, optimizer, criterion)
        valid_loss, valid_roc = test_epoch(valid_iterator, model, criterion)

        secs = int(time.time() - start_time)
        mins = secs / 60
        secs = secs % 60

        print('Epoch: %d' % (epoch + 1), " | time in %d minutes, %d seconds" % (mins, secs))
        print(f'\tLoss: {train_loss:.4f}(train)\t|\troc: {train_roc :.6f} (train)')
        print(f'\tLoss: {valid_loss:.4f}(valid)\t|\troc: {valid_roc:.6f} (valid)') 

In [95]:
# train_n_epochs(3,0.01,0)
train_n_epochs(4,0.001,0)

Epoch: 1  | time in 1 minutes, 15 seconds
	Loss: 0.0024(train)	|	roc: 0.930348 (train)
	Loss: 0.0018(valid)	|	roc: 0.966113 (valid)
Epoch: 2  | time in 1 minutes, 15 seconds
	Loss: 0.0016(train)	|	roc: 0.972636 (train)
	Loss: 0.0016(valid)	|	roc: 0.975771 (valid)
Epoch: 3  | time in 1 minutes, 15 seconds
	Loss: 0.0014(train)	|	roc: 0.982250 (train)
	Loss: 0.0016(valid)	|	roc: 0.979846 (valid)
Epoch: 4  | time in 1 minutes, 15 seconds
	Loss: 0.0012(train)	|	roc: 0.987972 (train)
	Loss: 0.0016(valid)	|	roc: 0.980511 (valid)


In [96]:
torch.save(model.state_dict(), '/kaggle/working/model.pth')

In [None]:
#이 후에 출력을 한 sub file을 만들 수 있어야 한다. 

In [None]:
# ID = data.Field(sequential=False, use_vocab=False)

In [97]:
datafields = [('id', None),
              ('comment_text', TEXT)] #이렇게했을 때 id 도 가져오는 것인지 확인 필요 


testdata = TabularDataset(
    path='/kaggle/input/test-csv/test.csv',
    format='csv',
    skip_header=True,
    fields=datafields,)

In [99]:
test_iter1 = torchtext.data.Iterator(testdata, batch_size = 32, device = torch.device('cuda:0'),sort = False, sort_within_batch = False, repeat = False, shuffle = False)

In [100]:
testDF = pd.read_csv('/kaggle/input/test-csv/test.csv')

In [72]:
# for obj in test_iter1:
#         print(obj.commnet_text[0])
#         break; 

AttributeError: 'Batch' object has no attribute 'commnet_text'

In [101]:
myPreds = []
with torch.no_grad():
    model.eval()
    for obj in test_iter1:
        text = obj.comment_text  # batch.comment_text는 이미 텐서입니다.
        text = text.to(device)
        output = model(text)
        preds = torch.sigmoid(output).cpu().numpy()
        myPreds.extend(preds)
        
        
#         torch.cuda.empty_cache()
#         pred = model(obj.comment_text[0])
        
#         pred = torch.sigmoid(pred)
#         myPreds.append(pred.cpu().numpy())
#         del pred;del obj;
#         torch.cuda.empty_cache()
        

In [102]:
myPreds = np.vstack(myPreds)

In [103]:
for i, col in enumerate(["toxic","severe_toxic","obscene","threat","insult","identity_hate"]):
    testDF[col] = myPreds[:,i]

In [104]:
testDF.drop("comment_text", axis=1).to_csv("submission.csv", index=False)

In [88]:
testDF

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,9.975451e-01,1.137905e-02,9.996178e-01,1.031380e-04,9.683671e-01,1.275153e-02
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,5.764773e-12,4.273182e-29,3.846919e-12,9.057542e-23,7.417632e-16,3.200985e-17
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",4.189415e-09,5.564198e-18,3.734818e-08,6.249637e-15,2.215741e-09,7.373060e-12
3,00017563c3f7919a,":If you have a look back at the source, the in...",1.226943e-14,2.700051e-30,2.232814e-11,7.501499e-14,1.198059e-14,5.646655e-16
4,00017695ad8997eb,I don't anonymously edit articles at all.,4.764625e-10,1.234589e-24,1.477529e-12,1.413550e-14,2.633810e-11,6.350758e-18
...,...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu...",5.869639e-07,3.626821e-23,1.508391e-10,4.773038e-18,1.211544e-10,9.166955e-18
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...,4.991883e-04,8.319958e-13,4.642271e-05,4.024223e-08,9.785261e-09,1.035158e-05
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ...",1.035478e-09,1.518757e-23,2.080095e-09,9.568138e-14,1.002969e-10,3.074939e-12
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the...",7.593046e-08,2.829785e-20,4.408555e-08,4.756441e-11,1.824900e-10,7.763514e-07


------- asdf asdfas fd