In [438]:
#80
from collections import defaultdict
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


train = pd.read_csv("../data/train.txt", sep="\t")
print(len(train))
valid = pd.read_csv("../data/valid.txt", sep='\t')
print(len(valid))
test = pd.read_csv("../data/test.txt", sep="\t")
print(len(test))

def add_words(df):
    df["words"] = df.TITLE.apply(lambda x: x.split(" "))
    return df

train = add_words(train)
valid = add_words(valid)
test = add_words(test)


def make_dict(df):
    frequency = defaultdict(int)
    for text in df.words:
        for token in text:
            frequency[token] += 1
    return frequency


def make_id_dict(dic):
    id_dict={}
    for i , (k,v) in enumerate(sorted(dic.items(), key=lambda x : -x[1])):
    #軽量化のためv>=5に。本来はv>=2
        if v>=2:
            id_dict[k]=i+1
        else:
            id_dict[k]=0
    return id_dict

word_dict=make_id_dict(make_dict(train))

10672
1334
1334


In [439]:
def equalize_class(df):
    size=min(df.CATEGORY.value_counts())
    result=[]
    for category in df.CATEGORY.unique():
        result.append(df[df.CATEGORY==category].iloc[:size])
    
    return pd.concat(result).sample(frac=1)

In [440]:
def id_to_vec(df, id_dic):
    dim=max(word_dict.values())+1
    result=torch.zeros([df.shape[0], df.shape[1]], dtype=torch.long) 
    for i, sentence in enumerate(df):
        for u, word in enumerate(sentence):
            try:
                result[i, u]=id_dic[word]
            except:
                continue
    return result

In [441]:
#81
train = equalize_class(train)
valid = equalize_class(valid)
X_train=id_to_vec(train, word_dict)
X_valid=id_to_vec(valid, word_dict)
#X_test=id_to_vec(uniform_sentence_len(test), word_dict)

from sklearn.preprocessing import LabelEncoder
#LabelEncoderのインスタンスを生成
le = LabelEncoder()
#ラベルを覚えさせる
le = le.fit(train.CATEGORY.values)
#ラベルを整数に変換

y_train = le.transform(train.CATEGORY.values)
y_valid = le.transform(valid.CATEGORY.values)
#y_test = le.transform(test.CATEGORY.values)

dtype = torch.float
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)

def get_acc(y_hat, y):
    _, label=torch.max(y_hat, 1)
    correct=(label==y).sum().item()
    return correct/label.size(0)

VOCAB_SIZE = max(word_dict.values())+1
EMBEDDING_DIM = 50


#X_test = to_tensor(X_test)
y_train = torch.tensor(y_train , dtype=torch.long)
y_valid = torch.tensor(y_valid, dtype=torch.long)
#y_test = to_tensor(y_test).long()

device: cpu


In [443]:
class RNN(nn.Module):
    def __init__(self, dim_w, dim_h, L, vocab_size):
        super(RNN, self).__init__()
        self.dim_h=dim_h
        self.emb = nn.Embedding(vocab_size, dim_w,
                                padding_idx=0)
        self.rnn = nn.LSTM(dim_w, dim_h, 1, batch_first=True)
        self.out = nn.Linear(dim_h, L)
        self.softmax =nn.Softmax(1)
    def forward(self, x, h0=None):
        x = self.emb(x)
        x, h = self.rnn(x, h0)
        x = x[:, -1, :]
        x = self.out(x)
        return self.softmax(x)

In [444]:
model = RNN(EMBEDDING_DIM,100,4, VOCAB_SIZE)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.SGD(params=model.parameters() , lr=0.1)
optimizer.zero_grad()
epochs=30
batch_size=32
for epoch in range(epochs):
    total_loss=0
    for idx in range(0, len(X_train), batch_size):
        optimizer.zero_grad()
        hat_y = model(X_train[idx : idx+batch_size if idx+batch_size<=len(X_train) else len(X_train)], None)
        loss=criterion(hat_y, y_train[idx : idx+batch_size if idx+batch_size<=len(X_train) else len(X_train)])
        total_loss+=loss.item()
        loss.backward(retain_graph=True)
        optimizer.step()
    #print(total_loss)

    with torch.no_grad():
        hat_train_y=model(X_train)
        train_loss=criterion(hat_train_y, y_train)
        train_acc=get_acc(hat_train_y,y_train)

        #検証用
        hat_valid_y = model(X_valid)
        valid_loss=criterion(hat_valid_y, y_valid)
        valid_acc=get_acc(hat_valid_y,y_valid)
        print("{}/{} epoch train_loss:{:.4f} | train_acc:{:.4f} | valid_loss:{:.4f} | valid_acc:{:.4f}".format(
            epoch+1, epochs, train_loss, train_acc, valid_loss, valid_acc))

        

1/30 epoch train_loss:1.3485 | train_acc:0.2500 | valid_loss:1.3485 | valid_acc:0.2500
2/30 epoch train_loss:1.3347 | train_acc:0.2500 | valid_loss:1.3347 | valid_acc:0.2500
3/30 epoch train_loss:1.3277 | train_acc:0.2500 | valid_loss:1.3277 | valid_acc:0.2500
4/30 epoch train_loss:1.3236 | train_acc:0.2500 | valid_loss:1.3235 | valid_acc:0.2500
5/30 epoch train_loss:1.3209 | train_acc:0.2500 | valid_loss:1.3209 | valid_acc:0.2525
6/30 epoch train_loss:1.3192 | train_acc:0.2500 | valid_loss:1.3191 | valid_acc:0.2525
7/30 epoch train_loss:1.3179 | train_acc:0.2510 | valid_loss:1.3179 | valid_acc:0.2500
8/30 epoch train_loss:1.3170 | train_acc:0.2510 | valid_loss:1.3170 | valid_acc:0.2500
9/30 epoch train_loss:1.3163 | train_acc:0.2503 | valid_loss:1.3163 | valid_acc:0.2500
10/30 epoch train_loss:1.3158 | train_acc:0.2507 | valid_loss:1.3158 | valid_acc:0.2500
11/30 epoch train_loss:1.3154 | train_acc:0.2507 | valid_loss:1.3154 | valid_acc:0.2500
12/30 epoch train_loss:1.3151 | train_acc

In [76]:
#84
import gensim
file_path = "../data/GoogleNews-vectors-negative300.bin.gz"
word2vec = gensim.models.KeyedVectors.load_word2vec_format(file_path, binary=True)

In [445]:
import torch.nn.utils.rnn as rnn
def get_pre_trained_vec(df):
    result=[]
    for sentence in df.words:
        sentence_list=[]
        for word in sentence:
            try:
                sentence_list.append(word2vec[word])
            except:
                continue
        if sentence_list!=[]:
            result.append(torch.tensor(sentence_list))
        else:
            result.append(torch.zeros(1,1,300))
    return rnn.pad_sequence(result)

In [446]:
train = equalize_class(train)
valid = equalize_class(valid)


X_train=get_pre_trained_vec(train).reshape(len(train), -1,300)
X_valid=get_pre_trained_vec(valid).reshape(len(valid), -1,300)

from sklearn.preprocessing import LabelEncoder
#LabelEncoderのインスタンスを生成
le = LabelEncoder()
#ラベルを覚えさせる
le = le.fit(train.CATEGORY.values)
#ラベルを整数に変換

y_train = le.transform(train.CATEGORY.values)
y_valid = le.transform(valid.CATEGORY.values)
y_train = torch.tensor(y_train , dtype=torch.long)
y_valid = torch.tensor(y_valid, dtype=torch.long)

In [447]:
class Bi_RNN(nn.Module):
    def __init__(self, dim_w, dim_h, L):
        super(Bi_RNN, self).__init__()
        self.rnn = nn.LSTM(dim_w, dim_h, 1, batch_first=True, bidirectional=True)
        self.out = nn.Linear(2*dim_h, L)
        self.softmax =nn.Softmax(1)
    def forward(self, x, h0=None):
        x, h = self.rnn(x, h0)
        x = x[:,-1,:]
        x = self.out(x)
        return self.softmax(x)

In [448]:
#84#85
model = Bi_RNN(300,50,4)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.SGD(params=model.parameters() , lr=0.03)
optimizer.zero_grad()
epochs=30
batch_size=32
for epoch in range(epochs):
    total_loss=0
    for idx in range(0, len(X_train), batch_size):
        optimizer.zero_grad()
        hat_y = model(X_train[idx : idx+batch_size if idx+batch_size<=len(X_train) else len(X_train)], None)
        loss=criterion(hat_y, y_train[idx : idx+batch_size if idx+batch_size<=len(X_train) else len(X_train)])
        total_loss+=loss.item()
        loss.backward(retain_graph=True)
        optimizer.step()
    with torch.no_grad():
        hat_y_train = model(X_train)
        hat_y_valid = model(X_valid)
        train_acc=get_acc(hat_y_train, y_train)
        valid_acc=get_acc(hat_y_valid, y_valid)
        train_loss = criterion(hat_y_train, y_train)
        valid_loss = criterion(hat_y_valid, y_valid)
        print("{}/{} epoch train_loss:{:.4f} | train_acc:{:.4f} | valid_loss:{:.4f} | valid_acc:{:.4f}".format(
            epoch+1, epochs, train_loss, train_acc, valid_loss, valid_acc))

1/30 epoch train_loss:1.3732 | train_acc:0.2493 | valid_loss:1.3731 | valid_acc:0.2500
2/30 epoch train_loss:1.3611 | train_acc:0.2486 | valid_loss:1.3610 | valid_acc:0.2500
3/30 epoch train_loss:1.3524 | train_acc:0.2490 | valid_loss:1.3523 | valid_acc:0.2500
4/30 epoch train_loss:1.3459 | train_acc:0.2497 | valid_loss:1.3459 | valid_acc:0.2525
5/30 epoch train_loss:1.3410 | train_acc:0.2493 | valid_loss:1.3410 | valid_acc:0.2551
6/30 epoch train_loss:1.3371 | train_acc:0.2500 | valid_loss:1.3371 | valid_acc:0.2576
7/30 epoch train_loss:1.3339 | train_acc:0.2503 | valid_loss:1.3340 | valid_acc:0.2551
8/30 epoch train_loss:1.3314 | train_acc:0.2514 | valid_loss:1.3315 | valid_acc:0.2500
9/30 epoch train_loss:1.3292 | train_acc:0.2552 | valid_loss:1.3294 | valid_acc:0.2424
10/30 epoch train_loss:1.3275 | train_acc:0.2576 | valid_loss:1.3277 | valid_acc:0.2399
11/30 epoch train_loss:1.3260 | train_acc:0.2562 | valid_loss:1.3263 | valid_acc:0.2399
12/30 epoch train_loss:1.3247 | train_acc

In [449]:
#87
class CNN(nn.Module):
    def __init__(self, dim_w, dim_h, L):
        super(CNN, self).__init__()
        self.dim_h=dim_h
        self.cnn1 = nn.Conv1d(dim_w, dim_h,3, padding=1)
        self.cnn2 = nn.Conv1d(dim_h, dim_h, 3, padding=1)
        self.max_pooling= nn.MaxPool1d(2)
        self.fc = nn.Linear(dim_h, L)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.cnn1(x)
        x = self.max_pooling(x)
        x = self.cnn2(x)
        x = torch.max(x, 2).values
        x = self.fc(x)
        x = self.softmax(x)
        return x

In [450]:
X_train=X_train.view(len(X_train), 300, -1)
X_valid=X_valid.view(len(X_valid), 300, -1)

In [451]:
model = CNN(300,100,4)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.SGD(params=model.parameters() , lr=0.1)
optimizer.zero_grad()
epochs=30
batch_size=32
for epoch in range(epochs):
    total_loss=0
    for idx in range(0, len(X_train), batch_size):
        optimizer.zero_grad()
        hat_y = model(X_train[idx : idx+batch_size if idx+batch_size<=len(X_train) else len(X_train)])
        loss=criterion(hat_y, y_train[idx : idx+batch_size if idx+batch_size<=len(X_train) else len(X_train)])
        total_loss+=loss.item()
        loss.backward(retain_graph=True)
        optimizer.step()
    with torch.no_grad():
        hat_y_train = model(X_train)
        hat_y_valid = model(X_valid)
        train_acc=get_acc(hat_y_train, y_train)
        valid_acc=get_acc(hat_y_valid, y_valid)
        train_loss = criterion(hat_y_train, y_train)
        valid_loss = criterion(hat_y_valid, y_valid)
        print("{}/{} epoch train_loss:{:.4f} | train_acc:{:.4f} | valid_loss:{:.4f} | valid_acc:{:.4f}".format(
            epoch+1, epochs, train_loss, train_acc, valid_loss, valid_acc))

1/30 epoch train_loss:1.3381 | train_acc:0.2684 | valid_loss:1.3378 | valid_acc:0.2677
2/30 epoch train_loss:1.3234 | train_acc:0.2983 | valid_loss:1.3239 | valid_acc:0.2551
3/30 epoch train_loss:1.3176 | train_acc:0.2639 | valid_loss:1.3188 | valid_acc:0.2551
4/30 epoch train_loss:1.3147 | train_acc:0.2524 | valid_loss:1.3167 | valid_acc:0.2500
5/30 epoch train_loss:1.3128 | train_acc:0.2500 | valid_loss:1.3157 | valid_acc:0.2500
6/30 epoch train_loss:1.3114 | train_acc:0.2503 | valid_loss:1.3153 | valid_acc:0.2500
7/30 epoch train_loss:1.3101 | train_acc:0.2514 | valid_loss:1.3151 | valid_acc:0.2500
8/30 epoch train_loss:1.3088 | train_acc:0.2538 | valid_loss:1.3150 | valid_acc:0.2500
9/30 epoch train_loss:1.3075 | train_acc:0.2587 | valid_loss:1.3151 | valid_acc:0.2500
10/30 epoch train_loss:1.3060 | train_acc:0.2705 | valid_loss:1.3153 | valid_acc:0.2475
11/30 epoch train_loss:1.3043 | train_acc:0.2951 | valid_loss:1.3155 | valid_acc:0.2500
12/30 epoch train_loss:1.3024 | train_acc

In [197]:
#89
from transformers import BertTokenizer, BertForSequenceClassification, BertForPreTraining,BertModel

In [454]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

I0504 10:45:12.611952 140735748305792 tokenization_utils.py:374] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/shan/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I0504 10:45:13.631198 140735748305792 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/shan/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
I0504 10:45:13.635942 140735748305792 configuration_utils.py:168] Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "

In [455]:
from torch.nn.utils.rnn import pad_sequence
# Encode text

train_input_ids =pad_sequence([torch.Tensor(tokenizer.encode(i)) for i in train.TITLE.values], batch_first=True).long()  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
valid_input_ids =pad_sequence([torch.Tensor(tokenizer.encode(i)) for i in valid.TITLE.values], batch_first=True).long() 

In [456]:
from tqdm import tqdm

In [457]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.SGD(params=model.parameters() , lr=0.001)
optimizer.zero_grad()
epochs=5
batch_size=64
train_size = len(X_bert_train)
for epoch in tqdm(range(epochs)):
    total_loss=0
    for idx in range(0, train_size, batch_size):
        optimizer.zero_grad()
        batch_x=train_input_ids[idx : idx+batch_size if idx+batch_size<=train_size else train_size]
        batch_y=y_train[idx : idx+batch_size if idx+batch_size<=train_size else train_size]
        loss, logit = model(input_ids=batch_x, labels=batch_y)
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
    print(total_loss)





  0%|          | 0/5 [00:00<?, ?it/s][A[A[A[A



 20%|██        | 1/5 [10:04<40:17, 604.38s/it][A[A[A[A

61.196483850479126






 40%|████      | 2/5 [21:14<31:12, 624.06s/it][A[A[A[A

56.97632694244385






 60%|██████    | 3/5 [32:35<21:22, 641.32s/it][A[A[A[A

50.6916778087616






 80%|████████  | 4/5 [43:56<10:53, 653.14s/it][A[A[A[A

44.791101932525635






100%|██████████| 5/5 [55:15<00:00, 663.00s/it][A[A[A[A

37.43970739841461





In [458]:
with torch.no_grad():
    loss, train_logit = model(input_ids=train_input_ids, labels=y_train)
    loss, valid_logit = model(input_ids=valid_input_ids, labels=y_valid)

In [459]:
print("train_acc:",get_acc(train_logit, y_train))
print("valid_acc:", get_acc(valid_logit, y_valid))

train_acc: 0.7572916666666667
valid_acc: 0.7323232323232324
