In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. Load Data

In [2]:
train_df = pd.read_csv(r'/content/drive/MyDrive/MLDL_개인프로젝트/Corona tweets NLP - Text Classification/datasets/Corona_NLP_train.csv', encoding='latin-1')
train_df

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral
41153,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative
41154,44953,89905,,14-04-2020,You know itÂs getting tough when @KameronWild...,Positive
41155,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral


In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [4]:
test_df = pd.read_csv(r'/content/drive/MyDrive/MLDL_개인프로젝트/Corona tweets NLP - Text Classification/datasets/Corona_NLP_test.csv', encoding='latin-1')
test_df

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...,...,...,...,...
3793,3794,48746,Israel ??,16-03-2020,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,3795,48747,"Farmington, NM",16-03-2020,Did you panic buy a lot of non-perishable item...,Negative
3795,3796,48748,"Haverford, PA",16-03-2020,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,3797,48749,,16-03-2020,Gov need to do somethings instead of biar je r...,Extremely Negative


In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       3798 non-null   int64 
 1   ScreenName     3798 non-null   int64 
 2   Location       2964 non-null   object
 3   TweetAt        3798 non-null   object
 4   OriginalTweet  3798 non-null   object
 5   Sentiment      3798 non-null   object
dtypes: int64(2), object(4)
memory usage: 178.2+ KB


## 2. Basic EDA on Train data

In [6]:
# value count of sentiment
train_df['Sentiment'].value_counts()

Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: Sentiment, dtype: int64

In [7]:
# check train, test data ratio
print(len(train_df['OriginalTweet']))
print(len(test_df['OriginalTweet']))
print(len(train_df['OriginalTweet']) / len(test_df['OriginalTweet']))

41157
3798
10.836492890995261


## 3. Preprocess text data

In [8]:
# 1. remove special character & url address letter starts with https~ & \r, \n letter & transform to lower case
import re

def preprocess_text(text):
    # remove special character
    refined_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # remove url address letter
    refined_text = ' '.join([i for i in refined_text.split(' ') if not i.startswith('http')])
    # remove \r, \n
    refined_text = refined_text.replace('\r', '')
    refined_text = refined_text.replace('\n', '')
    # change to lower case
    refined_text = refined_text.lower()
    
    return refined_text

# 훈련데이터 Original Tweet 기본 전처리
train_text_data = train_df['OriginalTweet'].apply(preprocess_text).to_list()
train_text_data[0]

'menyrbie philgahan chrisitv and and'

In [9]:
# 검증데이터 Original Tweet 기본 전처리
test_text_data = test_df['OriginalTweet'].apply(preprocess_text).to_list()
test_text_data[0]

'trending new yorkers encounter empty supermarket shelves pictured wegmans in brooklyn soldout online grocers foodkick maxdelivery as coronavirusfearing shoppers stock up'

In [10]:
%%capture
!pip install nltk

In [11]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# 2. tokenize based on preprocessed text
from nltk.tokenize import word_tokenize

def tokenize_text(text: str) -> list:
  return word_tokenize(text)

train_text_list = [tokenize_text(text) for text in train_text_data]
test_text_list = [tokenize_text(text) for text in test_text_data]

print(train_text_list[0])
print(test_text_list[0])

print(len(train_text_list))
print(len(test_text_list))

['menyrbie', 'philgahan', 'chrisitv', 'and', 'and']
['trending', 'new', 'yorkers', 'encounter', 'empty', 'supermarket', 'shelves', 'pictured', 'wegmans', 'in', 'brooklyn', 'soldout', 'online', 'grocers', 'foodkick', 'maxdelivery', 'as', 'coronavirusfearing', 'shoppers', 'stock', 'up']
41157
3798


In [13]:
# 3. remove stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text_list: list):
  final_res = []
  for text in text_list:
    res = []
    for word in text:
      if word not in stop_words:
        res.append(word)
    final_res.append(res)

  return final_res

train_text_dataset = remove_stopwords(train_text_list)
test_text_dataset = remove_stopwords(test_text_list)

# 모델 훈련 위한 텍스트 데이터셋
print(train_text_dataset[0])

# 모델 검증 위한 텍스트 데이터셋
print(test_text_dataset[0])

['menyrbie', 'philgahan', 'chrisitv']
['trending', 'new', 'yorkers', 'encounter', 'empty', 'supermarket', 'shelves', 'pictured', 'wegmans', 'brooklyn', 'soldout', 'online', 'grocers', 'foodkick', 'maxdelivery', 'coronavirusfearing', 'shoppers', 'stock']


In [14]:
print(len(train_text_dataset))
print(len(test_text_dataset))

41157
3798


In [15]:
# 4. train text integer encoding
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text_dataset)
encoded_train_dataset = tokenizer.texts_to_sequences(train_text_dataset)

print(encoded_train_dataset[:5])

[[23506, 23507, 12866], [403, 778, 2495, 180, 2897, 764, 1166, 998, 347, 291, 764, 1166, 2495, 1097, 3951, 3614, 4928, 380, 13, 12, 2289, 12867, 3094, 110, 963, 3270, 155], [1, 747, 1673, 224, 227, 1220, 1722, 12, 173, 81, 2, 73], [4, 29, 31, 23508, 33, 20, 157, 4, 76, 60, 17, 46, 868, 46, 23509, 2, 2, 1, 4394, 23510, 23511], [634, 24, 5, 2, 23512, 47, 4094, 4, 29, 23513, 143, 1, 748, 218, 39, 33, 20, 1598, 23514, 10985, 300, 4394]]


In [16]:
# test data 정수형 인코딩 변환
tokenizer = Tokenizer()
tokenizer.fit_on_texts(test_text_dataset)
encoded_test_dataset = tokenizer.texts_to_sequences(test_text_dataset)

print(encoded_test_dataset[:5])

[[2160, 79, 2616, 2617, 54, 11, 31, 5010, 1429, 1602, 3405, 10, 987, 5011, 5012, 5013, 272, 6], [785, 168, 48, 100, 3406, 3407, 2161, 127, 5014, 55, 786, 5015, 2, 444, 1171, 13], [168, 328, 1851, 488, 2], [12, 20, 1603, 1852, 333, 1430, 272, 6, 5016, 42, 445, 988, 5017, 2162, 5018, 1284, 533, 2, 1285, 5019, 5020, 5021, 5022, 5023, 5024, 787], [215, 5025, 2, 1604, 164, 1, 5026, 3408, 5027, 5028, 372, 43, 88, 36, 20, 446, 334, 2618, 129, 36, 20, 15, 16]]


In [17]:
len(encoded_train_dataset) / len(encoded_test_dataset)

10.836492890995261

In [18]:
# 5. padding
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_dataset = pad_sequences(encoded_train_dataset, padding='post')
print(train_dataset[:5])

[[23506 23507 12866     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [  403   778  2495   180  2897   764  1166   998   347   291   764  1166
   2495  1097  3951  3614  4928   380    13    12  2289 12867  3094   110
    963  3270   155     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [    1   747  1673   224   227  1220  1722    12   173    81     2    73
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [    4    29    31 23508    33    20   157     4    76    60    17    46
    868    46 23509     2     2     1  4394 23510 23511     0     0     0
      0     0  

In [19]:
test_dataset = pad_sequences(encoded_test_dataset, padding='post')
print(test_dataset[:5])

[[2160   79 2616 2617   54   11   31 5010 1429 1602 3405   10  987 5011
  5012 5013  272    6    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [ 785  168   48  100 3406 3407 2161  127 5014   55  786 5015    2  444
  1171   13    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [ 168  328 1851  488    2    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [  12   20 1603 1852  333 1430  272    6 5016   42  445  988 5017 2162
  5018 1284  533    2 1285 5019 5020 5021 5022 5023 5024  787    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [ 215 5025    2 1604  164    1 5026 3408 5027 5028  372   43   88   36
    20  446 

In [20]:
# 6. transform class label to number
sentiment_label = {
    'Extremely Positive': 5,
    'Positive': 4,
    'Neutral': 3,
    'Negative': 2,
    'Extremely Negative': 1
}

train_label_dataset = train_df['Sentiment'].apply(lambda x: sentiment_label[x]).to_list()
test_label_dataset = test_df['Sentiment'].apply(lambda x: sentiment_label[x]).to_list()

print(train_label_dataset[:3])
print(test_label_dataset[:3])

print(len(train_label_dataset))
print(len(test_label_dataset))

[3, 4, 4]
[1, 4, 5]
41157
3798


In [22]:
# 7. dataset 생성
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class CreateDataset(Dataset):
    
    def __init__(self, data, labels):
        super().__init__()
        self.data = data
        self.labels = labels
        
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx]), torch.tensor(self.labels[idx])
    
    def __len__(self):
        return len(self.labels)

train_dataset = CreateDataset(train_dataset, train_label_dataset)
test_dataset = CreateDataset(test_dataset, test_label_dataset)

In [25]:
print(train_dataset[:2])
print(len(train_dataset), len(test_dataset))

(tensor([[23506, 23507, 12866,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [  403,   778,  2495,   180,  2897,   764,  1166,   998,   347,   291,
           764,  1166,  2495,  1097,  3951,  3614,  4928,   380,    13,    12,
          2289, 12867,  3094,   110,   963,  3270,   155,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]], dtype=torch.int32), tensor([3, 4]))
41157 3798


In [26]:
from torch.utils.data import DataLoader

BATCH_SIZE = 64

# dataloader 생성
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

## 3. Modeling

In [28]:
# random seed 고정
import random

SEED = 5
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fc896bb7f30>

In [45]:
# device 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [31]:
# 하이퍼파라미터
lr = 0.001
EPOCHS = 50

vocab_size = 72245
n_classes = 5
print('단어 집합의 크기 : {}'.format(vocab_size))
print('클래스의 개수 : {}'.format(n_classes))

단어 집합의 크기 : 72245
클래스의 개수 : 5


In [46]:
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers,
                          batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        x = self.embed(x)
        h_0 = self._init_state(batch_size=x.size(0)) # 첫번째 히든 스테이트를 0벡터로 초기화
        x, _ = self.gru(x, h_0)  # GRU의 리턴값은 (배치 크기, 시퀀스 길이, 은닉 상태의 크기)
        h_t = x[:,-1,:] # (배치 크기, 은닉 상태의 크기)의 텐서로 크기가 변경됨. 즉, 마지막 time-step의 은닉 상태만 가져온다.
        self.dropout(h_t)
        logit = self.out(h_t)  # (배치 크기, 은닉 상태의 크기) -> (배치 크기, 출력층의 크기)
        return logit

    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [48]:
model = GRU(1, 256, vocab_size, 128, n_classes, 0.5).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [54]:
import torch.nn.functional as F

def train(model, optimizer, train_iter):
    model.train()
    for b, batch in enumerate(train_iter):
        x, y = batch.to(device), batch.to(device)
        y.data.sub_(1)  # 레이블 값을 0과 1로 변환
        optimizer.zero_grad()

        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()

In [55]:
def evaluate(model, val_iter):
    """evaluate model"""
    model.eval()
    corrects, total_loss = 0, 0
    for batch in val_iter:
        x, y = batch.text.to(device), batch.label.to(device)
        y.data.sub_(1) # 레이블 값을 0과 1로 변환
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction='sum')
        total_loss += loss.item()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [59]:
import os

best_val_loss = None
for e in range(1, EPOCHS+1):
    train(model, optimizer, train_loader)
    val_loss, val_accuracy = evaluate(model, test_loader)

    print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f" % (e, val_loss, val_accuracy))

    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("snapshot"):
            os.makedirs("snapshot")
        torch.save(model.state_dict(), './snapshot/txtclassification.pt')
        best_val_loss = val_loss

AttributeError: ignored