# Import libraries

In [182]:
# 필요한 library가 있다면 추가하셔도 됩니다.

# load all necessary libraries
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch import nn, optim
%matplotlib inline

# libraries for nlp task
import regex as re
import nltk, re, string
from nltk import FreqDist
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

#machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

import torch
import torch.nn as nn
torch.manual_seed(42)
# filtering warnings
import warnings
warnings.filterwarnings('ignore')

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
pd.set_option('display.max_columns', None)

from nltk.corpus import wordnet, stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hyundong\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hyundong\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hyundong\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Preprocessing

data의 preprocessing을 진행합니다. 아래 조건에 맞게 전처리를 진행합니다. 필요한만큼 셀을 사용하시면 됩니다.

조건에 맞는 전처리를 진행하고 각각의 실행 결과(output) 창을 보여주어야합니다.

- Sentence의 문자를 모두 소문자로 변경
- stopwords의 english stopwords 제거
- WordNetLemmatizer를 이용하여 lemmatize 진행
- 정규 표현식을 사용하여 url 제거
- 정규 표현식을 사용하여 알파벳을 제외한 punctuation 포함 문자 제거
- sklearn의 LabelEncoder를 이용하여 Sentiment에 대해 label encoding 진행
- FreqDist를 사용하여 word encoding
- 제일 길이가 긴 문장을 기준으로 zero-padding 진행

In [129]:
train = pd.read_csv('./train_data.csv') # 자신의 file path에 맞게 수정하시면 됩니다.
train

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5795,"In 2009 , it reported net sales of approximate...",neutral
5796,"H1 '08 H1 '07 Q2 '08 Q2 '07 in mln euro , unle...",neutral
5797,`` Low energy consumption and flexible loading...,neutral
5798,$SPY $MITK fast 56pc dive http://stks.co/3ffN $$,negative


In [130]:
def cleaning_sentences(data):     
    data = data.lower().split()
        
    stops = set(stopwords.words('english'))
    data = [word for word in data if not word in stops] 

    lemmatizer = WordNetLemmatizer()
    data = [lemmatizer.lemmatize(word) for word in data]

    data = ' '.join(data)
    data = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",' ',data)   
    data = re.sub('[^a-zA-z]',' ',data)

    return data


In [131]:
train['Sentence'] = train['Sentence'].apply(lambda x : cleaning_sentences(x))

label_encoder = LabelEncoder()
train['Sentiment'] = label_encoder.fit_transform(train['Sentiment'])

all_words = ' '.join(train['Sentence']).split()
freq_dist = FreqDist(all_words)

sorted_word_freq = sorted(freq_dist.items(), key=lambda x: x[1], reverse=True)

word_index = {word: index+1 for index, (word,_) in enumerate(sorted_word_freq)}

train['Sentence'] = train['Sentence'].apply(lambda x : [word_index[word] for word in x.split()])

# 제일 길이가 긴 문장을 기준으로 zero-padding 진행
max_len = max(train['Sentence'].apply(len))
train['Sentence'] = pad_sequences(train['Sentence'], maxlen=max_len, padding='post').tolist()

# 최종 전처리된 데이터 확인
train.head()


Unnamed: 0,Sentence,Sentiment
0,"[3289, 56, 2596, 1149, 2, 1668, 39, 670, 618, ...",2
1,"[4747, 2598, 3291, 137, 2599, 0, 0, 0, 0, 0, 0...",0
2,"[77, 14, 529, 2, 11, 5, 1670, 1, 24, 1, 24, 17...",2
3,"[62, 7, 295, 3292, 1895, 262, 88, 3, 16, 13, 1...",1
4,"[217, 1150, 358, 359, 833, 41, 141, 587, 4749,...",1


# Model
전처리를 진행한 데이터를 이용하여 학습을 진행하는 부분입니다. 아래의 조건들에 맞는 코드를 작성하고, 결과를 확인하면 됩니다. 필요한만큼 셀을 사용하시면 됩니다.

조건에 맞는 코드를 작성하고, 결과창(output)을 보여주어야합니다.

- train data와 validation data 분리 → train : 8 / valid : 2 비율로 분리
- target의 경우 classification이기 때문에 categorical하게 바꿔야 합니다.
- MLP 모델 구현 → 하이퍼파라미터 설정은 자유
- train / valid 과정 구현
- train set loss/ validation set loss에 대한 learning curve 출력하기
- 학습된 model은 학번_model.pth로 파일 save → ex) 20XXXXXXX_model.pth

In [138]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)
    
X_train, X_valid, y_train, y_valid = train_test_split(train['Sentence'],train['Sentiment'],test_size=0.2,random_state=42)
# train['Sentiment'] = to_categorical(train['Sentiment'], num_classes=3)
y_train = to_categorical(y_train,num_classes=3)
y_valid = to_categorical(y_valid,num_classes=3)

X_train = torch.FloatTensor(X_train.tolist()).to(device)
X_valid = torch.FloatTensor(X_valid.tolist()).to(device)
y_train = torch.FloatTensor(y_train.tolist()).to(device)
y_valid = torch.FloatTensor(y_valid.tolist()).to(device)

print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

torch.Size([4640, 42]) torch.Size([4640, 3]) torch.Size([1160, 42]) torch.Size([1160, 3])


In [161]:
print(len(set(X_train.shape)))

2


torch.Size([4640, 42])

In [169]:
model = nn.Sequential(
          nn.Linear(42, 10), # input_layer = 2, hidden_layer1 = 10
          nn.ReLU(),
          nn.Linear(10, 10), # hidden_layer1 = 10, hidden_layer2 = 10
          nn.ReLU(),
          nn.Linear(10, 10), # hidden_layer2 = 10, hidden_layer3 = 10
          nn.ReLU(),
          nn.Linear(10, 3), # hidden_layer3 = 10, output_layer = 1
          nn.Softmax(dim=1)  
          ).to(device)

criterion = torch.nn.CrossEntropyLoss(weight=None, ignore_index=-100, reduction='mean').to(device) #다중 클래스 분류에 적합한 CrossEntropy loss를 위해서 soft max 사용
optimizer = torch.optim.SGD(model.parameters(), lr=1)

for epoch in range(10001):
    optimizer.zero_grad()
    # forward 연산
    hypothesis = model(X_train)

    # 비용 함수
    cost = criterion(hypothesis, y_train)
    cost.backward()
    optimizer.step()

    # 100의 배수에 해당되는 에포크마다 비용을 출력
    if epoch % 100 == 0:
        print(epoch, cost.item())

0 1.2141057252883911
100 1.0053223371505737
200 1.0089457035064697
300 1.0141444206237793
400 1.0151422023773193
500 1.0122416019439697
600 1.0119913816452026


KeyboardInterrupt: 

In [186]:
model = nn.Sequential(
          nn.Linear(42, 30), # input_layer = 2, hidden_layer1 = 10
          nn.ReLU(),
          nn.Linear(30, 20), # hidden_layer1 = 10, hidden_layer2 = 10
          nn.ReLU(),
          nn.Linear(20, 10), # hidden_layer2 = 10, hidden_layer3 = 10
          nn.ReLU(),
          nn.Linear(10, 3), # hidden_layer3 = 10, output_layer = 1
          nn.Softmax(dim=1)  
          ).to(device)

learning_rate = 1

criterion = torch.nn.CrossEntropyLoss(weight=None, ignore_index=-100, reduction='mean').to(device) #다중 클래스 분류에 적합한 CrossEntropy loss를 위해서 soft max 사용
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


for epoch in range(15001):
    optimizer.zero_grad()
    # forward 연산
    hypothesis = model(X_train)

    # 비용 함수
    cost = criterion(hypothesis, y_train)
    cost.backward()
    optimizer.step()

    # 100의 배수에 해당되는 에포크마다 비용을 출력
    if epoch % 100 == 0:
        print(epoch, cost.item())

0 1.224942684173584
100 1.016479730606079
200 1.0078165531158447
300 0.9953939318656921
400 0.9920290112495422
500 0.9656710624694824
600 0.9668888449668884
700 0.976428210735321
800 0.9636017084121704
900 0.964198887348175
1000 0.9519592523574829
1100 0.9488985538482666
1200 0.9664783477783203
1300 0.943958580493927
1400 0.9611387848854065
1500 0.9481385350227356
1600 0.956175684928894
1700 1.013801097869873
1800 1.0150519609451294
1900 1.01504385471344
2000 1.0150361061096191
2100 1.0150288343429565
2200 1.0150222778320312
2300 1.015015959739685
2400 1.015009880065918
2500 1.015004277229309
2600 1.0119874477386475
2700 1.0115654468536377
2800 1.0151594877243042
2900 1.01515793800354
3000 1.0151567459106445
3100 1.0151554346084595
3200 1.015154242515564
3300 1.015153169631958
3400 1.015151858329773
3500 1.0151506662368774
3600 1.0151493549346924
3700 1.0151482820510864
3800 1.015147089958191
3900 1.0151457786560059
4000 1.0151447057724
4100 1.015143632888794
4200 1.0151424407958984
43

In [184]:
with torch.no_grad():
    hypothesis = model(X_valid)
    predicted = (hypothesis > 0.7).float()
    accuracy = (predicted == y_valid).float().mean()
    print('모델의 출력값(Hypothesis): ', hypothesis.detach().cpu().numpy())
    print('모델의 예측값(Predicted): ', predicted.detach().cpu().numpy())
    print('실제값(Y): ', y_valid.cpu().numpy())
    print('정확도(Accuracy): ', accuracy.item())

모델의 출력값(Hypothesis):  [[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]
모델의 예측값(Predicted):  [[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]
실제값(Y):  [[1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]]
정확도(Accuracy):  0.6977011561393738


In [None]:
# nn.Linear(42, 10, bias=True) 정확도 0.6876
# nn.Linear(42, 10) 정확도 0.688
# ReLU 사용 정확도 0.693
# 층 1개 제거 0.6977

"""
model = nn.Sequential(
          nn.Linear(42, 30), # input_layer = 2, hidden_layer1 = 10
          nn.ReLU(),
          nn.Linear(30, 20), # hidden_layer1 = 10, hidden_layer2 = 10
          nn.ReLU(),
          nn.Linear(20, 10), # hidden_layer2 = 10, hidden_layer3 = 10
          nn.ReLU(),
          nn.Linear(10, 3), # hidden_layer3 = 10, output_layer = 1
          nn.Softmax(dim=1)  
          ).to(device)
"""

# 참고 자료

- 아래 markdown cell에 내용 및 코드를 참고한 부분을 작성하시면 됩니다.

딥러닝을 이용한 자연어 처리 입문 : https://wikidocs.net/49071
PyTorch로 시작하는 딥 러닝 입문 : https://wikidocs.net/61010