In [2]:
import streamlit as st
import re
import emoji
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")

# Packages for NLP
import torch
import transformers
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer
from transformers import BertModel
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

# Packages for data preprocessing
import pandas as pd
import numpy as np
import os
import random

PyTorch version 1.2.0 available.
TensorFlow version 2.0.0 available.


In [28]:
def give_emoji_free_text(text):
    """Return ``text`` with emoji characters removed.

    Only the emoji characters themselves are dropped. Discarding every
    whitespace-delimited token that contains an emoji (the earlier approach)
    erases whole sentences in unsegmented text such as Japanese, where a
    sentence ending in a single emoji is one "token".

    Runs of whitespace are collapsed to single spaces and leading/trailing
    whitespace is stripped, so emoji-free input yields the same result as
    before.

    Args:
        text: Input string.

    Returns:
        The whitespace-normalised string without emoji characters.
    """
    # NOTE(review): membership is tested per code point against
    # ``emoji.UNICODE_EMOJI``, so this assumes an ``emoji`` release in which
    # that mapping is keyed by the emoji characters themselves -- confirm
    # against the installed version.
    kept_chars = [char for char in text if char not in emoji.UNICODE_EMOJI]
    return ' '.join(''.join(kept_chars).split())

def clean_text(text):
    """Normalise raw text in three passes: emoji, punctuation, digits.

    Args:
        text: Input string.

    Returns:
        The string after ``give_emoji_free_text``, with every character that
        is neither a word character nor whitespace removed, and with all
        characters for which ``str.isdigit`` is true removed.
    """
    without_emoji = give_emoji_free_text(text)
    # Delete each run of characters that is not a word character or whitespace.
    without_punct = re.sub(r'[^\w\d\s]+', '', without_emoji)
    # Keep only the characters that str.isdigit() does not classify as digits.
    return ''.join(filter(lambda ch: not ch.isdigit(), without_punct))

def argmax(lst):
    """Return the index of the largest element of ``lst``.

    When several elements share the maximum, the lowest index wins.
    Raises ``ValueError`` for an empty sequence.
    """
    best_index, _ = max(enumerate(lst), key=lambda pair: pair[1])
    return best_index

In [3]:
def seed_everything(seed=17):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy's legacy global generator, and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects hash randomisation of Python processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # Without the torch seeds, dropout and weight initialisation stay random
    # between runs even though the other generators are fixed.
    torch.manual_seed(seed)
    # Safe to call on CPU-only machines; it is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)

class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    The encoder is loaded from the module-level ``PRE_TRAINED_MODEL_NAME``,
    which therefore has to be defined before the class is instantiated.
    """

    def __init__(self, n_classes):
        """Build the network.

        Args:
            n_classes: Number of output classes of the linear head.
        """
        super().__init__()
        self.bert_layer = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.dropout = nn.Dropout(p=0.25)
        self.classifier = nn.Linear(self.bert_layer.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return per-class probabilities for a batch.

        Args:
            input_ids: Token id tensor passed to the BERT encoder.
            attention_mask: Attention mask tensor passed to the BERT encoder.

        Returns:
            Tensor of softmax-normalised scores (normalised over dim 1), i.e.
            probabilities rather than raw logits.
        """
        encoder_outputs = self.bert_layer(
            input_ids=input_ids,
            attention_mask=attention_mask)
        # Take the pooled output by position instead of unpacking exactly two
        # values: the encoder may return additional entries (for example when
        # hidden states or attentions are requested), which would make a
        # two-name unpack raise.
        pooled_output = encoder_outputs[1]
        main = self.dropout(pooled_output)
        return F.softmax(self.classifier(main), dim=1)

In [19]:
# Checkpoint name used by both the tokenizer below and SentimentClassifier.
PRE_TRAINED_MODEL_NAME = 'bert-base-japanese-char-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Index i of a model output row is reported as class_name[i].
class_name = ["negative", "positive"]
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# SECURITY: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source.
# map_location lets a checkpoint saved on a GPU machine load on a CPU-only host.
model = torch.load("best_bert.pkl", map_location=device)
model.to(device)
# Inference only: switch off dropout so predictions are deterministic.
# eval() returns the module, so the cell still displays the architecture.
model.eval()

loading file https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking/vocab.txt from cache at C:\Users\YangWang/.cache\torch\transformers\7f2c06ef019d0d85f219ef3f4255250db9494e235d992c3c8460cdd338361721.b4eb206328b3e32243c87443c4f4ce514b090af2cf984cb8110b22cbd2cab419


SentimentClassifier(
  (bert_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [12]:
# Clean a sample sentence, then tokenise it with the same options that
# analyze_polarity uses.
text = clean_text(
    "ダウンロードはネットが必要ですが、その後はオフラインで聞くことが出来てとても便利です。"
)

encoding = tokenizer.encode_plus(
    text,
    add_special_tokens=True,
    max_length=128,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors="pt",
)

In [13]:
# Display the tokenizer output (input ids and attention mask) for the sample sentence.
encoding

{'input_ids': tensor([[  2, 156, 111,  21,  67,  13,  55,  12, 222,  49,  38,  20, 623, 306,
          17,  46,  20,  78,   6,  99,  12, 103,  65,  47,  40,  21,  17, 805,
          66,  44,  14,   3]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}

In [48]:
def analyze_polarity(text, model, tokenizer, class_name, max_length=128):
    """Predict the sentiment label of a single piece of text.

    The text is cleaned with ``clean_text``, tokenised, and passed through
    ``model`` in inference mode (dropout disabled, no gradient tracking).

    Args:
        text: Raw input string.
        model: ``torch.nn.Module`` called as ``model(input_ids, attention_mask)``
            and returning one row of per-class scores per input.
        tokenizer: Tokenizer exposing ``encode_plus``.
        class_name: Sequence of labels; the label at the index of the highest
            score is returned.
        max_length: Sequence length passed to the tokenizer (default 128,
            the value previously hard-coded).

    Returns:
        The element of ``class_name`` with the highest score.
    """
    text = clean_text(text)
    encoding = tokenizer.encode_plus(
        text,
        max_length=max_length,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors="pt")

    # Send the inputs to wherever the model's weights live instead of relying
    # on a notebook-level ``device`` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Force eval mode for the forward pass so dropout cannot randomise the
    # prediction, then restore whatever mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for prediction; skipping them saves memory.
        with torch.no_grad():
            scores = model(
                encoding["input_ids"].to(target_device),
                encoding["attention_mask"].to(target_device))
    finally:
        model.train(was_training)

    # Single input, so take the first (only) row of the batch.
    scores = scores.to("cpu").tolist()[0]
    return class_name[argmax(scores)]

In [50]:
# Classify a sample sentence and display the predicted label.
text = "かなり改悪された感じかな。 なんでバージョンアップする度に悪くなっていくんだろ… 曲の変わり目で勝手にフェードアウト/インしてオーバーラップさせるから、曲の頭が聞こえないケースが起きてストレスが溜まるね。"
prediction = analyze_polarity(text, model, tokenizer, class_name)
prediction

'negative'