# インポート

In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

#LightGBM
import lightgbm as lgb

#DNN
import tensorflow as tf

#決定木
from sklearn.tree import DecisionTreeClassifier

#ベクトル化
from transformers import BertJapaneseTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sentence_transformers import models
import torch

# データの読み込みと正規化

In [9]:
# Load the vectorized dataset; the first CSV column becomes the index.
df = pd.read_csv("../data/vector_split.csv", index_col=0)
# The first two columns are kept apart from the rest (later cells read "text"
# and "sentiment" from them); every remaining column is a feature.
df_text = df.iloc[:, :2]
df_vector = df.iloc[:, 2:]
df_status = pd.read_csv("../data/status.csv", index_col=0)
df_onehot = pd.read_csv("../data/onehot.csv", index_col=0)

# Min-max normalization of every feature column.
# Rows 2 and 3 (by position) of status.csv supply the per-column minimum and
# maximum. NOTE(review): presumably these were computed on the training data
# and share the column order of df_vector -- confirm against the file.
n_features = df_vector.shape[1]
vec_min = df_status.iloc[2, :n_features].to_numpy(dtype=float)
vec_max = df_status.iloc[3, :n_features].to_numpy(dtype=float)
# The arithmetic builds a NEW frame, so df_vector (and df) keep their raw
# values and the normalization is not applied twice by accident.
df_vector_norm = (df_vector - vec_min) / (vec_max - vec_min)

# モデルの読み込み

In [3]:
# LightGBM classifier, restored from its pickle.
# NOTE: pickle.load can execute arbitrary code -- only load trusted model files.
with open('../model/lgb.pkl', 'rb') as lgb_file:
    model_lgb = pickle.load(lgb_file)

# Keras DNN, restored from the saved-model directory.
model_dnn = tf.keras.models.load_model("../model/dnn")

# Decision tree classifier, restored from its pickle (same trust caveat).
with open('../model/clf.pkl', 'rb') as clf_file:
    model_clf = pickle.load(clf_file)

In [8]:
# Show the repr of each loaded model, one per line.
print(model_lgb, model_dnn, model_clf, sep="\n")

LGBMClassifier()
<keras.engine.sequential.Sequential object at 0x000001C52103AB20>
DecisionTreeClassifier(max_depth=7)


# アンサンブル

In [20]:
# Clear frames left over from an earlier run and echo them as a sanity check.
df_lgb = df_dnn = df_clf = None
print(df_lgb, df_dnn, df_clf)

# Each model yields three probability columns, named "<model>_<label>".
# NOTE(review): assumes every model emits its classes in the order -1, 0, 1 -- confirm.
class_labels = ("-1", "0", "1")

df_lgb = pd.DataFrame(
    model_lgb.predict_proba(df_vector_norm),
    columns=[f"lgb_{label}" for label in class_labels],
)
display(df_lgb.head(1))

df_dnn = pd.DataFrame(
    model_dnn.predict(df_vector_norm),
    columns=[f"dnn_{label}" for label in class_labels],
)
display(df_dnn.head(1))

df_clf = pd.DataFrame(
    model_clf.predict_proba(df_vector_norm),
    columns=[f"clf_{label}" for label in class_labels],
)
display(df_clf.head(1))

None None None


Unnamed: 0,lgb_-1,lgb_0,lgb_1
0,0.768978,0.126345,0.104676




Unnamed: 0,dnn_-1,dnn_0,dnn_1
0,0.753309,0.158197,0.088494


Unnamed: 0,clf_-1,clf_0,clf_1
0,0.359649,0.307018,0.333333


In [17]:
# Put the text/label columns next to the three models' probability columns.
# The prediction frames carry a fresh 0..n-1 index, so the index of df is reset
# to make the concat align rows by position instead of by the CSV's labels.
df_esb = pd.concat(
    [df.iloc[:, :2].reset_index(drop=True), df_dnn, df_lgb, df_clf],
    axis=1,
)

# Soft voting: unweighted mean of the three models' probabilities per class.
class_labels = ["-1", "0", "1"]
for label in class_labels:
    df_esb[label] = (
        df_esb[f"dnn_{label}"] + df_esb[f"lgb_{label}"] + df_esb[f"clf_{label}"]
    ) / 3

# Column position of the largest mean probability (0, 1, 2) shifted to the
# label values (-1, 0, 1); computed for all rows at once.
df_esb["pred"] = df_esb[class_labels].to_numpy().argmax(axis=1) - 1
df_esb.head(1)

Unnamed: 0,text,sentiment,dnn_-1,dnn_0,dnn_1,lgb_-1,lgb_0,lgb_1,clf_-1,clf_0,clf_1,-1,0,1,pred
0,ぼけっとしてたらこんな時間。チャリあるから食べにでたいのに…,-1,0.753309,0.158197,0.088494,0.768978,0.126345,0.104676,0.359649,0.307018,0.333333,0.627312,0.197187,0.175501,-1


In [18]:
# Accuracy: share of rows whose ensemble prediction equals the sentiment label.
n_correct = int((df_esb["sentiment"] == df_esb["pred"]).sum())
print(f"正答率：{n_correct / len(df_esb) * 100}％")

正答率：70.49428571428571％


# 関数の定義

In [101]:
def ensemble(vector, status_path="../data/status.csv"):
    """Classify feature vectors by averaging three models' class probabilities.

    The input is min-max normalized with the per-column statistics stored in
    ``status_path``, scored by the module-level models ``model_lgb``,
    ``model_dnn`` and ``model_clf``, and the three probability matrices are
    averaged (unweighted soft voting).

    Args:
        vector: DataFrame of raw feature columns, one row per sample. It is
            NOT modified; normalization works on a new frame.
        status_path: CSV whose rows 2 and 3 (by position, after the index
            column) hold each feature's minimum and maximum.
            NOTE(review): presumably in the same column order as ``vector``
            -- confirm against the file.

    Returns:
        DataFrame with a single column ``"pred"`` holding ``"negative"``,
        ``"neutral"`` or ``"positive"`` per input row, on a 0..n-1 index.

    Raises:
        ValueError: if the status file has fewer columns than ``vector`` or
            the averaged probabilities do not have exactly three columns.
    """
    print("初期化中....")

    # Normalization
    df_status = pd.read_csv(status_path, index_col=0)
    n_features = vector.shape[1]
    if df_status.shape[1] < n_features:
        raise ValueError(
            f"status file has {df_status.shape[1]} columns but the input has "
            f"{n_features} feature columns"
        )
    vec_min = df_status.iloc[2, :n_features].to_numpy(dtype=float)
    vec_max = df_status.iloc[3, :n_features].to_numpy(dtype=float)
    # New frame: the caller's data stays untouched, so repeated calls on the
    # same input give the same result.
    vector_norm = (vector - vec_min) / (vec_max - vec_min)

    # Class probabilities from each model.
    # NOTE(review): assumes every model orders its classes as -1, 0, 1 -- confirm.
    proba_lgb = np.asarray(model_lgb.predict_proba(vector_norm))
    proba_dnn = np.asarray(model_dnn.predict(vector_norm))
    proba_clf = np.asarray(model_clf.predict_proba(vector_norm))

    # Ensemble: unweighted mean, then the most probable class per row.
    proba_mean = (proba_dnn + proba_lgb + proba_clf) / 3
    class_names = np.array(["negative", "neutral", "positive"])
    if proba_mean.ndim != 2 or proba_mean.shape[1] != len(class_names):
        raise ValueError(
            f"expected probabilities of shape (n, 3), got {proba_mean.shape}"
        )

    return pd.DataFrame({"pred": class_names[proba_mean.argmax(axis=1)]})

In [130]:
# Preview the first rows of df.
# NOTE(review): the output shows Japanese sentiment labels, so this looks like the
# df rebuilt from df_drop in the vectorization cell rather than the one read from
# vector_split.csv -- confirm by running the notebook top to bottom.
df.head()

Unnamed: 0,text,sentiment,vector_0,vector_1,vector_2,vector_3,vector_4,vector_5,vector_6,vector_7,...,vector_758,vector_759,vector_760,vector_761,vector_762,vector_763,vector_764,vector_765,vector_766,vector_767
0,- The New RX - 走りの楽しさを最大化させるコックピット。 多くの機能をディスプ...,ニュートラル,0.108495,-0.286713,0.029664,-0.145083,-0.077757,0.235934,0.161421,0.040434,...,-0.183763,-0.052222,0.206732,0.075753,-0.273582,0.038897,-0.002719,-0.212596,-0.397314,-0.042434
1,"""◇ トヨタ　RAV4 09-12 ヘッドライト レンズ　リペア用　左右１セット　-RE-1...",ニュートラル,-0.231273,-0.226288,-0.063712,-0.219358,-0.136929,0.113497,0.198523,-0.307485,...,-0.13259,0.029339,0.17185,0.361812,-0.139428,0.211578,-0.003187,-0.433962,-0.13161,0.069377
2,"""FWDモデルだけど,電費が良くない"" ""<回答> 基本的には各車で走行条件を極力そろえてい...",ネガティブ,-0.045604,-0.195024,-0.181397,-0.274775,0.028938,0.305653,0.105273,0.14298,...,-0.078807,-0.047943,-0.015218,-0.105774,-0.174539,-0.175561,0.147552,-0.317954,-0.165231,-0.103092
3,"""G's""(現行・特別仕様車) 「SPORTSCARS for ALL.すべての人に、スポー...",ニュートラル,0.136061,-0.223842,-0.151229,-0.083908,0.002592,0.16822,0.029885,0.089017,...,-0.148272,-0.08191,0.129447,0.109829,-0.137619,0.067059,-0.063711,-0.13694,-0.151015,0.157262
4,"""トヨタbZ4Xで東京 青森までレースしたらそれどころじゃなかった【SUVのEV4台で充電レ...",ニュートラル,-0.046714,0.029287,-0.188404,-0.294618,-0.076427,0.059908,0.054741,-0.239359,...,-0.01462,0.090699,0.16119,0.078609,0.064066,0.029458,0.054378,-0.150109,-0.049408,0.30447


In [132]:
# Run the ensemble on every column of df from position 2 onward
# (the vector_* columns that follow "text" and "sentiment").
df_return = ensemble(df.iloc[:,2:])

初期化中....


In [137]:
# Place the ensemble predictions next to the first two columns of df.
_df = pd.concat([df.iloc[:, :2], df_return], axis=1)

# Translate the Japanese labels into the English names used in "pred".
label_to_english = {
    "ニュートラル": "neutral",
    "ネガティブ": "negative",
    "ポジティブ": "positive",
}
_df["sentiment"] = _df["sentiment"].replace(label_to_english)

# Accuracy: share of rows whose prediction equals the translated label.
n_correct = int((_df["sentiment"] == _df["pred"]).sum())
print(f"正答率：{n_correct / len(_df) * 100}％")

正答率：60.934766308422894％


In [139]:
# Print the text (column 0) and predicted label (column 2) of the first 100 rows.
# A named variable is used instead of `_`, which IPython reserves for the
# previous cell's output.
df_preview = _df.iloc[:100, :]
for text, pred_label in zip(df_preview.iloc[:, 0], df_preview.iloc[:, 2]):
    print(text)
    print(pred_label)
    print("----------------------------------------")

- The New RX - 走りの楽しさを最大化させるコックピット。 多くの機能をディスプレイ内のソフトスイッチに集約し、スイッチのサイズや形、レイアウト、表示情報など細部までこだわり、直感的に操作できる最適な配置と形状を追求しています。 ⇒ lexus.jp/models/rx/worl… #Lexus #LexusRX pic.twitter.com/SR8FMde1cM
neutral
----------------------------------------
"◇ トヨタ　RAV4 09-12 ヘッドライト レンズ　リペア用　左右１セット　-RE-1043 #◇ #トヨタ #RAV4 #09-12 #ヘッドライト #レンズ #リペア用 #左右１セット #-RE-1043 imp01.thebase.in/items/54782109 "
neutral
----------------------------------------
"FWDモデルだけど,電費が良くない" "<回答> 基本的には各車で走行条件を極力そろえていただいての走行ではあったと思いますが,現状頂いている情報の範囲内では何とも言えない部分がございます" 『bZ4X』東京ー青森長距離走行で実感した「疑問」について考えてみる｜evsmart blog.evsmart.net/electric-vehic…
neutral
----------------------------------------
"G's"(現行・特別仕様車) 「SPORTSCARS for ALL.すべての人に、スポーツカーのある楽しさを。」 トヨタのスポーツブランドG'sとハリアーのコラボレートモデル。 外見だけでなく内装も走りも全てがスポーティー。pic.twitter.com/mpzlUBgMSf
positive
----------------------------------------
"トヨタbZ4Xで東京 青森までレースしたらそれどころじゃなかった【SUVのEV4台で充電レース 日産アリア ヒョンデIONIQ5 テスラモデルY トヨタbZ4X】" を YouTube で見る youtu.be/B1pJc6_oWxs
neutral
-----------------------------

# 文章のベクトル化
参考：  
https://techblog.gmo-ap.jp/2022/12/21/bert_calc_sentence_similarity/

## tokenizerとbertのインスタンス化

In [60]:
# Checkpoint name shared by the tokenizer and the encoder so the two always match.
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
# Tokenizer that turns a sentence into the token ids the encoder consumes.
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
# Bare BertModel; the "weights not used" message in the output refers to the
# checkpoint's `cls.*` weights, which BertModel does not load.
model = BertModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [61]:
def sentence_to_vector(model, tokenizer, sentence):
    """Embed one sentence as the mean of the model's last-layer token vectors.

    Args:
        model: callable encoder (e.g. ``BertModel``) that takes a
            ``(1, seq_len)`` tensor of token ids and returns an object with a
            ``last_hidden_state`` attribute.
        tokenizer: callable returning a mapping with an ``"input_ids"`` list
            for a sentence; it is called with ``truncation=True``.
        sentence: the text to embed.

    Returns:
        1-D tensor: ``last_hidden_state[0]`` averaged over the token axis.
    """
    # Split the sentence into tokens and map them to ids. truncation=True asks
    # the tokenizer to cap over-long texts at its configured maximum length
    # instead of passing the model more tokens than it accepts.
    token_ids = tokenizer(sentence, truncation=True)["input_ids"]

    # The model expects a batch dimension: (1, seq_len).
    input_ids = torch.tensor(token_ids).reshape(1, -1)

    # Inference only, so no gradients; only the last layer is needed, so the
    # intermediate hidden states are not requested.
    with torch.no_grad():
        outputs = model(input_ids)

    # Mean over the token axis of the single batch element.
    return outputs.last_hidden_state[0].mean(dim=0)

In [128]:
# Embed every text in df_drop and append the result as vector_0 ... vector_{d-1}.
# NOTE: df_drop is created in the Excel-loading cell that appears further down
# in this notebook; that cell has to run before this one.
# Each embedding is converted to a float64 array in one step rather than
# element by element.
vector_rows = [
    sentence_to_vector(model, tokenizer, text).numpy().astype(float)
    for text in df_drop["text"]
]
_df_vector = pd.DataFrame(vector_rows).add_prefix("vector_")

# Drop the old index so the rows line up positionally with _df_vector.
df = pd.concat([df_drop.reset_index(drop=True), _df_vector], axis=1)

In [115]:
df_org = pd.read_excel("../data/Stream.xlsx")
# Skip the first five rows and keep the columns at positions 1 and 2.
# .copy() makes this an independent frame, so the assignments below do not
# write into a slice of the sheet that was read.
df_org = df_org.iloc[5:, 1:3].copy()
# Plain column assignment works on every pandas version
# (set_axis(..., inplace=True) was removed in pandas 2.0).
df_org.columns = ["text", "sentiment"]

# Literal (non-regex) character replacements, applied in this order.
text_replacements = [
    ("\n", " "),
    ("�", ""),
    ("×", "と"),
    ("&", "と"),
    ("→", " "),
]
text_clean = df_org["text"]
for old, new in text_replacements:
    text_clean = text_clean.str.replace(old, new, regex=False)
df_org["text"] = text_clean

# Remove duplicate rows, then keep only rows carrying one of the three labels.
df_drop = df_org.drop_duplicates()
df_drop = df_drop[df_drop["sentiment"].isin(["ネガティブ", "ニュートラル", "ポジティブ"])]

In [116]:
# Number of non-null texts labelled positive.
df_drop.loc[df_drop["sentiment"] == "ポジティブ", "text"].count()

404