# 準備

In [1]:
# One-time environment setup. Every install line is intentionally commented out;
# uncomment the ones you need when preparing a fresh environment.
# !pip install gensim
# !pip install SudachiPy==0.5.4 ginza==4.0.6 ja-ginza==4.0.0 
# !pip install pytorch_lightning
# # !pip install emoji
# !pip install transformers[ja]
# !pip install sentencepiece

# !pip install fugashi
# !pip install ipadic

# Translation clients (only googletrans is imported by this notebook).
# !pip install deep_translator
# !pip install googletrans==4.0.0-rc1

# import

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import os
import re
import matplotlib.pyplot as plt
from glob import glob
%matplotlib inline

#one_hotベクトル
from sklearn.feature_extraction.text import TfidfVectorizer

#ベクトル化
from gensim.models import word2vec

#データ分割用
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
import sklearn

import torch

from transformers import AutoTokenizer
from transformers import BertJapaneseTokenizer, AutoModelForMaskedLM, BertModel

# 翻訳
# -*- coding: utf-8
from googletrans import Translator

# データの読み込み

In [3]:
# List every Excel workbook sitting in the current working directory.
xlsx_pattern = "*.xlsx"
files = glob(xlsx_pattern)
files

['vocal_商品性情報.xlsx', '類似データ.xlsx', 'tqnet情報.xlsx']

In [4]:
# Select the workbooks by file name instead of by position in `files`:
# glob() returns paths in arbitrary, platform-dependent order, so files[0] and
# files[2] are not guaranteed to be the vocal and tqnet workbooks.
vocal_xlsx = "vocal_商品性情報.xlsx"
tqnet_xlsx = "tqnet情報.xlsx"

# Keep only the three columns used downstream. `.copy()` makes the subset an
# independent frame so later column assignments do not act on a slice of the
# raw sheet.
vocal_df = pd.read_excel(vocal_xlsx)[['time', 'Unnamed: 1', 'Unnamed: 2']].copy()
vocal_df.columns = ["time", "text", "car_name"]

# The tqnet sheet is renamed positionally: first column is the car name,
# second column is the free-text report.
tqnet_df = pd.read_excel(tqnet_xlsx)
tqnet_df.columns = ["car_name", "text"]

In [5]:
# Display the loaded tqnet table; swap in the commented-out line below to
# inspect vocal_df instead.
tqnet_df
# vocal_df

Unnamed: 0,car_name,text
0,ｼｴﾝﾀ,運転席に乗り込み内側からドアを閉めようとすると、手をかける場所が 上のほうにあるので非常に閉...
1,ﾙｰﾐｰ,走行中、加速した時エンジンの音が大きく感じて、段差などの時、カラカラと金属音がする。
2,ﾉｱ,シフトノブ部分の下側に隙間があり小銭などが入ってしまったら取れなくなってしまう。実際にほかの...
3,ｸﾗｳﾝ,トランクを開けるスイッチの場所がわかりずらく押しにくい
4,RAV4,運転席、助手席のドア取っ手が前寄りについているためドアを閉めるのが重い、強風が吹いたと\nき...
5,a,運転席のドアが閉めづらい 運転席のドア内側の手をかける部分が高い位置の為ドアを閉めづらいそう...
6,b,運転席のドアが閉めづらい 運転席のドア内側の手をかける部分が高い位置の為ドアを閉めづらいそう...
7,c,運転席ドアが閉めづらい 運転席のドア内側の手をかける部分が高い位置の為ドアを閉めづらいそうです。


# データクリーニング
## vocal (X)

In [6]:
# `regex=` is passed explicitly on every call: pandas >= 2.0 defaults to
# regex=False and raises ValueError when handed a compiled pattern, so the
# patterns are given as raw strings together with regex=True.

# Flatten line breaks so each record is a single line.
vocal_df["text"] = vocal_df["text"].str.replace("\n", " ", regex=False)
# Drop full-width parenthesised annotations of the form （word）.
vocal_df["text"] = vocal_df["text"].str.replace(r"（\w+）", "", regex=True)
# Turn Japanese full stops and commas into spaces.
vocal_df["text"] = vocal_df["text"].str.replace(r"[。、]", " ", regex=True)
# Work on at most the first 100 rows.
vocal_df = vocal_df.iloc[0:100, :]
vocal_df.head(16)

Unnamed: 0,time,text,car_name
0,2022-12-26 00:00:00,運転席のドアが閉めづらい 運転席のドア内側の手をかける部分が高い位置の為ドアを閉めづらいそう...,シエンタ
1,2022-12-22 00:00:00,シフトノブ付近の小物入れが使いづらい シフトノブをパーキングに入れているとき小物入れとシフト...,ライズ
2,2022-12-22 00:00:00,助手席側のドアパネルとグローブボックスの位置が近い 助手席側のドア下のポケットに厚手の物入れ...,ヴォクシー
3,2022-12-22 00:00:00,加速時のエンジン音が大きい アクセルを深く踏んだ時のエンジン音が大きい 特に力強く加速した際...,ノア
4,2022-12-22 00:00:00,バックドアの開閉操作がやや使いづらい バックドアが開く時に上から手で押すとその位置で止まるが...,ノア
5,2022-12-22 00:00:00,運転席　助手席降りる時開閉しづらい フロントドアトリムボードのドアノブの位置が高すぎて開閉し...,シエンタ
6,2022-12-21 00:00:00,トランクを開けるスイッチの位置がわかりにくい トランクを外から開けるスイッチの位置が車の中...,クラウン
7,2022-12-17 00:00:00,メータ下にカードが入ってしまう メータの所にカードを置いたら下に入って取れなくなった 物...,シエンタ
8,2022-12-03 00:00:00,シフトノブ部に物を落としてしまいそうな位隙間がある シフトポジションをPレンジにした時にシ...,ヴォクシー
9,2022-11-30 00:00:00,バックドアが重くて開けにくい 買い物帰りに荷物を積む時 バックドアが重くて大変とのご指摘を受...,ノア


## tqnet (Y)

In [7]:
# Normalise the tqnet free text. Literal substitutions use regex=False and the
# two pattern substitutions use raw strings with regex=True, so the chain
# behaves the same on pandas 1.x and 2.x (where regex defaults to False and a
# compiled pattern would raise ValueError).
tqnet_df["text"] = (
    tqnet_df["text"]
    .str.replace("\n", " ", regex=False)          # flatten line breaks
    .str.replace("×", "と", regex=False)           # "×" / "&" both read as "and"
    .str.replace("&", "と", regex=False)
    .str.replace("。", "", regex=False)            # drop sentence-final full stops
    .str.replace("→", " ", regex=False)
    .str.replace("�", "", regex=False)            # remove replacement characters
    .str.replace("販売店指摘）", "", regex=False)
    .str.replace(r"（\w+）", "", regex=True)        # full-width parenthesised annotations
    .str.replace(r"【第\w+報】", "", regex=True)     # report-number tags such as 【第N報】
)

# テキストベクトル化(辞書なし検討) - Tokenizer活用
## X_label creation (768次元)

In [8]:
# Load the tokenizer and the encoder weights for the chosen checkpoint.
model_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Use the GPU when one is available, otherwise stay on the CPU. The original
# unconditional model.cuda() raised on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 日本語を英語に変換

In [9]:
def _translate_records(texts, translator):
    """Translate each string in ``texts`` and return ``{"text": ...}`` records.

    Args:
        texts: Iterable of source strings (here, a DataFrame "text" column).
        translator: A ``googletrans.Translator`` reused for every request.

    Returns:
        list[dict]: One ``{"text": translated_string}`` record per input, in
        input order.
    """
    # dest="en" is spelled out because the encoder loaded in this notebook is
    # an English checkpoint; the source language is still auto-detected.
    return [{"text": translator.translate(text, dest="en").text} for text in texts]


# One shared client for all rows; the original built a new Translator() per row.
translator = Translator()

d_t = _translate_records(tqnet_df["text"], translator)
# columns=["text"] keeps the column present even when there are no rows.
new_tqnet_df = pd.DataFrame(d_t, columns=["text"])

d_v = _translate_records(vocal_df["text"], translator)
new_vocal_df = pd.DataFrame(d_v, columns=["text"])

In [10]:
# Number of translated vocal records.
new_vocal_df["text"].shape[0]

15

In [11]:
# Preview the first 12 translated vocal records.
new_vocal_df.iloc[:12]

Unnamed: 0,text
0,It is difficult to close the door because the ...
1,When a shift knob is difficult to use near the...
2,The door panel on the passenger side and the g...
3,The engine sound when the engine sound is loud...
4,When the backdoor opening and closing operatio...
5,The driver's passenger seat It is difficult to...
6,Opening the trunk. The position of the switch ...
7,If you put the card in the meter where the car...
8,When a shift position that is likely to drop t...
9,The backdoor is heavy and it is difficult to o...


In [12]:
# Preview the first 5 translated tqnet records.
new_tqnet_df.iloc[:5]

Unnamed: 0,text
0,When I got into the driver's seat and tried to...
1,"While driving, the sound of the engine feels l..."
2,If there is a gap on the lower side of the shi...
3,The location of the switch that opens the trun...
4,The driver's seat and the passenger seat doors...


In [13]:
# Encode every translated vocal record into one mean-pooled sentence vector.
max_length = 256

x_dataset = []
# Iterate over the translated frame itself. The original bounded the loop by
# len(vocal_df) while indexing new_vocal_df, which raises IndexError or drops
# rows as soon as the two frames differ in length.
for raw_text in new_vocal_df.iloc[:, 0]:
    # Normalise any line-break style to "\n" before tokenising.
    text = '\n'.join(raw_text.splitlines())
    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    # Follow whatever device the model lives on instead of hard-coding CUDA.
    encoding = {k: v.to(model.device) for k, v in encoding.items()}
    attention_mask = encoding['attention_mask']

    # Sentence vector = mean of the last hidden states over real (non-padding)
    # tokens; the attention mask zeroes out the padded positions.
    with torch.no_grad():
        output = model(**encoding)
        last_hidden_state = output.last_hidden_state
        masked_sum = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(1)
        averaged_hidden_state = masked_sum / attention_mask.sum(1, keepdim=True)

    # One sequence per call, so row 0 is the sentence vector.
    x_dataset.append(averaged_hidden_state[0].cpu().numpy())

# Stack the per-record vectors into a single 2-D numpy array.
x_dataset = np.vstack(x_dataset)

## Y_label creation

In [14]:
# Encode every translated tqnet record into one mean-pooled sentence vector.
max_length = 256

y_dataset = []
# Iterate over the translated frame itself. The original bounded the loop by
# len(tqnet_df) while indexing new_tqnet_df, which raises IndexError or drops
# rows as soon as the two frames differ in length.
for raw_text in new_tqnet_df.iloc[:, 0]:
    # Normalise any line-break style to "\n" before tokenising.
    text = '\n'.join(raw_text.splitlines())
    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    # Follow whatever device the model lives on instead of hard-coding CUDA.
    encoding = {k: v.to(model.device) for k, v in encoding.items()}
    attention_mask = encoding['attention_mask']

    # Sentence vector = mean of the last hidden states over real (non-padding)
    # tokens; the attention mask zeroes out the padded positions.
    with torch.no_grad():
        output = model(**encoding)
        last_hidden_state = output.last_hidden_state
        masked_sum = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(1)
        averaged_hidden_state = masked_sum / attention_mask.sum(1, keepdim=True)

    # One sequence per call, so row 0 is the sentence vector.
    y_dataset.append(averaged_hidden_state[0].cpu().numpy())

# Stack the per-record vectors into a single 2-D numpy array.
y_dataset = np.vstack(y_dataset)

# ベクトル間の類似度計算 (コサイン類似度)

In [15]:
# Sanity check: show the (rows, dims) shape of both embedding matrices.
print(x_dataset.shape, y_dataset.shape, sep="\n")

(15, 768)
(8, 768)


In [16]:
# Import the function from its submodule explicitly: a bare `import sklearn`
# does not guarantee that `sklearn.metrics` has been loaded, so the attribute
# chain only worked because other imports happened to pull it in.
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity matrix: one row per x_dataset vector, one column per
# y_dataset vector.
data = cosine_similarity(x_dataset, y_dataset)

In [24]:
# Wrap the similarity matrix in a DataFrame so rows and columns can be labelled.
df = pd.DataFrame(data=data)

In [25]:
# Short topic labels for the similarity columns, in column order.
tqnet_topic_labels = [
    "Frドア重い",
    "加速音大きい",
    "シフト隙間",
    "SW場所不明確",
    "ドア重い",
    "new",
    "new+1",
    "new-1",
]
df.columns = tqnet_topic_labels

In [29]:
# Short topic labels for the similarity rows, in row order, stored as an
# extra column.
vocal_topic_labels = [
    "Frドア重い",
    "シフト使用性",
    "グローブボックス位置",
    "加速音大きい",
    "バックドア操作",
    "Frドア開閉し難い",
    "SW場所不明確",
    "メータ隙間",
    "シフト隙間",
    "バックドア重い",
    "バックドア軽い",
    "Frドア操作性",
    "スーパー外れ値",
    "マルチメディア関係",
    "Frドア重い(少な目)",
]
df["Vocal/TQ-NET"] = vocal_topic_labels

In [30]:
# Disabled: would move the label column to the front. NOTE(review): the two
# lines spell the column differently ("Vocal/TQ-NET" vs "Vocal / TQ-NET"), so
# enabling them would also rename the column.
# first_column = df.pop("Vocal/TQ-NET")
# df.insert(0,"Vocal / TQ-NET",first_column)
# Display the labelled similarity table.
df 

Unnamed: 0,Frドア重い,加速音大きい,シフト隙間,SW場所不明確,ドア重い,new,new+1,new-1,Vocal/TQ-NET
0,0.910138,0.843165,0.865729,0.865214,0.923575,1.0,1.0,0.965061,Frドア重い
1,0.89129,0.883249,0.933996,0.894736,0.906452,0.864343,0.864343,0.838204,シフト使用性
2,0.918359,0.854998,0.929188,0.877602,0.92661,0.893869,0.893869,0.873375,グローブボックス位置
3,0.809757,0.899804,0.837084,0.825127,0.843486,0.81086,0.81086,0.798735,加速音大きい
4,0.900627,0.879891,0.918031,0.901687,0.911724,0.884457,0.884457,0.849258,バックドア操作
5,0.925488,0.868314,0.901501,0.89044,0.943052,0.937123,0.937123,0.911243,Frドア開閉し難い
6,0.875126,0.85659,0.872588,0.936097,0.874858,0.86965,0.86965,0.843431,SW場所不明確
7,0.890542,0.840736,0.918781,0.887858,0.89264,0.8454,0.8454,0.82333,メータ隙間
8,0.846455,0.851058,0.91007,0.859277,0.896344,0.828202,0.828202,0.817588,シフト隙間
9,0.897792,0.857118,0.906617,0.888839,0.913156,0.878002,0.878002,0.845101,バックドア重い


In [31]:
# Render the similarity table as a heat map using pandas' default gradient settings.
similarity_styler = df.style
similarity_styler.background_gradient()

Unnamed: 0,Frドア重い,加速音大きい,シフト隙間,SW場所不明確,ドア重い,new,new+1,new-1,Vocal/TQ-NET
0,0.910138,0.843165,0.865729,0.865214,0.923575,1.0,1.0,0.965061,Frドア重い
1,0.89129,0.883249,0.933996,0.894736,0.906452,0.864343,0.864343,0.838204,シフト使用性
2,0.918359,0.854998,0.929188,0.877602,0.92661,0.893869,0.893869,0.873375,グローブボックス位置
3,0.809757,0.899804,0.837084,0.825127,0.843486,0.81086,0.81086,0.798735,加速音大きい
4,0.900627,0.879891,0.918031,0.901687,0.911724,0.884457,0.884457,0.849258,バックドア操作
5,0.925488,0.868314,0.901501,0.89044,0.943052,0.937123,0.937123,0.911243,Frドア開閉し難い
6,0.875126,0.85659,0.872588,0.936097,0.874858,0.86965,0.86965,0.843431,SW場所不明確
7,0.890542,0.840736,0.918781,0.887858,0.89264,0.8454,0.8454,0.82333,メータ隙間
8,0.846455,0.851058,0.91007,0.859277,0.896344,0.828202,0.828202,0.817588,シフト隙間
9,0.897792,0.857118,0.906617,0.888839,0.913156,0.878002,0.878002,0.845101,バックドア重い
