In [2]:
!pip install transformers
!pip install fugashi
!pip install unidic-lite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://us

In [3]:
# Google Driveをマウント
from google.colab import drive
drive.mount('/content/drive')

# ファイルを/content配下にコピー
!cp "/content/drive/MyDrive/train-v1.1.json" "/content/"
!cp "/content/drive/MyDrive/valid-v1.1.json" "/content/"

# データ読み込み
import json

def load_jsts_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f.readlines()]
    return data

train_data_path = "/content/train-v1.1.json"
valid_data_path = "/content/valid-v1.1.json"

train_data = load_jsts_data(train_data_path)
valid_data = load_jsts_data(valid_data_path)

# 課題1
import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr, spearmanr

def get_sentence_embedding(sentence, tokenizer, model):
    inputs = tokenizer(sentence, return_tensors="pt")
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state
    return torch.mean(embeddings, dim=1).squeeze()

tokenizer = BertTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v2")
model = BertModel.from_pretrained("cl-tohoku/bert-base-japanese-v2")
valid_data = load_jsts_data(valid_data_path)

true_similarities = []
predicted_similarities = []

for item in valid_data:
    sent1 = item["sentence1"]
    sent2 = item["sentence2"]
    label = float(item["label"])

    embedding1 = get_sentence_embedding(sent1, tokenizer, model)
    embedding2 = get_sentence_embedding(sent2, tokenizer, model)

    cosine_similarity = 1 - cosine(embedding1.detach().numpy(), embedding2.detach().numpy())
    scaled_similarity = cosine_similarity * 5

    true_similarities.append(label)
    predicted_similarities.append(scaled_similarity)

# Pearson相関係数とSpearman相関係数を計算
pearson_corr = pearsonr(true_similarities, predicted_similarities)
spearman_corr = spearmanr(true_similarities, predicted_similarities)

print("Pearson correlation:", pearson_corr)
print("Spearman correlation:", spearman_corr)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/236k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/517 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.


Downloading pytorch_model.bin:   0%|          | 0.00/447M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Pearson correlation: PearsonRResult(statistic=0.6394918528543447, pvalue=2.3489912319898795e-168)
Spearman correlation: SignificanceResult(statistic=0.6321073119018878, pvalue=2.2699922828499695e-163)


In [4]:
# 評価指標を指定されたフォーマットで出力
with open("evaluation_scores.jsonl", "w") as f:
    f.write(json.dumps({"metrics": "Pearson correlation", "score": pearson_corr}) + "\n")
    f.write(json.dumps({"metrics": "Spearman correlation", "score": spearman_corr}) + "\n")