In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
#!pip install -U sentence-transformers

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

import torch
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
train_path = "/content/drive/MyDrive/清大研究所/碩一/Data mining/Final Project/Dataset/stage1/"
test_path = "/content/drive/MyDrive/清大研究所/碩一/Data mining/Final Project/Dataset/stage2/"

train_txtpath = "dataTrainComplete/"
test_txtpath = "dataPublicComplete/"

In [5]:
# Read Data
def read_text(path, txtpath):
  # 1. txt
  # Construct an empty dataframe to store txt data
  df = pd.DataFrame(columns = ["ID", "text"])
  k = 0
  # Read txt and Store into df
  for i in range(1402):
    text = []
    txt_name = str(i) + ".txt"
    txt_path = path + txtpath + txt_name
    try:
      f = open(txt_path, 'r')
      text = f.read()
      f.close
      df.loc[k, 'ID'] = i
      df.loc[k, 'text'] = text
      k = k + 1
    except:
      pass
  # Change to array
  arr_df = np.array(df["text"])

  
  # 2. Keyword.xlsx
  # Read excel
  keyword_1 = pd.read_excel(path+"Keywords/02chem.list.xlsx", header=None)
  keyword_2 = pd.read_excel(path+"Keywords/02crop.list.xlsx", header=None)
  keyword_3 = pd.read_excel(path+"Keywords/02pest.list.xlsx", header=None)
  # Merge them
  frames = [keyword_1, keyword_2, keyword_3]
  keyword = pd.concat(frames, axis=0)

  # 3. Train Label.csv
  # Test data does not have label
  try: 
    label_path = path + "TrainLabel.csv"
    label = pd.read_csv(label_path)
  except:
    pass

  # 4. Submission.csv
  submission = pd.read_csv(path+"submission_example.csv")

  return df, arr_df, keyword, submission

In [6]:
#np.save('/content/drive/MyDrive/清大研究所/碩一/Data mining/Final Project/train_df.npy', arr_train_df)

In [7]:
def BERT_model(arr_df):
  # Initialize model and tokenizer:
  tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
  model = AutoModel.from_pretrained('bert-base-chinese')

  # Initialize dictionary to store tokenized sentences
  tokens = {'input_ids': [], 'attention_mask': []}

  # Encode each acticle and append to dictionary
  for sentence in arr_df:
      new_tokens = tokenizer.encode_plus(sentence, max_length=128,
                                        truncation=True, padding='max_length',
                                        return_tensors='pt')
      tokens['input_ids'].append(new_tokens['input_ids'][0])
      tokens['attention_mask'].append(new_tokens['attention_mask'][0])

  # Reformat list of tensors into single tensor
  tokens['input_ids'] = torch.stack(tokens['input_ids'])
  tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

  # Process these tokens through our model
  outputs = model(**tokens)
  print("outputs.keys(): ", outputs.keys())

  # last_hidden_state
  embeddings = outputs.last_hidden_state
  print("embeddings.shape: ", embeddings.shape)

  # Resize our attention_mask tensor
  attention_mask = tokens['attention_mask']
  print("attention_mask.shape: ", attention_mask.shape)
  mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()

  masked_embeddings = embeddings * mask
  print("masked_embeddings.shape: ", masked_embeddings.shape)

  # Sum the remained of the embeddings along axis 1
  # Sum the number of values that must be given attention in each position of the tensor
  summed = torch.sum(masked_embeddings, 1)
  summed_mask = torch.clamp(mask.sum(1), min=1e-9)

  mean_pooled = summed / summed_mask
  mean_pooled = mean_pooled.detach().numpy()
  return mean_pooled

In [8]:
def prediction(mean_pooled, df, submission):
  # Convert from PyTorch tensor to numpy array
  m = 0
  for i in range(len(mean_pooled)-1):
    print(i, df["ID"][i])
    prob = cosine_similarity(
        [mean_pooled[i]],
        mean_pooled[i+1:]
    )
    print(prob)
  
    for j in range(len(prob[0])):
      if(prob[0][j] > 0.98):
        submission.loc[m, "Test"] = df["ID"][i]
        submission.loc[m, "Reference"] = df["ID"][i+1+j]
        m = m + 1
  
  return submission

In [9]:
df, arr_df, keyword, submission = read_text(train_path, train_txtpath)

In [10]:
# 測試用 因為本身RAM不夠且實驗室server沒有sudo權限
arr_df = np.array(df["text"])
arr_df = arr_df[:100]

In [11]:
mean_pooled = BERT_model(arr_df)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


outputs.keys():  odict_keys(['last_hidden_state', 'pooler_output'])
embeddings.shape:  torch.Size([100, 128, 768])
attention_mask.shape:  torch.Size([100, 128])
masked_embeddings.shape:  torch.Size([100, 128, 768])


In [12]:
submission = prediction(mean_pooled, df, submission)

0 1
[[0.9618845  0.9560703  0.94613886 0.950903   0.9544353  0.93537986
  0.9423306  0.9486482  0.927534   0.93987095 0.938343   0.92432123
  0.9436401  0.9146975  0.9262198  0.9369471  0.9448707  0.94224346
  0.93186074 0.93980825 0.92498624 0.92843586 0.92467946 0.93237966
  0.9312872  0.93199563 0.941846   0.92510986 0.9314351  0.9274205
  0.93519735 0.935906   0.946427   0.94441473 0.93605256 0.93507814
  0.9383271  0.93509674 0.94550896 0.93057346 0.94289064 0.9409363
  0.94840777 0.95142055 0.9364424  0.9266678  0.9340793  0.93420947
  0.93916833 0.9416158  0.95552194 0.9404     0.91713786 0.9585536
  0.94924545 0.9661418  0.96029687 0.9647063  0.95218176 0.958274
  0.9591611  0.9556331  0.9523432  0.95669603 0.95783013 0.95523524
  0.9557091  0.9604255  0.937842   0.953025   0.95546323 0.9585102
  0.95671844 0.94755137 0.96168596 0.96531    0.95246416 0.9437945
  0.93565595 0.93870735 0.9332411  0.9533344  0.94622886 0.9389845
  0.9533819  0.9583954  0.9346602  0.9450396  0.9506

In [13]:
label_path = train_path + "TrainLabel.csv"
label = pd.read_csv(label_path)

In [14]:
# Switch to Test > Reference
for i in range(len(label)):
  if int(label["Test"][i]) > int(label["Reference"][i]):
    m = label["Reference"][i]
    label.loc[i, "Reference"] = label["Test"][i]
    label.loc[i, "Test"] = m

In [15]:
# ACC
correct = 0
for m in range(len(submission)):
  testID = submission["Test"][m]
  referenceID = submission["Reference"][m]
  label_mask = label[label["Test"] == testID]
  label_mask = label_mask.reset_index(drop=True)
  for n in range(len(label_mask)):
    if label_mask["Test"][n] == testID:
      if label_mask["Reference"][n] == referenceID:
        correct = correct + 1

In [17]:
acc = correct/len(submission)