In [1]:
!pip install --upgrade accelerate transformers

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.19.1-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4

In [2]:
from transformers import AutoTokenizer, AutoModel, DistilBertModel, AutoModelForMaskedLM
import torch
import numpy as np
import pandas as pd

In [3]:
# medical words
med_words = ["diabetes", "emphysema", "hypertension", "arthritis", "asthma",
             "bronchitis", "insomnia", "migraine", "anemia", "colitis",
             "pneumonia", "hepatitis", "thyroiditis", "leukemia", "eczema",
             "myocarditis", "parkinsonism", "diarrhea", "depression", "cholesterol"]

# sport-related words
sport_words = ["football", "basketball", "soccer", "tennis", "baseball",
               "volleyball", "swimming", "golf", "hockey", "cycling",
               "boxing", "running", "skiing", "surfing", "karate",
               "badminton", "rugby", "hiking", "sailing", "racquetball"]

# financial words
# fin_words = ["investment", "portfolio", "dividend", "equity", "liability",
#              "asset", "credit", "debt", "interest", "income",
#              "savings", "budget", "cash flow", "stock", "bond",
#              "capital", "mortgage", "inflation", "taxation", "retirement"]

In [4]:
med_categories = ["medical"] * len(med_words)
sport_categories = ["sport"] * len(sport_words)
# fin_categories = ["financial"] * len(fin_words)

In [5]:
# all = med_words + sport_words + fin_words
# print(len(all))
# print(all)

In [6]:
# categories = ["medical"] * len(med_words) + ["sport"] * len(sport_words) + ["financial"] * len(fin_words)
# print(len(categories))
# print(categories)

In [7]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [8]:
med_tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
med_model = AutoModel.from_pretrained("medicalai/ClinicalBERT")

Downloading (…)okenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/542M [00:00<?, ?B/s]

In [9]:
sport_tokenizer = AutoTokenizer.from_pretrained("microsoft/SportsBERT")
sport_model = AutoModelForMaskedLM.from_pretrained("microsoft/SportsBERT")

Downloading (…)okenizer_config.json:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/66.0k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/375M [00:00<?, ?B/s]

In [10]:
# fin_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
# fin_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [11]:
med_embeddings_cb = []

for i in range(len(med_words)):
  input_ids = torch.tensor(med_tokenizer.encode(med_words[i])).unsqueeze(0)
  outputs = med_model(input_ids)
  last_hidden_states = outputs[0]
  # skip the first and last tokens, which are the [CLS] and [SEP] tokens
  # take the mean of other tokens that form the word
  med_embeddings_cb.append(torch.mean(last_hidden_states[0][1:-1], dim = 0).tolist())

In [12]:
med_embeddings_cb = [list(np.around(np.array(e), 3)) for e in med_embeddings_cb]

with open("med_output_cb.txt", "w") as text_file:
  for i in range(len(med_words)):
    class_str = f"{med_words[i]}: {med_embeddings_cb[i][:10]}\n"
    text_file.write(class_str)

In [13]:
med_df_cb = pd.DataFrame(med_embeddings_cb)
med_df_cb.insert(0, "word", med_words)
med_df_cb.insert(1, "category", med_categories)

print(med_df_cb.shape)
med_df_cb.head()

(20, 770)


Unnamed: 0,word,category,0,1,2,3,4,5,6,7,...,758,759,760,761,762,763,764,765,766,767
0,diabetes,medical,-0.013,-0.148,0.75,0.04,0.421,0.093,-0.329,-0.651,...,-0.253,0.082,-0.607,-0.326,0.369,0.101,0.6,0.222,0.114,-0.271
1,emphysema,medical,-0.305,-0.311,0.671,-0.16,0.393,-0.436,-0.036,-0.227,...,-0.124,0.292,-0.996,-0.462,0.124,-0.673,0.352,0.181,0.534,-0.305
2,hypertension,medical,-0.828,-0.357,1.108,0.082,0.249,0.187,-0.505,-0.423,...,-0.248,0.38,-0.717,-0.674,0.401,-0.031,-0.303,-0.18,0.395,-0.07
3,arthritis,medical,-0.592,-0.158,0.564,0.23,0.109,-0.454,-0.058,-0.027,...,0.146,0.235,-0.448,-0.809,-0.058,-0.238,-0.227,0.189,0.575,0.104
4,asthma,medical,-0.311,-0.378,1.083,0.297,0.16,-0.172,-0.063,-0.349,...,-0.487,0.394,-0.721,-0.454,0.353,-0.381,0.543,0.041,0.651,-0.449


In [14]:
med_embeddings_db = []

for i in range(len(med_words)):
  input_ids = torch.tensor(tokenizer.encode(med_words[i])).unsqueeze(0)
  outputs = model(input_ids)
  last_hidden_states = outputs[0]
  # skip the first and last tokens, which are the [CLS] and [SEP] tokens
  # take the mean of other tokens that form the word
  med_embeddings_db.append(torch.mean(last_hidden_states[0][1:-1], dim = 0).tolist())

In [15]:
med_embeddings_db = [list(np.around(np.array(e), 3)) for e in med_embeddings_db]

with open("med_output_db.txt", "w") as text_file:
  for i in range(len(med_words)):
    class_str = f"{med_words[i]}: {med_embeddings_db[i][:10]}\n"
    text_file.write(class_str)

In [16]:
med_df_db = pd.DataFrame(med_embeddings_db)
med_df_db.insert(0, "word", med_words)
med_df_db.insert(1, "category", med_categories)

print(med_df_db.shape)
med_df_db.head()

(20, 770)


Unnamed: 0,word,category,0,1,2,3,4,5,6,7,...,758,759,760,761,762,763,764,765,766,767
0,diabetes,medical,0.32,0.337,-0.303,0.003,0.45,-0.149,0.197,0.113,...,0.116,-0.251,-0.073,-0.292,0.252,-0.101,-0.117,-0.013,0.198,-0.237
1,emphysema,medical,-0.012,-0.219,-0.043,-0.04,0.996,-0.037,0.031,0.492,...,-0.364,-0.097,0.439,-0.146,0.177,-0.02,-0.423,-0.056,-0.008,-0.047
2,hypertension,medical,-0.5,0.091,0.191,-0.123,0.777,0.123,0.083,0.825,...,0.12,-0.543,0.302,-0.45,0.197,0.149,-0.745,-0.186,0.193,-0.055
3,arthritis,medical,0.051,0.436,-0.281,-0.087,0.127,0.104,-0.037,0.295,...,0.249,-0.097,0.075,-0.187,0.419,-0.194,0.124,0.141,-0.002,0.113
4,asthma,medical,0.208,0.388,-0.265,0.043,0.163,0.008,0.038,0.227,...,0.446,0.026,-0.138,-0.046,0.293,-0.179,0.206,0.149,0.055,0.07


In [17]:
sport_embeddings_sb = []

for i in range(len(sport_words)):
  input_ids = torch.tensor(sport_tokenizer.encode(sport_words[i])).unsqueeze(0)
  outputs = sport_model(input_ids)
  last_hidden_states = outputs[0]
  # skip the first and last tokens, which are the [CLS] and [SEP] tokens
  # take the mean of other tokens that form the word
  sport_embeddings_sb.append(torch.mean(last_hidden_states[0][1:-1], dim = 0).tolist())

In [18]:
sport_embeddings_sb = [list(np.around(np.array(e), 3)) for e in sport_embeddings_sb]

with open("sport_output_sb.txt", "w") as text_file:
  for i in range(len(sport_words)):
    class_str = f"{sport_words[i]}: {sport_embeddings_sb[i][:10]}\n"
    text_file.write(class_str)

In [19]:
sport_df_sb = pd.DataFrame(sport_embeddings_sb)
sport_df_sb.insert(0, "word", sport_words)
sport_df_sb.insert(1, "category", sport_categories)

print(sport_df_sb.shape)
sport_df_sb.head()

(20, 10002)


Unnamed: 0,word,category,0,1,2,3,4,5,6,7,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,football,sport,-1.948,-0.865,-6.089,-0.201,-2.4,4.907,6.27,2.936,...,-4.922,-8.663,1.807,-4.591,-6.369,-2.7,-0.358,-3.598,2.455,-10.656
1,basketball,sport,-3.137,-2.396,-5.128,1.092,-2.133,3.066,5.326,0.041,...,-5.47,-8.39,4.7,-6.369,-2.474,-0.731,-3.035,-6.114,1.293,-6.965
2,soccer,sport,-2.951,-2.38,-4.738,0.32,-2.039,4.172,6.484,4.026,...,-1.632,-8.689,0.114,-8.35,-4.481,-0.579,-0.565,-0.743,0.871,-6.306
3,tennis,sport,-3.871,-2.732,-5.3,0.25,-2.01,4.822,5.755,1.999,...,-1.204,-7.702,2.112,-6.668,-1.856,1.856,0.901,-2.931,0.918,-5.039
4,baseball,sport,-2.043,-3.02,-6.069,0.399,-1.973,3.128,6.965,2.381,...,-3.671,-10.257,1.09,-7.666,-8.911,0.733,-1.52,-6.591,-1.915,-6.58


In [20]:
sport_embeddings_db = []

for i in range(len(sport_words)):
  input_ids = torch.tensor(tokenizer.encode(sport_words[i])).unsqueeze(0)
  outputs = model(input_ids)
  last_hidden_states = outputs[0]
  # skip the first and last tokens, which are the [CLS] and [SEP] tokens
  # take the mean of other tokens that form the word
  sport_embeddings_db.append(torch.mean(last_hidden_states[0][1:-1], dim = 0).tolist())

In [21]:
sport_embeddings_db = [list(np.around(np.array(e), 3)) for e in sport_embeddings_db]

with open("sport_output_db.txt", "w") as text_file:
  for i in range(len(sport_words)):
    class_str = f"{sport_words[i]}: {sport_embeddings_db[i][:10]}\n"
    text_file.write(class_str)

In [22]:
sport_df_db = pd.DataFrame(sport_embeddings_db)
sport_df_db.insert(0, "word", sport_words)
sport_df_db.insert(1, "category", sport_categories)

print(sport_df_db.shape)
sport_df_db.head()

(20, 770)


Unnamed: 0,word,category,0,1,2,3,4,5,6,7,...,758,759,760,761,762,763,764,765,766,767
0,football,sport,-0.18,0.317,-0.691,-0.526,0.257,-0.006,0.087,0.518,...,0.433,-0.328,0.572,-0.074,0.233,-0.065,-0.016,-0.086,0.677,-0.12
1,basketball,sport,0.715,0.159,-0.648,-0.461,0.322,0.108,-0.032,0.255,...,0.183,-0.096,0.278,0.19,0.162,0.07,-0.137,-0.032,0.718,-0.731
2,soccer,sport,0.074,0.133,-0.358,-0.299,0.154,0.074,0.105,0.069,...,0.759,-0.186,0.386,0.051,0.356,-0.332,-0.118,-0.177,0.664,-0.211
3,tennis,sport,-0.086,0.041,-0.049,-0.304,0.574,0.097,0.238,-0.078,...,0.503,-0.014,0.42,-0.055,0.213,-0.439,-0.265,0.033,0.536,-0.492
4,baseball,sport,-0.126,0.158,-0.617,-0.249,0.133,-0.34,0.036,0.242,...,0.301,-0.192,0.307,0.256,0.34,-0.4,0.077,0.241,0.719,-0.293
