In [796]:
import numpy as np
import pandas as pd 
import os 
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, default_data_collator
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import get_scheduler
from torch.optim import AdamW
import torch
import tensorflow as tf
import tensorflow_hub as hub
from datasets import metric
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix
from transformers import BertTokenizerFast

#pip install einops
#package required for downloading transformer 
#https://huggingface.co/datasets/cnn_dailymail link to dataset




In [797]:
#Attempted to setup training to be done with GPU to speed up computation time of training model 
#Check if gpu is accessible

#Note when getting nvcc -V to get detection I had to make  a new variable path for my system environment 
#Path used v12.2\bin that needed to be added as system environment variable
#Ran bandwidth test to determine if system and cuda capable device can interact accordingly
#My system passed bandwidth test 

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))



Num GPUs Available:  1


In [798]:
dataset = load_dataset("cnn_dailymail",'3.0.0', split='train')
#Run loading script once located in same directory as python notebook
#Specify configuration version '3.0.0'
#load dataset using script because dataset is very large

In [799]:
df=pd.DataFrame(dataset)
#cast dataset into pandas dataframe object type 

In [800]:
df.head(10)

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a
5,"BAGHDAD, Iraq (CNN) -- Dressed in a Superman s...","Parents beam with pride, can't stop from smili...",a1ebb8bb4d370a1fdf28769206d572be60642d70
6,"BAGHDAD, Iraq (CNN) -- The women are too afrai...","Aid workers: Violence, increased cost of livin...",7c0e61ac829a3b3b653e2e3e7536cc4881d1f264
7,"BOGOTA, Colombia (CNN) -- A key rebel commande...",Tomas Medina Caracas was a fugitive from a U.S...,f0d73bdab711763e745cdc75850861c9018f235d
8,WASHINGTON (CNN) -- White House press secretar...,"President Bush says Tony Snow ""will battle can...",5e22bbfc7232418b8d2dd646b952e404df5bd048
9,(CNN) -- Police and FBI agents are investigati...,Empty anti-tank weapon turns up in front of Ne...,613d6311ec2c1985bd44707d1796d275452fe156


In [801]:
df.drop(columns=['article','id'], inplace=True)
#drop article colummn from dataset

In [802]:
df.head(10)

Unnamed: 0,highlights
0,Harry Potter star Daniel Radcliffe gets £20M f...
1,Mentally ill inmates in Miami are housed on th...
2,"NEW: ""I thought I was going to die,"" driver sa..."
3,"Five small polyps found during procedure; ""non..."
4,"NEW: NFL chief, Atlanta Falcons owner critical..."
5,"Parents beam with pride, can't stop from smili..."
6,"Aid workers: Violence, increased cost of livin..."
7,Tomas Medina Caracas was a fugitive from a U.S...
8,"President Bush says Tony Snow ""will battle can..."
9,Empty anti-tank weapon turns up in front of Ne...


In [803]:
#Download tokenizer associated with "bert-base-uncased"
#As some text inputs will be shorter than others we can use padding to make the inputs that are shorter a similar sequence length to inputs that are longer 


tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased",trust_remote_code=True, pad_token= '[PAD]', num_labels=155)

In [804]:
#load "bert-base_uncased" transformer and specify 'num_labels' parameter it should expect which we found length of highlights_labels_df
#length of highlights_labels_df which contains all binary labels for highlights_tokenied has 155 columns 
#specify 155 for 'num_labels' parameter in 'from_pretrained' method 


phi_model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=155)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [805]:
#Specify [SEP] Token to be used to seperate sentences so Bert understands input is made up of several sentences 

special_tokens = {
    'sep_token': '[SEP]'
}

In [806]:
tokenizer.add_special_tokens(special_tokens)

1

In [807]:
#Take 1000 examples from our dataframe to use for training
pretask_dataset=df.iloc[0:1000].copy()


#same procedure except new column name 'highlights_tokenized'
pretask_dataset['highlights_'+'tokenized']=pretask_dataset['highlights'].apply(lambda x: tokenizer.tokenize(x, truncation=True))


#peak at pretask_dataset
pretask_dataset.head(1)

#must use 'lambda x: ' function with tokenizer.tokenize(x) to apply tokenization to all rows 
#no error was outputted
    

Unnamed: 0,highlights,highlights_tokenized
0,Harry Potter star Daniel Radcliffe gets £20M f...,"[harry, potter, star, daniel, radcliffe, gets,..."


In [808]:
#Create new dataframe with only tokenized data of article and highlights 

columns_to_remove= ['highlights']
pretask_dataset_tokenized = pretask_dataset.drop(columns=columns_to_remove, axis=1, inplace=False).astype(str)
pretask_dataset_tokenized


Unnamed: 0,highlights_tokenized
0,"['harry', 'potter', 'star', 'daniel', 'radclif..."
1,"['mentally', 'ill', 'inmates', 'in', 'miami', ..."
2,"['new', ':', '""', 'i', 'thought', 'i', 'was', ..."
3,"['five', 'small', 'poly', '##ps', 'found', 'du..."
4,"['new', ':', 'nfl', 'chief', ',', 'atlanta', '..."
...,...
995,"['mother', 'of', 'murdered', 'school', '##boy'..."
996,"['new', ':', 'pope', 'benedict', 'xvi', 'arriv..."
997,"['eight', 'florida', 'teens', 'to', 'be', 'tri..."
998,"['judge', 'on', 'heather', 'mills', ':', 'leve..."


In [809]:
#Now work on creating pretask labels from keywords to align with "keyword presence" task

df_2=pd.read_csv('keywords.csv')

#make empty list
list1 = []

for word in df_2:
    for j in df_2[word]:
        list1.append(j)

print(list1)
    

['funny, laughter, joke, comedian', 'sad, upset, crying, hurt', 'happy, joy, smile', '\xa0thinking, question, learning, curious, mystery student, learning, school', 'celebration, party, birthday', 'scared, worried, concerned', 'angry, mad, upset, hatred, frustrated', 'shocked, OMG, surprise, exasperated', '\xa0computer, technology, social media', 'money, job, stocks', 'sleepy, tired, sleep, bed, night', 'gross, disgusting, throw up, puke, queasy', '\xa0dead, deathly, poison, fossil, afterlife', '\xa0good, great, ok, agreed', 'bad, no, disagree', '\xa0empowerment, movement', 'teamwork, agreement', '\xa0workout, sports, athlete, gym, muscle', 'religion, prayer, praise, beg', 'beauty, cosmetics, fashion', '\xa0baby, kid, child, young', '\xa0man', 'woman', ' elderly, old', 'love, relationship, significant other, marriage', 'construction, building', 'ocean, sea, ship, water', 'justice, law, government, politics', 'pet, animal, vet', 'nature, plants, farming, science, earth, recycle, go gree

In [810]:
characters_to_remove = '\xa0'

cleaned_list = [''.join(char for char in string if char not in characters_to_remove)for string in list1]

print(cleaned_list)

['funny, laughter, joke, comedian', 'sad, upset, crying, hurt', 'happy, joy, smile', 'thinking, question, learning, curious, mystery student, learning, school', 'celebration, party, birthday', 'scared, worried, concerned', 'angry, mad, upset, hatred, frustrated', 'shocked, OMG, surprise, exasperated', 'computer, technology, social media', 'money, job, stocks', 'sleepy, tired, sleep, bed, night', 'gross, disgusting, throw up, puke, queasy', 'dead, deathly, poison, fossil, afterlife', 'good, great, ok, agreed', 'bad, no, disagree', 'empowerment, movement', 'teamwork, agreement', 'workout, sports, athlete, gym, muscle', 'religion, prayer, praise, beg', 'beauty, cosmetics, fashion', 'baby, kid, child, young', 'man', 'woman', ' elderly, old', 'love, relationship, significant other, marriage', 'construction, building', 'ocean, sea, ship, water', 'justice, law, government, politics', 'pet, animal, vet', 'nature, plants, farming, science, earth, recycle, go green', 'music, musician, instrument

In [811]:
#create a list object with keywords that we will count the occurence of as a pretext task goal   

individual_words = []

for text in cleaned_list:
    words= text.split()
    individual_words.extend(words)
    
    
print(individual_words)

       

['funny,', 'laughter,', 'joke,', 'comedian', 'sad,', 'upset,', 'crying,', 'hurt', 'happy,', 'joy,', 'smile', 'thinking,', 'question,', 'learning,', 'curious,', 'mystery', 'student,', 'learning,', 'school', 'celebration,', 'party,', 'birthday', 'scared,', 'worried,', 'concerned', 'angry,', 'mad,', 'upset,', 'hatred,', 'frustrated', 'shocked,', 'OMG,', 'surprise,', 'exasperated', 'computer,', 'technology,', 'social', 'media', 'money,', 'job,', 'stocks', 'sleepy,', 'tired,', 'sleep,', 'bed,', 'night', 'gross,', 'disgusting,', 'throw', 'up,', 'puke,', 'queasy', 'dead,', 'deathly,', 'poison,', 'fossil,', 'afterlife', 'good,', 'great,', 'ok,', 'agreed', 'bad,', 'no,', 'disagree', 'empowerment,', 'movement', 'teamwork,', 'agreement', 'workout,', 'sports,', 'athlete,', 'gym,', 'muscle', 'religion,', 'prayer,', 'praise,', 'beg', 'beauty,', 'cosmetics,', 'fashion', 'baby,', 'kid,', 'child,', 'young', 'man', 'woman', 'elderly,', 'old', 'love,', 'relationship,', 'significant', 'other,', 'marriage'

In [812]:
#comma taken out of list and each word it's own value 

character_to_remove=','
cleaned_individual_words = [''.join(char for char in string if char not in character_to_remove)for string in individual_words]

print(cleaned_individual_words)


['funny', 'laughter', 'joke', 'comedian', 'sad', 'upset', 'crying', 'hurt', 'happy', 'joy', 'smile', 'thinking', 'question', 'learning', 'curious', 'mystery', 'student', 'learning', 'school', 'celebration', 'party', 'birthday', 'scared', 'worried', 'concerned', 'angry', 'mad', 'upset', 'hatred', 'frustrated', 'shocked', 'OMG', 'surprise', 'exasperated', 'computer', 'technology', 'social', 'media', 'money', 'job', 'stocks', 'sleepy', 'tired', 'sleep', 'bed', 'night', 'gross', 'disgusting', 'throw', 'up', 'puke', 'queasy', 'dead', 'deathly', 'poison', 'fossil', 'afterlife', 'good', 'great', 'ok', 'agreed', 'bad', 'no', 'disagree', 'empowerment', 'movement', 'teamwork', 'agreement', 'workout', 'sports', 'athlete', 'gym', 'muscle', 'religion', 'prayer', 'praise', 'beg', 'beauty', 'cosmetics', 'fashion', 'baby', 'kid', 'child', 'young', 'man', 'woman', 'elderly', 'old', 'love', 'relationship', 'significant', 'other', 'marriage', 'construction', 'building', 'ocean', 'sea', 'ship', 'water', '

In [813]:
#Now we must count occurences of keyword in summary column of our dataframe and then join these two dataframes together
#together these will be the labels of our dataset that is text article entries and their summaries so a total of two features 

#So now use 'highlights' column and count occurences of keywords saved above to make a new column consisting of keyword +'highlights'

highlights_labels_df=pd.DataFrame()

for keyword in cleaned_individual_words:
    highlights_keyword = pretask_dataset_tokenized['highlights_tokenized'].str.lower().str.contains(keyword.lower()).astype(int)
    
    highlights_keyword=highlights_keyword.rename('highlights_' + keyword)
    
    highlights_labels_df[highlights_keyword.name] = highlights_keyword    



  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_

In [814]:
#CHECKPOINT: Use row 198 to check if highlights datapoint for that row contains the word 'hurt'
pretask_dataset.iloc[198, 1]

['new',
 ':',
 'the',
 'united',
 'states',
 'is',
 'outraged',
 'by',
 'the',
 'attack',
 ',',
 'secretary',
 'of',
 'state',
 'rice',
 'says',
 '.',
 'car',
 'bomb',
 'strikes',
 'u',
 '.',
 's',
 '.',
 'embassy',
 'vehicle',
 'north',
 'of',
 'beirut',
 '.',
 'three',
 'lebanese',
 'civilians',
 'dead',
 ',',
 'american',
 'and',
 'lebanese',
 'officials',
 'confirm',
 '.',
 'driver',
 'of',
 'the',
 'vehicle',
 'was',
 'slightly',
 'injured',
 ',',
 'and',
 'the',
 'only',
 'passenger',
 'was',
 'not',
 'hurt',
 '.']

In [815]:
#Use mask to retreive rows which have the woord 'hurt' present in their highlights datapoint 

mask=highlights_labels_df['highlights_hurt']==1

examples_with_hurt_highlights=highlights_labels_df[mask]

#We see row 198 does have the keyword 'hurt' present so counting keyword occurences was successful
print("Rows with a value of 1")
examples_with_hurt_highlights


Rows with a value of 1


Unnamed: 0,highlights_funny,highlights_laughter,highlights_joke,highlights_comedian,highlights_sad,highlights_upset,highlights_crying,highlights_hurt,highlights_happy,highlights_joy,...,highlights_medicine,highlights_emergency,highlights_fire,highlights_firefighters,highlights_911,highlights_home,highlights_house,highlights_living,highlights_family,highlights_domestic
198,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
317,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
453,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
650,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
729,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
880,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [816]:
pretask_dataset_tokenized.head(1)

Unnamed: 0,highlights_tokenized
0,"['harry', 'potter', 'star', 'daniel', 'radclif..."


In [817]:
new_df= pd.merge(pretask_dataset_tokenized, highlights_labels_df, left_index=True, right_index=True)

new_df_2=pd.merge(new_df, highlights_labels_df, left_index=True, right_index=True)

In [818]:
raw_inputs=new_df['highlights_tokenized'].tolist()


inputs = tokenizer(raw_inputs,padding=True,truncation=True,return_tensors="pt")
print(inputs)

{'input_ids': tensor([[ 101, 1031, 1005,  ...,    0,    0,    0],
        [ 101, 1031, 1005,  ...,    0,    0,    0],
        [ 101, 1031, 1005,  ...,    0,    0,    0],
        ...,
        [ 101, 1031, 1005,  ...,    0,    0,    0],
        [ 101, 1031, 1005,  ...,    0,    0,    0],
        [ 101, 1031, 1005,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [819]:
#view tokenized input example

print(tokenizer.decode(inputs['input_ids'][0]))

[CLS] ['harry ','potter ','star ','daniel ','radcliffe ','gets ','£2 ','# # 0 ','# # m ','fortune ','as ','he ','turns ','18 ','monday ', '. ','young ','actor ','says ','he ','has ','no ','plans ','to ','fr ','# # itte ','# # r ','his ','cash ','away ', '. ','radcliffe ', "'",'s ','earnings ','from ','first ','five ','potter ','films ','have ','been ','held ','in ','trust ','fund ', '.'] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [820]:
from datasets import Dataset

In [823]:
from datasets import ClassLabel, Value
import torch

In [824]:
#store binary labels from new_df in new variable named 'labels_df' by dropping only column that doesn't count labels

#highlights_tokenized column contains summary of article text that is tokenized so we drop this column


labels_df=new_df.drop(columns='highlights_tokenized', axis=1)

In [825]:
#extract all values from labels_df and store into object variable named binary_labels
binary_labels=labels_df.values

In [826]:
#Make binary_labels a tensor object of dtype torch.int64 to be compatible for making a tensor dataset

tensor_labels=torch.tensor(binary_labels, dtype=torch.int64)

In [827]:
tensor_labels

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

In [828]:
#Checking datatype of input_ids key value

type(inputs['input_ids'])

torch.Tensor

In [829]:
from torch.utils.data import TensorDataset, DataLoader

In [830]:
#Create tensor dataset now using TensorDataset and passing each tensor object through 

dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'],  tensor_labels)

In [831]:
#make tensordataset named 'dataset' a huggingface compatible dataset by loading dataset using Dataset.from_dict
#Index into dataset starting from [0] then [1] then [2] which is the same order of how we made dataset above 
#Indexing like the above sentence into dataset retrieves the corresponding key values for the key names mentioned in the next comment
#Each key for dictionary needs to be in hugging face format so 'input_ids' & 'attention_mask' & 'labels' must be key names
#model recognizes appropriate input ids which are text examples that are tokenized 
#model recognizes appropriate attention_mask which is used to batch input sequence together 
#Convert each value returned from indexing into dataset as list to be compatible value for training process as
#training requires indexing into dictionary using key to retrieve value that IS NOT A DICTIONARY TYPE OBJECT 




hf_dataset = Dataset.from_dict({
    'input_ids': dataset.tensors[0].tolist(),
    'attention_mask': dataset.tensors[1].tolist(),
    'labels': dataset.tensors[2].tolist(),
})

#hf naming convention to say this object/variable with 'hf' in the name of the dataset is huggingface compatible now with the appropriate object type

In [832]:
#Format dataset to be compatible with pytorch
hf_dataset.set_format("torch")

In [833]:
from transformers import DataCollatorForTokenClassification

In [834]:
#Split dataset into training and test splits so hf_split_dataset contains a train dataset of 700 examples and a test dataset of 300 examples

hf_split_dataset=hf_dataset.train_test_split(test_size=0.3, seed=42)

In [835]:
#Make two objects, the first variable 'pretask_train' to store the training subset from the split 
pretask_train=hf_split_dataset['train']

#The latter variable 'pretask_eval' to store the test subset from the split
pretask_eval=hf_split_dataset['test']

In [836]:
#Shuffle new subsets from previous cell and select 'integer' within range of both respective subsets in previous cell 
#This takes even smaller subset of data (500 examples for training & 300 examples for evaluation) to finetune and evaluate on in order to speed up training process 

small_train_dataset=pretask_train.shuffle(seed=42).select(range(500))
small_eval_dataset=pretask_eval.shuffle(seed=42).select(range(300))

In [837]:
#Our eval dataset does contain 300 rows as we selected previously

print(small_eval_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 300
})


In [838]:
#Inspecting shape of attention mask 
small_eval_dataset['attention_mask'].shape

torch.Size([300, 406])

In [839]:
#Checking if GPU is detected by tensorflow for gpu acceleration setup 

print("TF version: ", tf.__version__)
print("TF Hub version: ", hub.__version__)

# Check for GPU availability
print("GPU", "available (YESSSSSS!!!!!!!)" if tf.config.list_physical_devices("GPU") else "not available :()")

TF version:  2.10.0
TF Hub version:  0.15.0
GPU available (YESSSSSS!!!!!!!)


In [840]:
#Verified TensorFlow can run an operation on my GPU

# Create a simple TensorFlow operation
a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
c = tf.matmul(a, b)

# Run the operation on the GPU
with tf.device('/GPU:0'):
    print(c)

tf.Tensor(
[[22. 28.]
 [49. 64.]], shape=(2, 2), dtype=float32)


In [841]:
#Number of GPU's available is 1 which is True

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [842]:
#Remove attention mask as it's not needed for training and causes errors during training related to shape 

small_training_dataset=small_train_dataset#.remove_columns(column_names='attention_mask')

#Remove labels as we will use this dataset for predictions and then comparing against ground truth labels 
small_evaluating_dataset=small_eval_dataset.remove_columns(column_names='labels')

In [843]:
#Inspect shape of training dataset 

small_training_dataset['labels'].shape

torch.Size([500, 155])

In [844]:
#Setup custom data collator which uses TokenClassification from huggingface library and padding remains True 

custom_data_collator=DataCollatorForTokenClassification(tokenizer,padding=True)

In [845]:
print(small_training_dataset)
print(small_evaluating_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})
Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 300
})


In [849]:
#'num_epochs' is number of passes over training set 

#optimizer adjusts  phi_model hyperparameters and uses learning rate 2e-4 

#Took learning rate from 2.3 training details section in research paper titled 'Textbooks are all you need II: phi_1.5 technical report'

#'num_training_steps' defines how many times the model goes through the training data 

#num_epochs=32
#optimizer=AdamW(phi_model.parameters(), lr= 2e-4)
#num_training_steps = num_epochs * len(dataloader)
#lr_scheduler=get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [850]:
#Set up TrainingArguments and assign to training_args to be used in Trainer instance

#At every 8 epochs we will be returned a new training loss 

#Expirementing with "10e-4" learning rate for this model

training_args=TrainingArguments(output_dir="test_trainer", per_device_train_batch_size=8, evaluation_strategy='epoch', metric_for_best_model=metric_name, learning_rate=10e-4, num_train_epochs=32)

In [931]:
#Define function to compute metrics once model makes predictions and compare with ground truth labels using following metrics 

def compute_metrics(p, labels, threshold=0.8):
    probabilities=torch.nn.functional.softmax(torch.tensor(p), dim=1)
    binary_predictions = (probabilities > threshold).int()
    binary_labels = (labels>threshold).int()
    
    keyword_metrics = {f'keyword_{i}': {} for i in range(155)}

    for i in range(155):
        keyword_predictions=binary_predictions[:, i]
        keyword_labels=binary_labels[:,i]
        #Compute accuracy
        accuracy = accuracy_score(keyword_labels, keyword_predictions)

        #Compute precision, recall, and F1 score for each label
        precision = precision_score(keyword_labels, keyword_predictions, zero_division='warn')
        recall= recall_score(keyword_labels, keyword_predictions, zero_division='warn')
        f1 = f1_score(keyword_labels, keyword_predictions, zero_division='warn')

        #Store metrics for the keyword
        keyword_metrics[f'keyword_{i}']['accuracy']= accuracy
        keyword_metrics[f'keyword_{i}']['precision'] = precision
        keyword_metrics[f'keyword_{i}']['recall'] = recall
        keyword_metrics[f'keyword_{i}']['f1_score'] = f1 
    # Compute multilabel confusion matrix
    confusion_matrices = multilabel_confusion_matrix(binary_labels, binary_predictions)

    
    #Calculate macro averages 
    macro_precision = np.mean([keyword_metrics[f'keyword_{i}']['precision'] for i in range(155)])
    macro_recall = np.mean([keyword_metrics[f'keyword_{i}']['recall'] for i in range(155)])
    macro_f1 = np.mean([keyword_metrics[f'keyword_{i}']['f1_score'] for i in range(155)])
    
    # Store macro-averaged metrics
    keyword_metrics['confusion_matrices']=confusion_matrices.tolist()
    keyword_metrics['macro_precision'] = macro_precision
    keyword_metrics['macro_recall'] = macro_recall
    keyword_metrics['macro_f1'] = macro_f1


    return keyword_metrics


In [852]:
#Instantiate Trainer object to be stored in trainer 
#No need for paranthesis around compute_metrics as data is passed through and handled by Trainer instance

trainer=Trainer(
    model=phi_model,
    args=training_args,
    train_dataset=small_training_dataset,
    eval_dataset=small_evaluating_dataset,
    compute_metrics=compute_metrics,
    data_collator=custom_data_collator

)

In [853]:
#Finetuning model on pretask of keyword presence 
trainer.train()



You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log
4,No log,No log
5,No log,No log
6,No log,No log
7,No log,No log
8,0.129300,No log
9,0.129300,No log
10,0.129300,No log


TrainOutput(global_step=2016, training_loss=0.09551320644834685, metrics={'train_runtime': 56475.5294, 'train_samples_per_second': 0.283, 'train_steps_per_second': 0.036, 'total_flos': 3319788400320000.0, 'train_loss': 0.09551320644834685, 'epoch': 32.0})

In [854]:
#Print log of training history as a DataFrame

trainer_log_history=pd.DataFrame(trainer.state.log_history)
print(trainer_log_history)

    eval_runtime  eval_samples_per_second  eval_steps_per_second  epoch  step  \
0       358.2233                    0.837                  0.106   1.00    63   
1       334.5547                    0.897                  0.114   2.00   126   
2       332.8282                    0.901                  0.114   3.00   189   
3       337.8026                    0.888                  0.112   4.00   252   
4       336.5980                    0.891                  0.113   5.00   315   
5       349.2706                    0.859                  0.109   6.00   378   
6       340.0182                    0.882                  0.112   7.00   441   
7            NaN                      NaN                    NaN   7.94   500   
8       344.3980                    0.871                  0.110   8.00   504   
9       345.5652                    0.868                  0.110   9.00   567   
10      346.9540                    0.865                  0.110  10.00   630   
11      345.2732            

In [855]:
trainer_log_history

Unnamed: 0,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch,step,loss,learning_rate,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
0,358.2233,0.837,0.106,1.0,63,,,,,,,
1,334.5547,0.897,0.114,2.0,126,,,,,,,
2,332.8282,0.901,0.114,3.0,189,,,,,,,
3,337.8026,0.888,0.112,4.0,252,,,,,,,
4,336.598,0.891,0.113,5.0,315,,,,,,,
5,349.2706,0.859,0.109,6.0,378,,,,,,,
6,340.0182,0.882,0.112,7.0,441,,,,,,,
7,,,,7.94,500,0.1293,0.000752,,,,,
8,344.398,0.871,0.11,8.0,504,,,,,,,
9,345.5652,0.868,0.11,9.0,567,,,,,,,


In [871]:
#Make predictions on evaluation dataset
predictions = trainer.predict(small_evaluating_dataset)

In [872]:
#Take ground truth labels from small_eval_dataset so we can compare with the examples we took from the same dataset and the labels our model generated for them

ground_truth_labels=small_eval_dataset['labels']

In [889]:
ground_truth_labels.shape

torch.Size([300, 155])

In [900]:
predictions.predictions.shape

(300, 406, 155)

In [912]:
#Take an aggregate of the predictions using the mean and 'axis=1' In order to compute metrics between ground truth labels and predicted labels 
aggregated_predictions = np.mean(predictions.predictions, axis=1)

In [914]:
aggregated_predictions.shape

(300, 155)

In [932]:
metrics=compute_metrics(p=aggregated_predictions, labels=ground_truth_labels, threshold=0.5)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(av

In [929]:
#Since divide by zero error occurs since we encounter 0 true positives, 0 true negatives etc; Precision, Recall, F1-Score will be set to 0
print("Macro Precision:", metrics['macro_precision'])
print("Macro Recall:", metrics['macro_recall'])
print("Macro F1-Score:", metrics['macro_f1'])


Macro Precision: 0.0
Macro Recall: 0.0
Macro F1-Score: 0.0


In [933]:
#Metrics computed for each keyword. Accuracy is a o non zero value and Confusion matrices give insight on true positive, true negative, false positive and false negative for each label/keyword

metrics

{'keyword_0': {'accuracy': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 'keyword_1': {'accuracy': 0.9966666666666667,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 'keyword_2': {'accuracy': 0.9966666666666667,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 'keyword_3': {'accuracy': 1.0,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 'keyword_4': {'accuracy': 0.9866666666666667,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 'keyword_5': {'accuracy': 0.99,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 'keyword_6': {'accuracy': 1.0,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 'keyword_7': {'accuracy': 0.9966666666666667,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 'keyword_8': {'accuracy': 0.9933333333333333,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 'keyword_9': {'accuracy': 1.0,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 'keyword_10': {'accuracy': 1.0,
 

In [1093]:
#Unicode Emoji Chart Link:  https://unicode.org/emoji/charts/full-emoji-list.html

In [938]:
#Unicode printed emoji example 
print("\U0001F602")

😂


In [1209]:
#Define function to generate emojis when input text is given 
#Use with torch.no_grad() as we are no longer training the model and have no need for computing gradient when making predictions 
#Currently using threshold of 0.3 

def generate_emojis(input_text): 
    
    inputs=tokenizer(input_text, return_tensors='pt')
    
    with torch.no_grad():
        outputs=phi_model(**inputs)
    
    #Model outputs binary presence probabilities
    keyword_probabilities=torch.sigmoid(outputs.logits)
    
    print("Keyword Probabilities:", keyword_probabilities.tolist())
    #set threshold to determine if keyword is present or not 
    threshold = 0.3
    
    keyword_present=(keyword_probabilities>threshold).tolist()[0]
    
    #Defined dictionary mapping keyword IDs to emojis for 20 emojis in our list 
    
    keyword_to_emoji={
        "funny": "\U0001F602",
        "laughter": "\U0001F602",
         "joke": "\U0001F602",
         "comedian": "\U0001F602",
        "sad": "\U0001F62D", 
         "upset":  "\U0001F62D", 
         "crying":  "\U0001F62D", 
         "hurt":  "\U0001F62D",
        "thinking":"\U0001F914" , 
         "question": "\U0001F914", 
         "learning": "\U0001F914", 
         "curious": "\U0001F914", 
         "mystery": "\U0001F914", 
         "student": "\U0001F914", 
        "school": "\U0001F914", 
         "happy": "\U0001F604", 
         "joy": "\U0001F604", 
         "smile": "\U0001F604",
        "celebration":"\U0001F973", 
         "party":"\U0001F973", 
         "birthday":"\U0001F973",
         "scared": "\U0001F616", 
         "worried": "\U0001F616", 
         "concerned": "\U0001F616", 
        "angry": "\U0001F620", 
         "mad":  "\U0001F620", 
         "upset":  "\U0001F620", 
         "hatred":  "\U0001F620", 
         "frustrated":  "\U0001F620",
         "shocked": "\U0001F632", 
         "OMG":  "\U0001F632", 
        "surprise":  "\U0001F632", 
        "exasperated":  "\U0001F632",
        "computer": "\U0001F4BB", 
        "technology": "\U0001F4BB", 
        "social media": "\U0001F4BB", 
        "money": "\U0001F4B0", 
        "job": "\U0001F4B0",
        "stocks": "\U0001F4B0",
          "sleepy": "\U0001F62A", 
         "tired": "\U0001F62A",
          "sleep": "\U0001F62A", 
         "bed": "\U0001F62A",
        "night": "\U0001F62A", 
        "gross": "\U0001F922", 
        "disgusting": "\U0001F922",
         "throw up": "\U0001F922",
         "puke": "\U0001F922",
         "queasy": "\U0001F922",
        "dead": "\U0001F480",
        "deathly":  "\U0001F480", 
         "poison":  "\U0001F480", 
         "fossil":  "\U0001F480", 
         "afterlife":  "\U0001F480",
        "good": "\U0001F44D", 
         "great":  "\U0001F44D", 
         "ok":  "\U0001F44D", 
         "agreed":  "\U0001F44D",
         "bad":"\U0001F44E"  , 
        "no": "\U0001F44E", 
         "disagree": "\U0001F44E",
         "empowerment": "\U0000270A", 
        "movement": "\U0000270A", 
        "teamwork": "\U0001F91D", 
         "agreement": "\U0001F91D",
         "workout": "\U0001F4AA", 
        "sports": "\U0001F4AA", 
         "athlete": "\U0001F4AA", 
        "gym": "\U0001F4AA", 
        "muscle": "\U0001F4AA",
        "religion": "\U0001F64F", 
         "prayer": "\U0001F64F", 
         "praise": "\U0001F64F", 
         "beg": "\U0001F64F",
         "beauty": "\U0001F484", 
         "cosmetics": "\U0001F484", 
         "fashion": "\U0001F484", 
    }
#End of keyword:emoji dictionary mapping 

    unique_emojis = set()

    for keyword, present in zip(keyword_to_emoji.keys(), keyword_present): 
        if present: 
            unique_emojis.add(keyword_to_emoji[keyword])
        
    emoji_string=" ".join(unique_emojis)
    
    return emoji_string



In [1219]:
#Examples below are input text given to the model and the models generated emoji's for the input text it's given
#So we have 20 unique emojis in our dictionary that we can print out

In [1190]:
first_input_text="Yesterday, I went to a comedy show with my friends. The comedian told a hilarious joke, and we couldn't stop laughing."

In [1191]:
emojis=generate_emojis(first_input_text)

Keyword Probabilities: [[[0.999977707862854, 0.9984548091888428, 0.001673768856562674, 0.001052561099641025, 0.0012714093318209052, 0.0012006756151095033, 0.0011677369475364685, 0.0011858445359393954, 0.0012059491127729416, 0.0013874024152755737, 0.0015631053829565644, 0.001815136638469994, 0.0016118361381813884, 0.001217750133946538, 0.0011663450859487057, 0.0017968673491850495, 0.0013906856765970588, 0.001233779708854854, 0.0014105049194768071, 0.001081214053556323, 0.0011412248713895679, 0.00164668052457273, 0.0012144361389800906, 0.0017413814784958959, 0.0016893413849174976, 0.001068281359039247, 0.001254730741493404, 0.00137770373839885, 0.0011623367900028825, 0.0015839277766644955, 0.001236715237610042, 0.0015689656138420105, 0.0016435760771855712, 0.0009494214900769293, 0.0012152072740718722, 0.0010167679283767939, 0.0013002414489164948, 0.0010335958795621991, 0.0013819930609315634, 0.0012088962830603123, 0.0015746481949463487, 0.0014066630974411964, 0.001446651527658105, 0.0012

In [1192]:
print(first_input_text)

Yesterday, I went to a comedy show with my friends. The comedian told a hilarious joke, and we couldn't stop laughing.


In [1193]:
print(emojis)
#Few emojis are relevant while others should't have been generated in this case

😠 🤔 😂 😄 🥳 😲 😭 😖


In [1194]:
second_input_text = "I just invested in some technology stocks, and the market is doing great!"

In [1195]:
print(second_input_text)

I just invested in some technology stocks, and the market is doing great!


In [1196]:
emojis_text_two=generate_emojis(second_input_text)


Keyword Probabilities: [[[0.999977707862854, 0.9984548091888428, 0.001673768856562674, 0.001052560517564416, 0.0012714093318209052, 0.0012006749166175723, 0.001167737995274365, 0.0011858445359393954, 0.0012059491127729416, 0.0013874010182917118, 0.0015631053829565644, 0.0018151358235627413, 0.0016118345083668828, 0.001217750133946538, 0.0011663450859487057, 0.0017968673491850495, 0.0013906843960285187, 0.001233779126778245, 0.0014105042209848762, 0.001081213471479714, 0.0011412248713895679, 0.00164668052457273, 0.001214436604641378, 0.0017413814784958959, 0.0016893420834094286, 0.001068281359039247, 0.001254730741493404, 0.0013777024578303099, 0.0011623379541561007, 0.0015839269617572427, 0.001236715936101973, 0.0015689656138420105, 0.0016435760771855712, 0.000949422421399504, 0.0012152079725638032, 0.0010167689761146903, 0.001300242030993104, 0.0010335958795621991, 0.0013819916639477015, 0.0012088973307982087, 0.0015746481949463487, 0.0014066636795178056, 0.001446651527658105, 0.00121

In [1197]:
print(emojis_text_two)
#Performs poorly on this sentence and doesn't output any relevant emojis

😠 🤔 😂 😄 😭


In [1198]:
third_input_text = "Yesterday, I attended a religious ceremony at the temple. It was a moment of great spirituality and praise. People gathered to offer their prayers and seek blessings. The atmosphere was serene and peaceful, and it filled my heart with a sense of goodness. However, there was a strange incident later in the day. I stumbled upon a dead animal on the path, which was quite gross and unsettling. It was a stark contrast to the earlier positive and spiritual experience"

In [1199]:
print(third_input_text)

Yesterday, I attended a religious ceremony at the temple. It was a moment of great spirituality and praise. People gathered to offer their prayers and seek blessings. The atmosphere was serene and peaceful, and it filled my heart with a sense of goodness. However, there was a strange incident later in the day. I stumbled upon a dead animal on the path, which was quite gross and unsettling. It was a stark contrast to the earlier positive and spiritual experience


In [1200]:
emojis_text_three=generate_emojis(third_input_text)

Keyword Probabilities: [[[0.999977707862854, 0.9984548091888428, 0.0016737680416554213, 0.001052560517564416, 0.0012714093318209052, 0.0012006760807707906, 0.001167737995274365, 0.0011858445359393954, 0.0012059485306963325, 0.0013874024152755737, 0.0015631053829565644, 0.0018151358235627413, 0.0016118361381813884, 0.0012177495518699288, 0.0011663445038720965, 0.0017968673491850495, 0.0013906843960285187, 0.001233779126778245, 0.0014105035224929452, 0.0010812130058184266, 0.0011412248713895679, 0.00164668052457273, 0.0012144361389800906, 0.0017413814784958959, 0.0016893420834094286, 0.0010682803113013506, 0.0012547295773401856, 0.0013777024578303099, 0.0011623367900028825, 0.0015839263796806335, 0.001236714655533433, 0.0015689656138420105, 0.0016435767756775022, 0.0009494214900769293, 0.0012152072740718722, 0.0010167679283767939, 0.001300242030993104, 0.0010335958795621991, 0.0013819930609315634, 0.0012088968651369214, 0.0015746481949463487, 0.0014066630974411964, 0.001446650130674243, 

In [1201]:
print(emojis_text_three)
#Outputs all emojis available in dictionary. Very off target with this example. 

🤔 🥳 🤝 😪 😂 👎 🤢 😲 😭 😖 🙏 💪 😄 👍 😠 💻 💄 💀 💰 ✊


In [1202]:
fourth_input_text = "Today's stock market is doing great, and investors are celebrating their profits. However, some are worried about the future of technology stocks. It's a mix of emotions in the finance world!"
emojis_text_four=generate_emojis(fourth_input_text)


Keyword Probabilities: [[[0.999977707862854, 0.9984548091888428, 0.0016737697878852487, 0.0010525615653023124, 0.0012714093318209052, 0.0012006749166175723, 0.0011677369475364685, 0.0011858439538627863, 0.0012059496948495507, 0.0013874017167836428, 0.0015631038695573807, 0.001815136638469994, 0.0016118361381813884, 0.0012177489697933197, 0.0011663440382108092, 0.0017968673491850495, 0.0013906829990446568, 0.001233779126778245, 0.0014105035224929452, 0.001081213471479714, 0.0011412248713895679, 0.0016466790111735463, 0.0012144349748268723, 0.0017413798486813903, 0.0016893405700102448, 0.001068281359039247, 0.001254730741493404, 0.001377701759338379, 0.0011623373720794916, 0.0015839263796806335, 0.001236714655533433, 0.0015689656138420105, 0.0016435744473710656, 0.0009494214900769293, 0.001215206808410585, 0.0010167679283767939, 0.0013002407504245639, 0.0010335969273000956, 0.0013819930609315634, 0.0012088973307982087, 0.0015746488934382796, 0.0014066630974411964, 0.0014466509455814958, 

In [1203]:
print(fourth_input_text)

Today's stock market is doing great, and investors are celebrating their profits. However, some are worried about the future of technology stocks. It's a mix of emotions in the finance world!


In [1204]:
print(emojis_text_four)
#It's able to predict computer and moneybag which correspond to the text well, but other emojis aren't as relevant 

😠 🤔 😂 😄 🥳 💻 😲 😭 😖 💰 😪


In [1101]:
import os


In [1102]:
#Saving current model weights and parameters so it can be imported for future use without needing to retrain

#Saving tokenizer configuration to be imported for future use 


save_directory = 'C:\\Users\\tdcap\\MICROSOFT AI PROJECT'

# Ensure that the directory exists; if not, create it
os.makedirs(save_directory, exist_ok=True)

# Save the model's state_dict to a file within the directory
model_save_path = os.path.join(save_directory, 'model_state.pth')
torch.save(phi_model.state_dict(), model_save_path)

# Save the tokenizer's configuration and vocabulary to the same directory
tokenizer_save_path = os.path.join(save_directory, 'tokenizer')
tokenizer.save_pretrained(tokenizer_save_path)

('C:\\Users\\tdcap\\MICROSOFT AI PROJECT\\tokenizer\\tokenizer_config.json',
 'C:\\Users\\tdcap\\MICROSOFT AI PROJECT\\tokenizer\\special_tokens_map.json',
 'C:\\Users\\tdcap\\MICROSOFT AI PROJECT\\tokenizer\\vocab.txt',
 'C:\\Users\\tdcap\\MICROSOFT AI PROJECT\\tokenizer\\added_tokens.json',
 'C:\\Users\\tdcap\\MICROSOFT AI PROJECT\\tokenizer\\tokenizer.json')

In [1133]:
fifth_input_text="In ancient times, the discovery of a fossil was considered a window into the past, offering insights into the creatures that roamed the Earth millions of years ago. The concept of an afterlife has been a topic of discussion in many cultures, with beliefs in various forms of existence beyond death. Sports and athleticism have always been a significant part of human history, with athletes training rigorously to excel in competitions. The gym is where many people go to build strength, endurance, and maintain a healthy lifestyle. Fashion trends have evolved over the years, with different eras known for their unique styles and clothing choices."
emojis_text_five=generate_emojis(fifth_input_text)

Keyword Probabilities: [[[0.999977707862854, 0.9984548091888428, 0.0016737650148570538, 0.0010525590041652322, 0.0012714111944660544, 0.0012006772449240088, 0.0011677375296130776, 0.001185846165753901, 0.0012059469008818269, 0.0013874017167836428, 0.0015631053829565644, 0.001815136638469994, 0.0016118391649797559, 0.0012177478056401014, 0.0011663418263196945, 0.0017968639731407166, 0.0013906856765970588, 0.0012337767984718084, 0.0014105042209848762, 0.0010812145192176104, 0.0011412248713895679, 0.0016466828528791666, 0.0012144384672865272, 0.0017413798486813903, 0.0016893396386876702, 0.0010682824067771435, 0.0012547282967716455, 0.0013776997802779078, 0.0011623362079262733, 0.0015839263796806335, 0.001236714655533433, 0.0015689647989347577, 0.0016435744473710656, 0.000949422421399504, 0.0012152079725638032, 0.0010167683940380812, 0.0013002414489164948, 0.0010335973929613829, 0.0013819936430081725, 0.0012088939547538757, 0.0015746488934382796, 0.0014066630974411964, 0.00144665152765810

In [1104]:
print(fifth_input_text)

In ancient times, the discovery of a fossil was considered a window into the past, offering insights into the creatures that roamed the Earth millions of years ago. The concept of an afterlife has been a topic of discussion in many cultures, with beliefs in various forms of existence beyond death. Sports and athleticism have always been a significant part of human history, with athletes training rigorously to excel in competitions. The gym is where many people go to build strength, endurance, and maintain a healthy lifestyle. Fashion trends have evolved over the years, with different eras known for their unique styles and clothing choices.


In [1134]:
print(emojis_text_five)
#Another example where it outputs mostly all emojis in the dictionary 

🤔 🥳 🤝 😪 😂 👎 🤢 😲 😭 😖 🙏 💪 😄 👍 😠 💻 💄 💀 💰 ✊


In [1135]:
sixth_input_text= "I have always been fascinated by sports, and I aspire to become a professional athlete someday. To achieve my goal, I spend hours training at the gym every day."
emojis_text_six=generate_emojis(sixth_input_text)


Keyword Probabilities: [[[0.999977707862854, 0.9984548091888428, 0.0016737680416554213, 0.001052560517564416, 0.0012714093318209052, 0.0012006760807707906, 0.001167737995274365, 0.0011858445359393954, 0.0012059491127729416, 0.0013874017167836428, 0.0015631045680493116, 0.0018151375697925687, 0.0016118361381813884, 0.001217750133946538, 0.0011663450859487057, 0.0017968682805076241, 0.0013906850945204496, 0.001233779126778245, 0.0014105049194768071, 0.0010812130058184266, 0.0011412259191274643, 0.00164668052457273, 0.0012144355569034815, 0.0017413814784958959, 0.0016893420834094286, 0.001068281359039247, 0.001254730741493404, 0.0013777024578303099, 0.0011623373720794916, 0.0015839263796806335, 0.001236714655533433, 0.0015689656138420105, 0.0016435760771855712, 0.000949422421399504, 0.001215206808410585, 0.0010167679283767939, 0.0013002414489164948, 0.0010335969273000956, 0.0013819930609315634, 0.0012088968651369214, 0.0015746481949463487, 0.0014066630974411964, 0.001446651527658105, 0.00

In [1136]:
print(emojis_text_six)
#No relevant emojis generated for this input 

😠 🤔 😂 😄 🥳 💻 😲 😭 😖 💰


In [1139]:
seventh_input_text= "LeBron James delivered a stunning triple-double performance in last night's basketball game, leading his team to a thrilling overtime victory."
emojis_text_seven= generate_emojis(seventh_input_text)

Keyword Probabilities: [[[0.999977707862854, 0.9984548091888428, 0.0016737697878852487, 0.0010525615653023124, 0.0012714093318209052, 0.001200674450956285, 0.001167737995274365, 0.0011858445359393954, 0.0012059485306963325, 0.0013874024152755737, 0.0015631053829565644, 0.001815136638469994, 0.0016118369530886412, 0.0012177495518699288, 0.0011663450859487057, 0.001796869095414877, 0.0013906856765970588, 0.001233779708854854, 0.0014105049194768071, 0.0010812130058184266, 0.0011412259191274643, 0.0016466813394799829, 0.0012144361389800906, 0.0017413814784958959, 0.0016893413849174976, 0.0010682818247005343, 0.001254730741493404, 0.00137770373839885, 0.0011623379541561007, 0.0015839277766644955, 0.001236715237610042, 0.0015689668944105506, 0.0016435760771855712, 0.0009494228288531303, 0.0012152079725638032, 0.0010167694417759776, 0.0013002407504245639, 0.0010335964616388083, 0.0013819943415001035, 0.0012088962830603123, 0.0015746481949463487, 0.0014066630974411964, 0.0014466509455814958, 0

In [1140]:
print(emojis_text_seven)
#Here we see it predict false positives again but it does predict celebration emoji which corresponds to victory although victory isn't one of our key words

😠 🤔 😂 😄 🥳 😲 😭 😖


In [1210]:
sample_text_eight="Just finished a great workout at the gym."
emojis_text_eight=generate_emojis(sample_text_eight)
#Very off for this text, possibly due to how brief it is


Keyword Probabilities: [[[0.999977707862854, 0.9984548091888428, 0.0016737665282562375, 0.0010525600519031286, 0.0012714117765426636, 0.0012006784090772271, 0.0011677375296130776, 0.001185846165753901, 0.0012059469008818269, 0.0013874036958441138, 0.0015631053829565644, 0.001815136638469994, 0.0016118399798870087, 0.0012177483877167106, 0.0011663434561342, 0.0017968673491850495, 0.0013906870735809207, 0.0012337756343185902, 0.0014105049194768071, 0.001081213471479714, 0.0011412253370508552, 0.0016466836677864194, 0.0012144373031333089, 0.001741380780003965, 0.0016893405700102448, 0.0010682828724384308, 0.0012547301594167948, 0.0013777004787698388, 0.001162335742264986, 0.0015839277766644955, 0.001236715936101973, 0.0015689656138420105, 0.0016435744473710656, 0.0009494219557382166, 0.0012152085546404123, 0.0010167683940380812, 0.001300242030993104, 0.0010335973929613829, 0.0013819923624396324, 0.001208895118907094, 0.0015746488934382796, 0.0014066630974411964, 0.0014466509455814958, 0.0

In [1211]:
print(emojis_text_eight)

😠 🤔 😂 😭


In [1212]:
sample_text_nine="Artificial intelligence is revolutionizing the way we interact with technology."
emojis_text_nine=generate_emojis(sample_text_nine)


Keyword Probabilities: [[[0.999977707862854, 0.9984548091888428, 0.0016737657133489847, 0.0010525590041652322, 0.0012714104959741235, 0.0012006784090772271, 0.0011677369475364685, 0.0011858451180160046, 0.001205946202389896, 0.0013874036958441138, 0.0015631045680493116, 0.0018151350086554885, 0.001611837767995894, 0.0012177472235634923, 0.001166342874057591, 0.0017968665342777967, 0.0013906863750889897, 0.0012337756343185902, 0.0014105049194768071, 0.001081213471479714, 0.0011412248713895679, 0.0016466836677864194, 0.001214436604641378, 0.001741380780003965, 0.0016893405700102448, 0.0010682824067771435, 0.0012547288788482547, 0.0013777004787698388, 0.001162335160188377, 0.0015839269617572427, 0.001236714655533433, 0.0015689641004428267, 0.0016435729339718819, 0.0009494214900769293, 0.0012152079725638032, 0.0010167674627155066, 0.0013002407504245639, 0.0010335973929613829, 0.0013819916639477015, 0.0012088945368304849, 0.0015746481949463487, 0.0014066630974411964, 0.001446650130674243, 0

In [1213]:
print(emojis_text_nine)
#Only predicts these dominant emojis that keep showing up in every emoji generation task the model is given for an input text

😠 🤔 😂 😭


In [1215]:
sample_text_ten="Cryptocurrency markets are surging, with many investors rejoicing over their gains, while others express concerns about potential volatility. It's a mixed sentiment in the world of digital assets!"
emojis_text_ten=generate_emojis(sample_text_ten)

Keyword Probabilities: [[[0.999977707862854, 0.9984548091888428, 0.0016737697878852487, 0.0010525615653023124, 0.0012714104959741235, 0.0012006749166175723, 0.001167737995274365, 0.0011858445359393954, 0.0012059496948495507, 0.0013874024152755737, 0.0015631053829565644, 0.001815136638469994, 0.001611837767995894, 0.0012177495518699288, 0.0011663450859487057, 0.0017968682805076241, 0.0013906843960285187, 0.001233779126778245, 0.0014105042209848762, 0.001081213471479714, 0.0011412259191274643, 0.00164668052457273, 0.0012144361389800906, 0.0017413814784958959, 0.0016893413849174976, 0.001068281359039247, 0.0012547319056466222, 0.0013777024578303099, 0.0011623379541561007, 0.0015839269617572427, 0.001236714655533433, 0.0015689656138420105, 0.0016435752622783184, 0.000949422421399504, 0.001215206808410585, 0.0010167689761146903, 0.0013002414489164948, 0.0010335969273000956, 0.0013819936430081725, 0.0012088973307982087, 0.0015746488934382796, 0.0014066630974411964, 0.001446651527658105, 0.00

In [1216]:
print(emojis_text_ten)

#Another lengthier example about tech and this time AI shows the model generated a computer and moneybag emoji
#Disgusting green face, worried, and shocked emoji(possibly related to "concerns about volatility") and ("markets surging")
#Celeberation emoji generated because of seeing in text "investors rejoicing"?
#

😠 🤔 😂 😄 🥳 💻 😲 😭 😖 🤢 💰 😪


In [1220]:
#Notes: 

#Model appears to struggle predicting emojis for brief sentences. It prints the same 4 emojis that correspond to I assume overrepresented keywords 

#However 3-6 sentence texts like the one above while generating irrelevant emojis still, also generates some relevant ones too that it hasn't for other text.

#Specifically the computer and money bag which it didn't generate for input 9 we gave the model. However the computer and moneybag were generated for input 10 which is appropriate for this  text. 

#Will have to review dataset and numbers of labels for each class and modify dataset to be balanced representation of each class label and retrain model

#Current hypothesis is that their is a clear class imbalance causing certain emojis to be constantly generated while others aren't generated at all despite the text 

#This would be due to keyword presence and not filtering dataset to have equal representation of each keyword_label 

#Once model is retrained will reattempt, generating emojis

#Overrepresented emojis/Emojis that are generated often while not being relevant to the text, (laughing, thinking, angry, happy, sad, tired, emoji)

#Next Steps:

#Go back to dataset and manipulate it further so it has close to equal amount of representation for each keyword_label
#Setup GPU acceleration to shorten training time and try out different parameters/training arguments 
#Learn more about TokenClassification
#Set Up/Calculate validation loss while running trainer.train