In [1]:
import numpy as np
import pandas as pd 
import os 
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, default_data_collator
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import get_scheduler
from torch.optim import AdamW


#pip install einops
#package required for downloading transformer 
#https://huggingface.co/datasets/cnn_dailymail link to dataset




In [2]:
dataset = load_dataset("cnn_dailymail",'3.0.0', split='train')
#Run loading script once located in same directory as python notebook
#Specify configuration version '3.0.0'
#load dataset using script because dataset is very large

In [3]:
df=pd.DataFrame(dataset)
#cast dataset into pandas dataframe object type 

In [4]:
df.head(10)

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a
5,"BAGHDAD, Iraq (CNN) -- Dressed in a Superman s...","Parents beam with pride, can't stop from smili...",a1ebb8bb4d370a1fdf28769206d572be60642d70
6,"BAGHDAD, Iraq (CNN) -- The women are too afrai...","Aid workers: Violence, increased cost of livin...",7c0e61ac829a3b3b653e2e3e7536cc4881d1f264
7,"BOGOTA, Colombia (CNN) -- A key rebel commande...",Tomas Medina Caracas was a fugitive from a U.S...,f0d73bdab711763e745cdc75850861c9018f235d
8,WASHINGTON (CNN) -- White House press secretar...,"President Bush says Tony Snow ""will battle can...",5e22bbfc7232418b8d2dd646b952e404df5bd048
9,(CNN) -- Police and FBI agents are investigati...,Empty anti-tank weapon turns up in front of Ne...,613d6311ec2c1985bd44707d1796d275452fe156


In [5]:
df.drop(columns=['article','id'], inplace=True)
#drop article colummn from dataset

In [6]:
df.head(10)

Unnamed: 0,highlights
0,Harry Potter star Daniel Radcliffe gets £20M f...
1,Mentally ill inmates in Miami are housed on th...
2,"NEW: ""I thought I was going to die,"" driver sa..."
3,"Five small polyps found during procedure; ""non..."
4,"NEW: NFL chief, Atlanta Falcons owner critical..."
5,"Parents beam with pride, can't stop from smili..."
6,"Aid workers: Violence, increased cost of livin..."
7,Tomas Medina Caracas was a fugitive from a U.S...
8,"President Bush says Tony Snow ""will battle can..."
9,Empty anti-tank weapon turns up in front of Ne...


In [7]:
#Download tokenizer associated with "microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5",  from_tf=False,trust_remote_code=True)

In [8]:
tokenizer.pad_token = tokenizer.eos_token

In [9]:
#Take 1000 examples from our dataframe to use for training
pretask_dataset=df.iloc[0:1000].copy()

#tokenize pretask_dataset 'article' column and turn words into subword tokens for 'article' column stored in new 
#column name 'article_tokenized'

#apply across article column of pretask_dataset tokenizer and store results for each example in corresponing row and new column
#name 'article_tokenized'
#pretask_dataset['article_'+'tokenized']=pretask_dataset['article'].apply(lambda x: tokenizer.tokenize(x, truncation=True))

#same procedure except new column name 'highlights_tokenized'
pretask_dataset['highlights_'+'tokenized']=pretask_dataset['highlights'].apply(lambda x: tokenizer.tokenize(x, truncation=True))


#peak at pretask_dataset
pretask_dataset.head(1)

#must use 'lambda x: ' function with tokenizer.tokenize(x) to apply tokenization to all rows 
#no error was outputted
    

Unnamed: 0,highlights,highlights_tokenized
0,Harry Potter star Daniel Radcliffe gets £20M f...,"[Harry, ĠPotter, Ġstar, ĠDaniel, ĠRad, cliffe,..."


In [10]:
#Create new dataframe with only tokenized data of article and highlights 

columns_to_remove= ['highlights']
pretask_dataset_tokenized = pretask_dataset.drop(columns=columns_to_remove, axis=1, inplace=False).astype(str)
pretask_dataset_tokenized


Unnamed: 0,highlights_tokenized
0,"['Harry', 'ĠPotter', 'Ġstar', 'ĠDaniel', 'ĠRad..."
1,"['M', 'ent', 'ally', 'Ġill', 'Ġinmates', 'Ġin'..."
2,"['NEW', ':', 'Ġ""', 'I', 'Ġthought', 'ĠI', 'Ġwa..."
3,"['Five', 'Ġsmall', 'Ġpoly', 'ps', 'Ġfound', 'Ġ..."
4,"['NEW', ':', 'ĠNFL', 'Ġchief', ',', 'ĠAtlanta'..."
...,...
995,"['Mother', 'Ġof', 'Ġmurdered', 'Ġschool', 'boy..."
996,"['NEW', ':', 'ĠPope', 'ĠBenedict', 'ĠXVI', 'Ġa..."
997,"['Eight', 'ĠFlorida', 'Ġteens', 'Ġto', 'Ġbe', ..."
998,"['Judge', 'Ġon', 'ĠHeather', 'ĠMills', ':', 'Ġ..."


In [11]:
#Now work on creating pretask labels from keywords to align with "keyword presence" task

df_2=pd.read_csv('keywords.csv')

#make empty list
list1 = []

for word in df_2:
    for j in df_2[word]:
        list1.append(j)

print(list1)
    

['funny, laughter, joke, comedian', 'sad, upset, crying, hurt', 'happy, joy, smile', '\xa0thinking, question, learning, curious, mystery student, learning, school', 'celebration, party, birthday', 'scared, worried, concerned', 'angry, mad, upset, hatred, frustrated', 'shocked, OMG, surprise, exasperated', '\xa0computer, technology, social media', 'money, job, stocks', 'sleepy, tired, sleep, bed, night', 'gross, disgusting, throw up, puke, queasy', '\xa0dead, deathly, poison, fossil, afterlife', '\xa0good, great, ok, agreed', 'bad, no, disagree', '\xa0empowerment, movement', 'teamwork, agreement', '\xa0workout, sports, athlete, gym, muscle', 'religion, prayer, praise, beg', 'beauty, cosmetics, fashion', '\xa0baby, kid, child, young', '\xa0man', 'woman', ' elderly, old', 'love, relationship, significant other, marriage', 'construction, building', 'ocean, sea, ship, water', 'justice, law, government, politics', 'pet, animal, vet', 'nature, plants, farming, science, earth, recycle, go gree

In [12]:
characters_to_remove = '\xa0'

cleaned_list = [''.join(char for char in string if char not in characters_to_remove)for string in list1]

print(cleaned_list)

['funny, laughter, joke, comedian', 'sad, upset, crying, hurt', 'happy, joy, smile', 'thinking, question, learning, curious, mystery student, learning, school', 'celebration, party, birthday', 'scared, worried, concerned', 'angry, mad, upset, hatred, frustrated', 'shocked, OMG, surprise, exasperated', 'computer, technology, social media', 'money, job, stocks', 'sleepy, tired, sleep, bed, night', 'gross, disgusting, throw up, puke, queasy', 'dead, deathly, poison, fossil, afterlife', 'good, great, ok, agreed', 'bad, no, disagree', 'empowerment, movement', 'teamwork, agreement', 'workout, sports, athlete, gym, muscle', 'religion, prayer, praise, beg', 'beauty, cosmetics, fashion', 'baby, kid, child, young', 'man', 'woman', ' elderly, old', 'love, relationship, significant other, marriage', 'construction, building', 'ocean, sea, ship, water', 'justice, law, government, politics', 'pet, animal, vet', 'nature, plants, farming, science, earth, recycle, go green', 'music, musician, instrument

In [13]:
#create a list object with keywords that we will count the occurence of as a pretext task goal   

individual_words = []

for text in cleaned_list:
    words= text.split()
    individual_words.extend(words)
    
    
print(individual_words)

       

['funny,', 'laughter,', 'joke,', 'comedian', 'sad,', 'upset,', 'crying,', 'hurt', 'happy,', 'joy,', 'smile', 'thinking,', 'question,', 'learning,', 'curious,', 'mystery', 'student,', 'learning,', 'school', 'celebration,', 'party,', 'birthday', 'scared,', 'worried,', 'concerned', 'angry,', 'mad,', 'upset,', 'hatred,', 'frustrated', 'shocked,', 'OMG,', 'surprise,', 'exasperated', 'computer,', 'technology,', 'social', 'media', 'money,', 'job,', 'stocks', 'sleepy,', 'tired,', 'sleep,', 'bed,', 'night', 'gross,', 'disgusting,', 'throw', 'up,', 'puke,', 'queasy', 'dead,', 'deathly,', 'poison,', 'fossil,', 'afterlife', 'good,', 'great,', 'ok,', 'agreed', 'bad,', 'no,', 'disagree', 'empowerment,', 'movement', 'teamwork,', 'agreement', 'workout,', 'sports,', 'athlete,', 'gym,', 'muscle', 'religion,', 'prayer,', 'praise,', 'beg', 'beauty,', 'cosmetics,', 'fashion', 'baby,', 'kid,', 'child,', 'young', 'man', 'woman', 'elderly,', 'old', 'love,', 'relationship,', 'significant', 'other,', 'marriage'

In [14]:
#comma taken out of list and each word it's own value 

character_to_remove=','
cleaned_individual_words = [''.join(char for char in string if char not in character_to_remove)for string in individual_words]

print(cleaned_individual_words)


['funny', 'laughter', 'joke', 'comedian', 'sad', 'upset', 'crying', 'hurt', 'happy', 'joy', 'smile', 'thinking', 'question', 'learning', 'curious', 'mystery', 'student', 'learning', 'school', 'celebration', 'party', 'birthday', 'scared', 'worried', 'concerned', 'angry', 'mad', 'upset', 'hatred', 'frustrated', 'shocked', 'OMG', 'surprise', 'exasperated', 'computer', 'technology', 'social', 'media', 'money', 'job', 'stocks', 'sleepy', 'tired', 'sleep', 'bed', 'night', 'gross', 'disgusting', 'throw', 'up', 'puke', 'queasy', 'dead', 'deathly', 'poison', 'fossil', 'afterlife', 'good', 'great', 'ok', 'agreed', 'bad', 'no', 'disagree', 'empowerment', 'movement', 'teamwork', 'agreement', 'workout', 'sports', 'athlete', 'gym', 'muscle', 'religion', 'prayer', 'praise', 'beg', 'beauty', 'cosmetics', 'fashion', 'baby', 'kid', 'child', 'young', 'man', 'woman', 'elderly', 'old', 'love', 'relationship', 'significant', 'other', 'marriage', 'construction', 'building', 'ocean', 'sea', 'ship', 'water', '

In [15]:
#pretask goal: Label the instance for each keyword in the list "cleaned_individual_words" amongst 1000 text entry examples from cnn_dailynews

#create empty list to store dataframe
#articles_labels_df=pd.DataFrame()

#Loop through list of keywords
#Check if keyword is present amongst all rows for datapoint under article and if so label this as 1 and if not label 0
#create dataframe for each individual keyword and their occurence amongst all samples 
#add individual dataframes to list articles_dataframe 
#concatenate articles_dataframe with articles_labels_df so all individual dataframes are together in one dataframe serving as columns 

##
#for keyword in cleaned_individual_words:
 #   articles_keyword=pretask_dataset_tokenized['article_tokenized'].str.lower().str.contains(keyword.lower()).astype(int)
    
  #  articles_keyword=articles_keyword.rename('article_' + keyword)
   
   # articles_labels_df[articles_keyword.name] = articles_keyword

In [16]:
#articles_labels_df.head(2)

In [17]:
#Now we must count occurences of keyword in summary column of our dataframe and then join these two dataframes together
#together these will be the labels of our dataset that is text article entries and their summaries so a total of two features 

#So now use 'highlights' column and count occurences of keywords saved above to make a new column consisting of keyword +'highlights'

highlights_labels_df=pd.DataFrame()

for keyword in cleaned_individual_words:
    highlights_keyword = pretask_dataset_tokenized['highlights_tokenized'].str.lower().str.contains(keyword.lower()).astype(int)
    
    highlights_keyword=highlights_keyword.rename('highlights_' + keyword)
    
    highlights_labels_df[highlights_keyword.name] = highlights_keyword    



  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_keyword.name] = highlights_keyword
  highlights_labels_df[highlights_

In [18]:
#CHECKPOINT: Use row 198 to check if highlights datapoint for that row contains the word 'hurt'
pretask_dataset.iloc[198, 1]

['NEW',
 ':',
 'ĠThe',
 'ĠUnited',
 'ĠStates',
 'Ġis',
 'Ġoutraged',
 'Ġby',
 'Ġthe',
 'Ġattack',
 ',',
 'ĠSecretary',
 'Ġof',
 'ĠState',
 'ĠRice',
 'Ġsays',
 'Ġ.',
 'Ċ',
 'Car',
 'Ġbomb',
 'Ġstrikes',
 'ĠU',
 '.',
 'S',
 '.',
 'ĠEmbassy',
 'Ġvehicle',
 'Ġnorth',
 'Ġof',
 'ĠBeirut',
 'Ġ.',
 'Ċ',
 'Three',
 'ĠLebanese',
 'Ġcivilians',
 'Ġdead',
 ',',
 'ĠAmerican',
 'Ġand',
 'ĠLebanese',
 'Ġofficials',
 'Ġconfirm',
 'Ġ.',
 'Ċ',
 'Driver',
 'Ġof',
 'Ġthe',
 'Ġvehicle',
 'Ġwas',
 'Ġslightly',
 'Ġinjured',
 ',',
 'Ġand',
 'Ġthe',
 'Ġonly',
 'Ġpassenger',
 'Ġwas',
 'Ġnot',
 'Ġhurt',
 'Ġ.']

In [19]:
#Use mask to retreive rows which have the woord 'hurt' present in their highlights datapoint 

mask=highlights_labels_df['highlights_hurt']==1

examples_with_hurt_highlights=highlights_labels_df[mask]

#We see row 198 does have the keyword 'hurt' present so counting keyword occurences was successful
print("Rows with a value of 1")
examples_with_hurt_highlights


Rows with a value of 1


Unnamed: 0,highlights_funny,highlights_laughter,highlights_joke,highlights_comedian,highlights_sad,highlights_upset,highlights_crying,highlights_hurt,highlights_happy,highlights_joy,...,highlights_medicine,highlights_emergency,highlights_fire,highlights_firefighters,highlights_911,highlights_home,highlights_house,highlights_living,highlights_family,highlights_domestic
198,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
317,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
453,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
650,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
729,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
880,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
#Download transformer using "namespace/modelname" and trust_remote_code=True to allow downloading remote software 



#Use class name AutoModelForCausualLM 



phi_model= AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", from_tf=False, trust_remote_code=True)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [21]:
pretask_dataset_tokenized.head(1)

Unnamed: 0,highlights_tokenized
0,"['Harry', 'ĠPotter', 'Ġstar', 'ĠDaniel', 'ĠRad..."


In [22]:
#articles_labels_df.columns

In [23]:
new_df= pd.merge(pretask_dataset_tokenized, highlights_labels_df, left_index=True, right_index=True)

#new_df_2=pd.merge(new_df, highlights_labels_df, left_index=True, right_index=True)

In [24]:
raw_inputs=new_df['highlights_tokenized'].tolist()
tokenizer.pad_token = tokenizer.eos_token

inputs = tokenizer(raw_inputs,padding=True,truncation=True,return_tensors="pt")
print(inputs)

{'input_ids': tensor([[17816, 18308,  3256,  ..., 50256, 50256, 50256],
        [17816,    44,  3256,  ..., 50256, 50256, 50256],
        [17816, 13965,  3256,  ..., 50256, 50256, 50256],
        ...,
        [17816, 29571,  3256,  ..., 50256, 50256, 50256],
        [17816, 29511,  3256,  ..., 50256, 50256, 50256],
        [17816, 10364,  3256,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [25]:
#view tokenized input example
print(tokenizer.decode(inputs['input_ids'][0]))

['Harry', 'ĠPotter', 'Ġstar', 'ĠDaniel', 'ĠRad', 'cliffe', 'Ġgets', 'ĠÂ£', '20', 'M', 'Ġfortune', 'Ġas', 'Ġhe', 'Ġturns', 'Ġ18', 'ĠMonday', 'Ġ.', 'Ċ', 'Young', 'Ġactor', 'Ġsays', 'Ġhe', 'Ġhas', 'Ġno', 'Ġplans', 'Ġto', 'Ġf', 'rit', 'ter', 'Ġhis', 'Ġcash', 'Ġaway', 'Ġ.', 'Ċ', 'Rad', 'cliffe', "'s", 'Ġearnings', 'Ġfrom', 'Ġfirst', 'Ġfive', 'ĠPotter', 'Ġfilms', 'Ġhave', 'Ġbeen', 'Ġheld', 'Ġin', 'Ġtrust', 'Ġfund', 'Ġ.']<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftex

In [26]:
from datasets import Dataset

In [27]:
#Currently storing only labels after dropping tokenized text column
#Educatioal Code Cell
tokenized_dataset = Dataset.from_pandas(new_df.drop(columns='highlights_tokenized', axis=1))

In [28]:
#Educatioal Code Cell
#In order to access datapoint of huggingface compatible dataset you must index like a dictionary 
#so index using column name which is a list value for features key
#Index again into the row you want to retrieve the example of 

tokenized_dataset['highlights_funny'][0]

0

In [29]:
from datasets import ClassLabel, Value
import torch

In [30]:
#store binary labels from new_df in new variable named 'labels_df' by dropping only column that doesn't count labels
#highlights_tokenized column contains summary of article text that is tokenized

labels_df=new_df.drop(columns='highlights_tokenized', axis=1)

In [31]:
#extract all values from labels_df and store into object variable named binary_labels
binary_labels=labels_df.values

In [32]:
#Make binary_labels a tensor object of dtype torch.float32 to be compatible with making tensor dataset 
tensor_labels=torch.tensor(binary_labels, dtype=torch.float32)

In [33]:
type(tensor_labels)

torch.Tensor

In [34]:
type(inputs['input_ids'])

torch.Tensor

In [35]:
from torch.utils.data import TensorDataset, DataLoader

In [36]:
#Create tensor dataset now using TensorDataset and passing each tensor object through 

dataset= TensorDataset(inputs['input_ids'], inputs['attention_mask'], tensor_labels)

In [37]:
#make tensordataset named 'dataset' a huggingface compatible dataset by loading dataset using Dataset.from_dict
#Index into dataset starting from [0] then [1] then [2] which is the same order of how we made dataset above 
#Indexing like the above sentence into dataset retrieves the corresponding key values for the key names mentioned in the next comment
#Each key for dictionary needs to be in hugging face format so 'input_ids' & 'attention_mask' & 'labels' must be key names
#model recognizes appropriate input ids which are text examples that are tokenized 
#model recognizes appropriate attention_mask which is used to batch input sequence together 
#Convert each value returned from indexing into dataset as list to be compatible value for training process as
#training requires indexing into dictionary using key to retrieve value that IS NOT A DICTIONARY TYPE OBJECT 




hf_dataset = Dataset.from_dict({
    'input_ids': dataset.tensors[0].tolist(),
    'attention_mask': dataset.tensors[1].tolist(),
    'labels': dataset.tensors[2].tolist(),
})

#hf naming convention to say this object/variable with 'hf' in the name of the dataset is huggingface compatible now 

In [38]:
#Format dataset to be compatible with pytorch
hf_dataset.set_format("torch")

In [39]:
#Use default training arguments and assign to training_args to be used in Trainer instance 
training_args=TrainingArguments(output_dir="test_trainer")

In [40]:
#Split dataset into training and test splits so hf_split_dataset contains a train dataset of 900 examples and a test dataset of 100 examples

hf_split_dataset=hf_dataset.train_test_split(test_size=0.1, seed=42)

In [41]:
#Make two objects, the first variable 'pretask_train' to store the training subset from the split 
#The latter variable 'pretask_eval' to store the test subset from the split
pretask_train=hf_split_dataset['train']
pretask_eval=hf_split_dataset['test']

In [42]:
#Shuffle new subsets from previous cell and select 'integer' within range of both subsets in previous cell 
#This takes even smaller subset of data to finetune and evaluate on in order to speed up training process 

small_train_dataset=pretask_train.shuffle(seed=42).select(range(90))
small_eval_dataset=pretask_eval.shuffle(seed=42).select(range(90))

In [43]:
#load microsoft phi-1.5 transformer and specify 'num_labels' parameter it should expect which we found length of highlights_labels_df
#length of highlights_labels_df which contains all binary labels for highlights_tokenied has 155 columns 
#specify 155 for 'num_labels' parameter in 'from_pretrained' method 


phi_model= AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", from_tf=False, trust_remote_code=True, num_labels=155)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [46]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [48]:
#comment this out for now this is in the case I would like to manually train 
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [49]:
num_epochs=3
optimizer=AdamW(phi_model.parameters(), lr= 2e-4)
num_training_steps=num_epochs * len(train_dataloader)
lr_scheduler=get_scheduler(
 name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [50]:
trainer=Trainer(
    model=phi_model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,

)

In [51]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
phi_model.to(device)

MixFormerSequentialForCausalLM(
  (layers): Sequential(
    (0): Embedding(
      (wte): Embedding(51200, 2048)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (1): ParallelBlock(
      (ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (resid_dropout): Dropout(p=0.0, inplace=False)
      (mixer): MHA(
        (rotary_emb): RotaryEmbedding()
        (Wqkv): Linear(in_features=2048, out_features=6144, bias=True)
        (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
        (inner_attn): SelfAttention(
          (drop): Dropout(p=0.0, inplace=False)
        )
        (inner_cross_attn): CrossAttention(
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (mlp): MLP(
        (fc1): Linear(in_features=2048, out_features=8192, bias=True)
        (fc2): Linear(in_features=8192, out_features=2048, bias=True)
        (act): NewGELUActivation()
      )
    )
    (2): ParallelBlock(
      (ln): LayerNorm((2048,), eps=1e-05, elementwis

In [None]:
trainer.train()

***** Running training *****
  Num examples = 90
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 36
  Number of trainable parameters = 1418270720


`attention_mask` is not supported during training. Using it might lead to unexpected results.


In [None]:
##Now we must evaluate model performance on small evaluation dataset 
#Once finetuning/training is done phi_model is finetuned to understand keywords we want it to pay attention to in input sequence it processes 
#Now it's ready to generate emojis for it's main task by counting keyword presence in examples 
#then using the presence of certain key words to generate an emoji corresponding with the summary of the text 


In [104]:
#In the case with need to use the optimizer 
from torch.optim import AdamW