# Install and import necessary libraries to carry out the training

In [None]:
#Importing libraries
import os
import openai
import wandb
import pandas as pd
import json
import random
from openai import OpenAI
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import sklearn.metrics
from ratelimit import limits, sleep_and_retry
import matplotlib.pyplot as plt
import numpy as np

# Code to convert the corpus from Excel to JSONL (chat fine-tuning) format

In [None]:
# Export the annotated corpus as a JSONL chat dataset: one JSON object per
# spreadsheet row, each holding a system / user / assistant message triple.
df = pd.read_excel('CorpusTwitchVideogames2024Final.xlsx')

output_filename = "base_datos_emociones_T.jsonl"

# Spanish system prompt asking which emotion the text expresses and listing
# the six labels the model may answer with.
system_prompt = "¿Cual es la emoción del siguiente texto? Responde con 'Aprobación/Empatía/Confianza', 'Decepción/Tristeza', 'Desaprobación', 'Enfado/Ira', 'Indeterminado' o 'Interés/Anticipación/Hype'."

with open(output_filename, "w") as file:
    for _, row in df.iterrows():
        system_message = {"role": "system", "content": system_prompt}
        # The 'Text' column is the comment, 'Emotions' its annotated label.
        user_message = {"role": "user", "content": row['Text']}
        assistant_message = {"role": "assistant", "content": row['Emotions']}

        data = {"messages": [system_message, user_message, assistant_message]}

        # One serialized example per line (JSONL).
        file.write(f"{json.dumps(data)}\n")
        
# The system prompt "¿Cual es la emoción del siguiente texto?
# Responde con 'Aprobación/Empatía/Confianza', 'Decepción/Tristeza', 'Desaprobación', 'Enfado/Ira', 'Indeterminado' o 'Interés/Anticipación/Hype'"
# translates to: "What is the emotion of the following text?
# Respond with 'Approval/Empathy/Trust', 'Disappointment/Sadness', 'Disapproval', 'Anger/Frustration', 'Indeterminate' or 'Interest/Anticipation/Hype'."

# Database partitioning

In [None]:
def split_json(input_file, output_file_1, output_file_2, train_fraction=0.7, seed=None):
    """Randomly split a JSONL file into two JSONL files.

    Every non-blank line of ``input_file`` is parsed as one JSON record. The
    records are shuffled and the first ``int(total * train_fraction)`` of them
    are written to ``output_file_1``; the remainder goes to ``output_file_2``.

    Args:
        input_file: Path of the JSONL file to split.
        output_file_1: Path that receives the first (by default 70 %) part.
        output_file_2: Path that receives the remaining records.
        train_fraction: Share of records written to ``output_file_1``;
            must lie in ``[0, 1]``. Defaults to 0.7.
        seed: Optional seed for a private random generator, making the split
            reproducible. With ``None`` the global ``random`` state is used.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f"train_fraction must be between 0 and 1, got {train_fraction!r}")

    with open(input_file, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped instead of aborting the whole split with a decode error.
        data = [json.loads(line) for line in f if line.strip()]

    num_messages_1 = int(len(data) * train_fraction)

    # Shuffle a copy so the order in which the records were read stays intact.
    shuffled_data = data[:]
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled_data)

    partitions = (
        (output_file_1, shuffled_data[:num_messages_1]),
        (output_file_2, shuffled_data[num_messages_1:]),
    )
    for path, items in partitions:
        with open(path, 'w', encoding='utf-8') as out:
            for item in items:
                json.dump(item, out)
                out.write('\n')


# Split the full dataset into the training file (first output) and the
# held-out test file (second output).
split_json("base_datos_emociones_T.jsonl", "training_file_emociones_T.jsonl", "test_file_emociones_T.jsonl")

#Label "base_datos_emociones_T.jsonl" is emotions_database_T.jsonl

# Code to convert the test file from JSONL to Excel format

In [None]:
# Flatten the held-out test split into a two-column spreadsheet.
with open("test_file_emociones_T.jsonl", "r") as file:
    datos = list(map(json.loads, file))  # "datos" = data

# Each example stores its messages as [system, user, assistant]: index 1 is
# read as the comment text and index 2 as its emotion label.
conversaciones = [dato["messages"] for dato in datos]
textos = [mensajes[1]["content"] for mensajes in conversaciones]  # "textos" = texts
emociones = [mensajes[2]["content"] for mensajes in conversaciones]  # "emociones" = emotions

df = pd.DataFrame({"Texto": textos, "Emociones": emociones})
df.to_excel("datos_test_emociones_T.xlsx", index=False)

# Training data verification

In [None]:
import json
from collections import defaultdict

data_path = "training_file_emociones_T.jsonl"

# Read the training split: one JSON-encoded chat example per line.
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = list(map(json.loads, f))

print("Num examples:", len(dataset))

# Tally of every kind of formatting problem found, keyed by error name.
format_errors = defaultdict(int)

# Keys and roles that a message is allowed to carry.
KNOWN_MESSAGE_KEYS = ("role", "content", "name", "function_call")
KNOWN_ROLES = ("system", "user", "assistant", "function")

for ex in dataset:
    # Each example must be a JSON object ...
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # ... holding a non-empty "messages" list.
    messages = ex.get("messages")
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    has_assistant = False
    for message in messages:
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        if not all(key in KNOWN_MESSAGE_KEYS for key in message):
            format_errors["message_unrecognized_key"] += 1

        role = message.get("role")
        if role not in KNOWN_ROLES:
            format_errors["unrecognized_role"] += 1

        content = message.get("content")
        function_call = message.get("function_call")
        # A message needs a string content, and either that content or a
        # function call must be non-empty.
        if not ((content or function_call) and isinstance(content, str)):
            format_errors["missing_content"] += 1

        has_assistant = has_assistant or role == "assistant"

    # Every example needs at least one assistant message.
    if not has_assistant:
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for error_name, count in format_errors.items():
        print(f"{error_name}: {count}")

In [None]:
import tiktoken
# Tokenizer shared by the token-counting helpers defined below.
# NOTE(review): "cl100k_base" is presumably the encoding that matches the
# gpt-3.5-turbo model being fine-tuned — confirm against tiktoken's model table.
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Estimate the number of tokens in one chat conversation.

    Each message contributes ``tokens_per_message`` plus the encoded length of
    every value it holds; a message with a "name" key adds ``tokens_per_name``.
    A constant 3 is added once per conversation.

    Args:
        messages: Iterable of message dicts whose values are strings.
        tokens_per_message: Fixed overhead counted for every message.
        tokens_per_name: Extra overhead for a message that has a "name" key.

    Returns:
        The estimated token count (approximate, not exact).
    """
    total = 0
    for message in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(value)) for value in message.values())
        if "name" in message:
            total += tokens_per_name
    # Constant per-conversation overhead (presumably the reply priming
    # described in the OpenAI cookbook — confirm).
    return total + 3

def num_assistant_tokens_from_messages(messages):
    """Return the total encoded length of all assistant messages.

    Args:
        messages: Iterable of message dicts with "role" and "content" keys.

    Returns:
        Sum of the token counts of the "content" of every message whose
        role is "assistant" (0 when there is none).
    """
    return sum(
        len(encoding.encode(message["content"]))
        for message in messages
        if message["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics of a numeric sample.

    Prints the minimum / maximum, the mean / median and the 5th / 95th
    percentiles of ``values`` under a heading that contains ``name``.

    Args:
        values: Non-empty sequence of numbers.
        name: Label shown in the heading line.
    """
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The line is labelled p5 / p95, so compute exactly those quantiles
    # (the previous 0.1 / 0.9 were the 10th / 90th percentiles).
    p5 = np.quantile(values, 0.05)
    p95 = np.quantile(values, 0.95)
    print(f"p5 / p95: {p5}, {p95}")


In [None]:
#Warnings and tokens counts
# Message list of every training example, in dataset order.
conversations = [ex["messages"] for ex in dataset]

# Examples that contain no system / no user message at all.
n_missing_system = sum(
    1 for msgs in conversations if all(m["role"] != "system" for m in msgs)
)
n_missing_user = sum(
    1 for msgs in conversations if all(m["role"] != "user" for m in msgs)
)

# Per-example statistics: message count, total tokens, assistant tokens.
n_messages = [len(msgs) for msgs in conversations]
convo_lens = [num_tokens_from_messages(msgs) for msgs in conversations]
assistant_message_lens = [num_assistant_tokens_from_messages(msgs) for msgs in conversations]

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")

# Count the conversations longer than 4096 tokens.
n_too_long = sum(1 for length in convo_lens if length > 4096)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

In [None]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
TARGET_EPOCHS = 3
MIN_EPOCHS = 1
MAX_EPOCHS = 25

n_train_examples = len(dataset)

# Number of examples processed when training for TARGET_EPOCHS epochs. When
# that falls outside [MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES] the epoch count
# is raised / lowered, clamped to MAX_EPOCHS / MIN_EPOCHS respectively.
examples_at_target = n_train_examples * TARGET_EPOCHS
if examples_at_target < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif examples_at_target > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example is counted for at most MAX_TOKENS_PER_EXAMPLE tokens.
n_billing_tokens_in_dataset = sum(
    min(length, MAX_TOKENS_PER_EXAMPLE) for length in convo_lens
)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")


# Fine-tuning process execution

In [None]:
#API key (private API key of the user)
# Read the key once from the OPENAI_API_KEY environment variable when it is
# set, so the secret does not have to be pasted into (and shared with) the
# notebook. Otherwise fall back to the placeholder, which must be replaced
# with the user's own key.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", 'XXXXXXXXXX')
openai.api_key = OPENAI_API_KEY

#Start wandb
wandb.init(project="Emotional Analysis for Twitch")

#Definition of the client (same key as above, defined in a single place)
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Upload the training split to OpenAI for fine-tuning. The file is opened in
# a context manager so the handle is closed once the upload call returns
# (it was previously left open).
with open("training_file_emociones_T.jsonl", "rb") as training_fh:
    training_file = client.files.create(
        file=training_fh,
        purpose='fine-tune'
    )

# Identifier of the uploaded file, needed to create the fine-tuning job.
training_file_id = training_file.id
print("File has been uploaded to OpenAI with id ", training_file_id)

In [None]:
# Training hyperparameters for the fine-tuning job.
ft_hyperparameters = {"n_epochs": 3}

# Report the job to the Weights & Biases project of the active run.
wandb_integration = {
    "type": "wandb",
    "wandb": {
        "project": wandb.run.project,
        "name": "mental-health-model-Twitch",
    },
}

# Create the fine-tuning job on top of the base chat model.
ft_job = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    model="gpt-3.5-turbo",
    hyperparameters=ft_hyperparameters,
    integrations=[wandb_integration],
)

# NOTE(review): this stores the *job* id under the name `model_id`; it is
# presumably not a usable model name (the evaluation cell assigns the real
# "ft:..." model name before any request is made) — confirm.
model_id = ft_job.id
print("Fine Tune Job has been created with id ", ft_job.id)


# Fine-tuned model verification

In [None]:
# NOTE(review): despite the "TPM" name, this value is passed as `calls=` to
# `limits`, i.e. it caps the number of calls per period — confirm intent.
RATE_LIMIT_TPM = 60000

# System prompt sent with every request (Spanish). It asks for the emotion of
# the following text and lists the six labels the model may answer with.
PROMPT_SISTEMA_EMOCIONES = "¿Cual es la emoción del siguiente texto? Responde con 'Aprobación/Empatía/Confianza', 'Decepción/Tristeza', 'Desaprobación', 'Enfado/Ira', 'Indeterminado' o 'Interés/Anticipación/Hype'."


# "realizar_solicitud(texto)" = make_request(text)
@sleep_and_retry
@limits(calls=RATE_LIMIT_TPM, period=60)  # 60 seconds in a minute
def realizar_solicitud(texto):
    """Ask the model for the emotion label of one text.

    Uses the module-level ``client`` and the model named by the module-level
    ``model_id`` at call time.

    Args:
        texto: The comment to classify; it is converted with ``str()``.

    Returns:
        The content of the first completion choice, unmodified.
    """
    mensajes = [
        {"role": "system", "content": PROMPT_SISTEMA_EMOCIONES},
        {"role": "user", "content": str(texto)},
    ]
    completion = client.chat.completions.create(model=model_id, messages=mensajes)
    primera_opcion = completion.choices[0]
    return primera_opcion.message.content

In [None]:
# Evaluate the fine-tuned model on the held-out test spreadsheet and log the
# per-comment results to Weights & Biases.
filename = 'datos_test_emociones_T.xlsx'
df = pd.read_excel(filename)

# Name of the fine-tuned model used by realizar_solicitud().
model_id = "ft:gpt-3.5-turbo-0125:personal::9cAb2PmF"

# "sentimiento_clases" = emotion classes:
#   "Aprobación/Empatía/Confianza" = Approval/Empathy/Trust
#   "Decepción/Tristeza"           = Disappointment/Sadness
#   "Desaprobación"                = Disapproval
#   "Enfado/Ira"                   = Anger/Frustration
#   "Indeterminado"                = Indeterminate
#   "Interés/Anticipación/Hype"    = Interest/Anticipation/Hype
sentimiento_clases = ["Aprobación/Empatía/Confianza", "Decepción/Tristeza", "Desaprobación", "Enfado/Ira", "Indeterminado", "Interés/Anticipación/Hype"]

predicciones = []  # "predicciones" = predictions
etiquetas_verdaderas = []  # "etiquetas_verdaderas" = true labels

# Result table columns: "Comentario" = Comment, "Predicción" = Prediction,
# "Etiqueta Verdadera" = True Label.
columnas_resultados = ["Comentario", "Predicción", "Etiqueta Verdadera"]
# Rows are collected in a list and turned into a DataFrame once, after the
# loop, instead of re-copying the whole frame with pd.concat on every row.
filas_resultados = []

for index, row in df.iterrows():
    texto = row['Texto']  # "texto" = text

    # Only the API request is best-effort: a failed request is reported and
    # the row is skipped.
    try:
        response = realizar_solicitud(texto)
    except Exception as e:
        print("Error:", e)
        continue

    # Ignore surrounding whitespace, as predict_emotions_Twitch() does, so a
    # stray space or newline does not discard an otherwise valid answer.
    if isinstance(response, str):
        response = response.strip()

    # Answers outside the known label set are reported and left out of the
    # evaluation.
    if response not in sentimiento_clases:
        print(f"Respuesta inesperada en la fila {index}: {response}")
        continue

    true_label = row['Emociones']

    predicciones.append(response)
    etiquetas_verdaderas.append(true_label)
    filas_resultados.append(
        {"Comentario": texto, "Predicción": response, "Etiqueta Verdadera": true_label}
    )

df_resultados = pd.DataFrame(filas_resultados, columns=columnas_resultados)

wandb.log({"predicciones_vs_etiquetas_verdaderas": wandb.Table(dataframe=df_resultados)})


### Classification report

In [None]:
# Build the classification report comparing the true labels with the model
# predictions collected during evaluation, then print it.
report = classification_report(etiquetas_verdaderas, predicciones)
print("Reporte de clasificación Twitch:")  # "Twitch classification report"
print(report)

In [None]:
# Code to upload the classification report to wandb
# Non-blank lines of the text report.
lines = [line for line in report.split('\n') if line.strip()]

# Keep the rows made of exactly five whitespace-separated fields
# (class, precision, recall, f1-score, support). Rows that split into another
# number of fields are left out; a label containing whitespace would also be
# dropped for that reason.
table_data = [
    fields
    for fields in (line.split() for line in lines)
    if len(fields) == 5 and fields[0] != 'accuracy'
]

# "Clase" = Class, "Soporte" = Support
columns = ["Clase", "Precision", "Recall", "F1-score", "Soporte"]

wandb.log({"classification_report": wandb.Table(data=table_data, columns=columns)})

### Confusion Matrix

In [None]:
#Confusion matrix
# Rows are the true labels and columns the predictions, both ordered as in
# sentimiento_clases.
matrix = confusion_matrix(etiquetas_verdaderas, predicciones, labels=sentimiento_clases)

# Print the confusion matrix
print("Confusion Matrix:")
print(matrix)

# Display object for the matrix (it was previously used without ever being
# created, which raised a NameError).
vis = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=sentimiento_clases)

fig, ax = plt.subplots(figsize=(10, 8))
vis.plot(ax=ax, cmap=plt.cm.Blues)

#Adjust spacing between names on x-axis
ax.set_xticklabels(sentimiento_clases, rotation=45, ha="right")

plt.title("CONFUSION MATRIX")
plt.xlabel("Prediction")
plt.ylabel("True Label")  # closing quote was missing (SyntaxError)
plt.tight_layout() # Adjust the design to avoid overlaps

# Save the figure as an image file
img_path = "confusion_matrix.png"
plt.savefig(img_path)

# Upload the image to wandb
wandb.log({"confusion_matrix_emociones_Twitch": wandb.Image(img_path)})

plt.show()

In [None]:
# Close the Weights & Biases run started with wandb.init().
wandb.finish()

# Example of using the OpenAI model 

In [None]:
# One-off request to the fine-tuned model. The system prompt (Spanish) asks
# for the emotion of the following text and lists the six allowed labels.
example_messages = [
    {"role": "system", "content": "¿Cual es la emoción del siguiente texto? Responde con 'Aprobación/Empatía/Confianza', 'Decepción/Tristeza', 'Desaprobación', 'Enfado/Ira', 'Indeterminado' o 'Interés/Anticipación/Hype'."},
    # The user content is left empty in this template.
    {"role": "user", "content": ""},
]

completation = client.chat.completions.create(
    model="ft:gpt-3.5-turbo-0125:personal::9cAb2PmF",
    messages=example_messages,
)

# Show the label returned in the first choice.
print(completation.choices[0].message.content)

# Prediction functions

In [None]:
#Function predict emotions
def predict_emotions_Twitch(text, model="ft:gpt-3.5-turbo-0125:personal::9cAb2PmF"):
    """Predict the emotion label of one text with the fine-tuned model.

    Uses the module-level ``client``.

    Args:
        text: The text to classify. It is converted with ``str()`` (as
            ``realizar_solicitud`` does), so non-string spreadsheet cells do
            not make the request fail.
        model: Name of the chat model to query; defaults to the fine-tuned
            Twitch emotion model.

    Returns:
        The model's answer with surrounding whitespace removed, or an empty
        string when the completion carries no content.
    """
    # The system prompt is kept identical, character for character, to the one
    # used to build the training data and in the evaluation request
    # ("¿Cual", without accent).
    system_prompt = "¿Cual es la emoción del siguiente texto? Responde con 'Aprobación/Empatía/Confianza', 'Decepción/Tristeza', 'Desaprobación', 'Enfado/Ira', 'Indeterminado' o 'Interés/Anticipación/Hype'."
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": str(text)},
        ]
    )
    content = completion.choices[0].message.content
    # Guard against a completion without content instead of raising on
    # None.strip().
    return content.strip() if content is not None else ""

#Example
input_text = ""  # text to classify
# Call the function under its actual name: predict_emotions_Twitch (the
# previous call to predict_emotions raised a NameError).
emocion_predicha = predict_emotions_Twitch(input_text)  # "emocion_predicha" = predicted emotion
print(emocion_predicha)

def predict_emotions_excel_Twitch(input_file, output_file, text_column):
    """Predict the emotion of every text in an Excel file.

    Reads ``input_file``, classifies each value of ``text_column`` with
    ``predict_emotions_Twitch``, stores the answers in an 'Emotion' column and
    writes the table (without the index) to ``output_file``. The output path
    is printed at the end.

    Args:
        input_file: Path of the Excel file to read.
        output_file: Path of the Excel file to write.
        text_column: Name of the column holding the texts to classify.
    """
    source = pd.read_excel(input_file)

    # One prediction per row, in row order.
    predictions = source[text_column].map(predict_emotions_Twitch)
    result = source.assign(Emotion=predictions)
    result.to_excel(output_file, index=False)

    print(output_file)
    
#Example
input_file="" #Path of the input Excel file
output_file="" #Path of the output Excel file where all the predictions will be written
text_column="" #Name of the column that contains the texts you want to classify

# Classify every text of the chosen column and write the result.
predict_emotions_excel_Twitch(input_file, output_file, text_column)