In [8]:
import pandas as pd
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# Load the model and tokenizer
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)

# Sample data
data = [
    "i feel a bit rude writing to an elderly gentleman to ask for gifts because i feel a bit greedy but what is christmas about if not mild greed;anger",
    "i need you i need someone i need to be protected and feel safe i am small now i find myself in a season of no words;joy",
    "i plan to share my everyday life stories traveling adventures inspirations and handmade creations with you and hope you will also feel inspired;joy",
    "i already have my christmas trees up i got two and am feeling festive which i m sure is spurring me to get started on this book;joy",
    "ive worn it once on its own with a little concealer and for the days im feeling brave but dont want to be pale then its perfect;joy",
    "i feel very strongly passionate about when some jerk off decides to poke and make fun of us;joy",
    "i was feeling so discouraged we are already robbing peter to pay paul to get our cow this year but we cant afford to not get the cow this way;sadness",
    "i was feeling listless from the need of new things something different;sadness",
    "i lost my special mind but don t worry i m still sane i just wanted you to feel what i felt while reading this book i don t know how many times it was said that sam was special but i can guarantee you it was many more times than what i used in that paragraph did i tell you she was special;joy",
    "i can t let go of that sad feeling that i want to be accepted here in this first home of mine;love"
]

# Prepare the data
df = pd.DataFrame(data, columns=['text'])
df['sex'] = None  # Placeholder for sex category
df['race'] = None  # Placeholder for race category

# Function to generate categories using GPT-Neo
# TODO to understand better how to generate things properly (maybe the example prompt is bad, also it doesnt get any starting category, maybe its a model problem idk)
def generate_category(text, category_type):     # category_type is the category we want to generate, in this case sex and race
    example_prompt = (
        f"Text: {text}\n"
        f"Category type: {category_type.capitalize()}\n"
        f"{category_type.capitalize()}:"
    )
    input_ids = tokenizer(example_prompt, return_tensors='pt').input_ids    # converts example_prompt string into input IDs that the model can process, output in pytorch tensor format
    attention_mask = tokenizer(example_prompt, return_tensors='pt').attention_mask
    output = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=10, pad_token_id=tokenizer.eos_token_id)   # generate text based on the input IDs and the max_tokens
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)   # converts the generated token IDs into human-readable string, doesnt include special tokens of the model
    # Debugging print statement
    print(f"Generated text for {category_type}: {generated_text}")

    try:
        category = generated_text.split(f'{category_type.capitalize()}:')[-1].strip().split()[0]    # splits the generated text at the point where the category type appears and takes the part after it (what?), then removes whitespaces and takes first word which is the category
    except IndexError:
        # Handle the case where the expected category is not found
        category = "unknown"

    return category

# Annotate data with sex and race categories
df['sex'] = df['text'].apply(lambda x: generate_category(x.split(';')[0], 'sex'))   # generates the sex category for all the dataset
df['race'] = df['text'].apply(lambda x: generate_category(x.split(';')[0], 'race')) # generates the race category for all the dataset

# Display the annotated data
df

# # Save the dataframe to visualize (doesnt work for now)
# import ace_tools as tools;
# tools.display_dataframe_to_user(name="Annotated Text Data with Sex and Race", dataframe=df)


Generated text for sex: Text: i feel a bit rude writing to an elderly gentleman to ask for gifts because i feel a bit greedy but what is christmas about if not mild greed
Category type: Sex
Sex: Male
Age: 65
Location: UK

Generated text for sex: Text: i need you i need someone i need to be protected and feel safe i am small now i find myself in a season of no words
Category type: Sex
Sex:

I am a very shy person and I
Generated text for sex: Text: i plan to share my everyday life stories traveling adventures inspirations and handmade creations with you and hope you will also feel inspired
Category type: Sex
Sex:

I am a very open minded person and
Generated text for sex: Text: i already have my christmas trees up i got two and am feeling festive which i m sure is spurring me to get started on this book
Category type: Sex
Sex: Male
Age: 18
Location: UK

Generated text for sex: Text: ive worn it once on its own with a little concealer and for the days im feeling brave but dont want to be

Unnamed: 0,text,sex,race
0,i feel a bit rude writing to an elderly gentle...,Male,Human
1,i need you i need someone i need to be protect...,I,Human
2,i plan to share my everyday life stories trave...,I,unknown
3,i already have my christmas trees up i got two...,Male,Human
4,ive worn it once on its own with a little conc...,I'm,I
5,i feel very strongly passionate about when som...,I,Human
6,i was feeling so discouraged we are already ro...,Male,unknown
7,i was feeling listless from the need of new th...,I,Human
8,i lost my special mind but don t worry i m sti...,Yes,Human
9,i can t let go of that sad feeling that i want...,Male,Human


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Training data
train_data = [
    ("i do feel insecure sometimes but who doesnt", "fear"),
    ("i highly recommend visiting on a wednesday if youre able because its less crowded so you get to ask the farmers more questions without feeling rude for holding up a line", "anger"),
    ("ive been missing him and feeling so restless at home thinking of him", "fear"),
    ("i posted on my facebook page earlier this week ive been feeling a little grumpy and out of sorts the past few days", "anger"),
    ("i start to feel emotional", "sadness"),
    ("i feel so cold a href http irish", "anger"),
    ("i feel like i m defective or something for not having baby fever", "sadness"),
    ("i feel more virtuous than when i eat veggies dipped in hummus", "joy"),
    ("i feel very honoured to be included in a magzine which prioritises health and clean living so highly im curious do any of you read magazines concerned with health and clean lifestyles such as the green parent", "joy"),
    ("i spent the last two weeks of school feeling miserable", "sadness"),
    ("im feeling very peaceful about our wedding again now after having", "joy")
]

# Testing data
test_data = [
    ("i feel like i could go into any situation and become successful because i ve been competing all my life explained schaub in an interview with the a href http bleacherreport", "joy"),
    ("i can t stop the anxiety i feel when i m alone when i ve got no distractions", "sadness"),
    ("im trying to feel out my house style now that im living on my own and have creative carte blanche", "joy"),
    ("i have tried to see what it would be like if i liked one of my girl friends but it has never really worked and i can only ever feel an emotional connection to them because they are my friends", "sadness"),
    ("i had every intention of doing more gardening this morning while it was still cool but i was just feeling so rotten", "sadness"),
    ("i have a good feeling about this so i am excited", "joy"),
    ("i feel like i am just starting to understand the blessings that come from being submissive to the will of the father", "sadness"),
    ("i think about the things ive said and the stuff i have done it makes me feel disgusted in myself when i should be making you happy and smile which i was far from doing", "anger"),
    ("i woke up yesterday monday morning feeling a little depressed", "sadness")
]

# Convert data to DataFrame
train_df = pd.DataFrame(train_data, columns=['text', 'category'])
test_df = pd.DataFrame(test_data, columns=['text', 'category'])

# Extract features and labels
X_train = train_df['text']
y_train = train_df['category']
X_test = test_df['text']
y_test = test_df['category']

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Logistic Regression classifier
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred = classifier.predict(X_test_tfidf)

# Evaluate overall model performance
print("Overall Performance:")
print(classification_report(y_test, y_pred))

# Function to evaluate subgroup performance
def evaluate_subgroup_performance(df, classifier, vectorizer, subgroup_col, subgroup_val):
    subgroup_data = df[df[subgroup_col] == subgroup_val]
    if len(subgroup_data) == 0:
        print(f"No data for subgroup: {subgroup_val}")
        return

    X_subgroup = subgroup_data['text']
    y_subgroup = subgroup_data['category']
    X_subgroup_tfidf = vectorizer.transform(X_subgroup)

    y_subgroup_pred = classifier.predict(X_subgroup_tfidf)

    print(f"Performance for Subgroup - {subgroup_col}: {subgroup_val}")
    print("Accuracy:", accuracy_score(y_subgroup, y_subgroup_pred))
    print("Precision:", precision_score(y_subgroup, y_subgroup_pred, average='weighted'))
    print("Recall:", recall_score(y_subgroup, y_subgroup_pred, average='weighted'))
    print("F1 Score:", f1_score(y_subgroup, y_subgroup_pred, average='weighted'))

# Evaluate performance for each category (emotion)
categories = train_df['category'].unique()
for category in categories:
    evaluate_subgroup_performance(train_df, classifier, vectorizer, 'category', category)
