# Exploring HuggingFace Topics

In [94]:
import pandas as pd

## Connect GoogleDrive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load CSV's

In [95]:
therapist_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/amooora/raw_data/text_labeled_bert.csv', keep_default_na=False)
therapist_df.head()

Unnamed: 0,label,score,real_label
0,0,0.44948,Time Up and Future Meetings
1,34,0.431342,Managing and Increasing Energy Levels
2,18,0.298963,Voices and Perception of Sound
3,14,0.176815,Struggles and Desires in Learning
4,31,0.607514,Expressions of Happiness and Joy


In [96]:
classifier_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/amooora/raw_data/topic_no_url.csv', keep_default_na=False)
classifier_df.head()

Unnamed: 0,label,real_label
0,7,Social Life
1,7,Social Life
2,7,Social Life
3,6,Entertainment
4,7,Social Life


## Grouping labels

### Therapist Model

In [97]:
# Counting labels %
label_counts = therapist_df["real_label"].value_counts(normalize=True, ascending=False)
label_counts

Unnamed: 0_level_0,proportion
real_label,Unnamed: 1_level_1
Expressions of Happiness and Joy,0.484553
Time Up and Future Meetings,0.173606
Managing and Increasing Energy Levels,0.080122
See and Understanding Conversations,0.048143
Dynamics of Meaningful Relationships,0.028092
Fear and Reflection on Aging,0.024489
Job Anxiety and Self-Reflection,0.023738
Drinking Habits and Concerns,0.023221
Struggles and Desires in Learning,0.022821
Voices and Perception of Sound,0.020385


In [98]:
# Define labels below threshold
labels_to_replace = label_counts[label_counts < 0.07].index
labels_to_replace

Index(['See and Understanding Conversations',
       'Dynamics of Meaningful Relationships', 'Fear and Reflection on Aging',
       'Job Anxiety and Self-Reflection', 'Drinking Habits and Concerns',
       'Struggles and Desires in Learning', 'Voices and Perception of Sound',
       'Revisiting the Past Together', 'Gender Roles and Relationships',
       'Struggles with Personal Change', 'Marriage Anxiety and Dependence',
       'Understanding Depression and Its Roots',
       'Pursuing Meaningful Personal Goals',
       'Therapy and Father Relationships',
       'Personal Growth and Decision-Making', 'Nurturing the inner child',
       'Open Conversation and Sharing',
       'Father-Child Relationships and Authority',
       'Self-Acceptance and Relationships',
       'Understanding and Confronting Fear',
       'Complex Mother-Sibling Relationships',
       'Possibilities and Potential Outcomes',
       'Exploring Emotional Hurt and Bitterness',
       'Desires and Disappointments', 

In [99]:
# new column for grouped labels
therapist_df["grouped_label"] = therapist_df["real_label"].apply(lambda x: "Other" if x in labels_to_replace else x)

In [100]:
therapist_df.head()

Unnamed: 0,label,score,real_label,grouped_label
0,0,0.44948,Time Up and Future Meetings,Time Up and Future Meetings
1,34,0.431342,Managing and Increasing Energy Levels,Managing and Increasing Energy Levels
2,18,0.298963,Voices and Perception of Sound,Other
3,14,0.176815,Struggles and Desires in Learning,Other
4,31,0.607514,Expressions of Happiness and Joy,Expressions of Happiness and Joy


## Grouping labels function

In [101]:
def group_labels_below(threshold, df):
  # Counting labels %
  label_counts = df["real_label"].value_counts(normalize=True, ascending=False)
  # Define labels below threshold
  labels_to_replace = label_counts[label_counts < threshold].index
  # new column for grouped labels
  df["grouped_label"] = df["real_label"].apply(lambda x: "Other" if x in labels_to_replace else x)
  return df


### Testing function for both DataFrame

#### Classifier DF

In [102]:
classifier_grouped_df = group_labels_below(0.03, classifier_df)
classifier_grouped_df.grouped_label.unique()

array(['Social Life', 'Entertainment', 'Literature', 'Other',
       'Home & Hobbies'], dtype=object)

#### Therapist DF

In [103]:
therapist_grouped_df = group_labels_below(0.03, therapist_df)
therapist_grouped_df.grouped_label.unique()

array(['Time Up and Future Meetings',
       'Managing and Increasing Energy Levels', 'Other',
       'Expressions of Happiness and Joy',
       'See and Understanding Conversations'], dtype=object)

## Save CSV

In [None]:
therapist_grouped_df.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/amooora/raw_data/text_labeled_bert_grouped.csv',
    index=False,
    columns=therapist_grouped_df.columns
)

In [None]:
classifier_grouped_df.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/amooora/raw_data/topic_no_url_grouped.csv',
    index=False,
    columns=classifier_grouped_df.columns
)

## Encoding grouped labels

In [119]:
classifier_encoded_df = pd.get_dummies(
    classifier_grouped_df,
    columns=['grouped_label'],
    dtype=int,
    prefix='general'
).drop(columns=['label', 'real_label'])
classifier_encoded_df = pd.concat([classifier_grouped_df, classifier_encoded_df], axis=1)
classifier_encoded_df.head()

Unnamed: 0,label,real_label,grouped_label,general_Entertainment,general_Home & Hobbies,general_Literature,general_Other,general_Social Life
0,7,Social Life,Social Life,0,0,0,0,1
1,7,Social Life,Social Life,0,0,0,0,1
2,7,Social Life,Social Life,0,0,0,0,1
3,6,Entertainment,Entertainment,1,0,0,0,0
4,7,Social Life,Social Life,0,0,0,0,1


In [109]:
therapist_encoded_df = pd.get_dummies(
    therapist_grouped_df,
    columns=['grouped_label'],
    dtype=int,
    prefix='therapist'
).drop(columns=['score', 'label', 'real_label'])
therapist_encoded_df = pd.concat([therapist_grouped_df, therapist_encoded_df], axis=1).drop(columns=['score'])
therapist_encoded_df.head()

Unnamed: 0,label,real_label,grouped_label,therapist_Expressions of Happiness and Joy,therapist_Managing and Increasing Energy Levels,therapist_Other,therapist_See and Understanding Conversations,therapist_Time Up and Future Meetings
0,0,Time Up and Future Meetings,Time Up and Future Meetings,0,0,0,0,1
1,34,Managing and Increasing Energy Levels,Managing and Increasing Energy Levels,0,1,0,0,0
2,18,Voices and Perception of Sound,Other,0,0,1,0,0
3,14,Struggles and Desires in Learning,Other,0,0,1,0,0
4,31,Expressions of Happiness and Joy,Expressions of Happiness and Joy,1,0,0,0,0


### Renaming columns

In [125]:
classifier_encoded_df = classifier_encoded_df.rename(
    columns={
        'label': 'general_label',
        'real_label': 'general_real_label',
        'grouped_label': 'general_grouped_label'
    }
)
classifier_encoded_df.head()

Unnamed: 0,general_label,general_real_label,general_grouped_label,general_Entertainment,general_Home & Hobbies,general_Literature,general_Other,general_Social Life
0,7,Social Life,Social Life,0,0,0,0,1
1,7,Social Life,Social Life,0,0,0,0,1
2,7,Social Life,Social Life,0,0,0,0,1
3,6,Entertainment,Entertainment,1,0,0,0,0
4,7,Social Life,Social Life,0,0,0,0,1


In [126]:
therapist_encoded_df = therapist_encoded_df.rename(
    columns={
        'label': 'therapis_label',
        'real_label': 'therapist_real_label',
        'grouped_label': 'therapist_grouped_label'
    }
)
therapist_encoded_df.head()

Unnamed: 0,therapis_label,therapist_real_label,therapist_grouped_label,therapist_Expressions of Happiness and Joy,therapist_Managing and Increasing Energy Levels,therapist_Other,therapist_See and Understanding Conversations,therapist_Time Up and Future Meetings
0,0,Time Up and Future Meetings,Time Up and Future Meetings,0,0,0,0,1
1,34,Managing and Increasing Energy Levels,Managing and Increasing Energy Levels,0,1,0,0,0
2,18,Voices and Perception of Sound,Other,0,0,1,0,0
3,14,Struggles and Desires in Learning,Other,0,0,1,0,0
4,31,Expressions of Happiness and Joy,Expressions of Happiness and Joy,1,0,0,0,0


## Combining all with text_and_topics

In [127]:
text_topics_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/amooora/raw_data/text_and_topics.csv', keep_default_na=False)

In [128]:
text_topics_df.head()

Unnamed: 0,combined_preprocessed,answered_percent,text_length,topic_0_from_five,topic_1_from_five,topic_2_from_five,topic_3_from_five,topic_4_from_five,topic_0_from_two,topic_1_from_two
0,would love think kind intellectual either dumb...,100.0,1565,0.9055,0.023574,0.023639,0.023778,0.023509,0.271588,0.728412
1,chef mean workaholic love cook regardless whet...,60.0,815,0.027683,0.339134,0.02759,0.578125,0.027468,0.446532,0.553468
2,im ashamed much write public text online date ...,90.0,3728,0.914891,0.021284,0.021307,0.021207,0.021311,0.280111,0.719889
3,work library go school read thing write old de...,70.0,330,0.045781,0.04562,0.817402,0.045673,0.045524,0.330528,0.669472
4,hey hows go currently vague profile know come ...,50.0,496,0.031946,0.031976,0.872147,0.032119,0.031812,0.450953,0.549047


In [129]:
text_topics_encoded_df = pd.concat([text_topics_df, classifier_encoded_df, therapist_encoded_df], axis=1)
text_topics_encoded_df.iloc[:,10:].head()

Unnamed: 0,general_label,general_real_label,general_grouped_label,general_Entertainment,general_Home & Hobbies,general_Literature,general_Other,general_Social Life,therapis_label,therapist_real_label,therapist_grouped_label,therapist_Expressions of Happiness and Joy,therapist_Managing and Increasing Energy Levels,therapist_Other,therapist_See and Understanding Conversations,therapist_Time Up and Future Meetings
0,7,Social Life,Social Life,0,0,0,0,1,0,Time Up and Future Meetings,Time Up and Future Meetings,0,0,0,0,1
1,7,Social Life,Social Life,0,0,0,0,1,34,Managing and Increasing Energy Levels,Managing and Increasing Energy Levels,0,1,0,0,0
2,7,Social Life,Social Life,0,0,0,0,1,18,Voices and Perception of Sound,Other,0,0,1,0,0
3,6,Entertainment,Entertainment,1,0,0,0,0,14,Struggles and Desires in Learning,Other,0,0,1,0,0
4,7,Social Life,Social Life,0,0,0,0,1,31,Expressions of Happiness and Joy,Expressions of Happiness and Joy,1,0,0,0,0


### Save to CSV

In [130]:
text_topics_encoded_df.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/amooora/raw_data/text_and_topics_classified_encoded.csv',
    index=False,
    columns=text_topics_encoded_df.columns
)

In [132]:
pd.read_csv('/content/drive/MyDrive/Colab Notebooks/amooora/raw_data/text_and_topics_classified_encoded.csv', keep_default_na=False).iloc[:,10:]

Unnamed: 0,general_label,general_real_label,general_grouped_label,general_Entertainment,general_Home & Hobbies,general_Literature,general_Other,general_Social Life,therapis_label,therapist_real_label,therapist_grouped_label,therapist_Expressions of Happiness and Joy,therapist_Managing and Increasing Energy Levels,therapist_Other,therapist_See and Understanding Conversations,therapist_Time Up and Future Meetings
0,7,Social Life,Social Life,0,0,0,0,1,0,Time Up and Future Meetings,Time Up and Future Meetings,0,0,0,0,1
1,7,Social Life,Social Life,0,0,0,0,1,34,Managing and Increasing Energy Levels,Managing and Increasing Energy Levels,0,1,0,0,0
2,7,Social Life,Social Life,0,0,0,0,1,18,Voices and Perception of Sound,Other,0,0,1,0,0
3,6,Entertainment,Entertainment,1,0,0,0,0,14,Struggles and Desires in Learning,Other,0,0,1,0,0
4,7,Social Life,Social Life,0,0,0,0,1,31,Expressions of Happiness and Joy,Expressions of Happiness and Joy,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,7,Social Life,Social Life,0,0,0,0,1,31,Expressions of Happiness and Joy,Expressions of Happiness and Joy,1,0,0,0,0
59942,7,Social Life,Social Life,0,0,0,0,1,0,Time Up and Future Meetings,Time Up and Future Meetings,0,0,0,0,1
59943,7,Social Life,Social Life,0,0,0,0,1,0,Time Up and Future Meetings,Time Up and Future Meetings,0,0,0,0,1
59944,7,Social Life,Social Life,0,0,0,0,1,33,Drinking Habits and Concerns,Other,0,0,1,0,0
