In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from transformers import pipeline

In [2]:
from google.colab import drive
#drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Read in filtered data
file_path = '/content/drive/MyDrive/filtered_data.csv'
df = pd.read_csv(file_path)
df.shape

(26344, 49)

In [34]:
# Select relevant fields: other_moods, other_lifestyles, notes
text_df = pd.DataFrame(df, columns=['ID', 'userID', 'other_moods', 'other_lifestyles', 'notes'])

users_total = len(text_df['userID'].unique())

# Drop rows where relevant fields are NaN
text_df = text_df.dropna(how='all', subset=['other_moods', 'other_lifestyles', 'notes'])
print(text_df.shape)

users_text = len(text_df['userID'].unique())
print(users_total)
print(users_text)

print("Percentage of users that made text notes: ", round(users_text*100/users_total, 1), "%")

(6240, 5)
1932
1032
Percentage of users that made text notes:  53.4 %


In [11]:
# Ensure fields are strings
text_df["other_moods"] = text_df["other_moods"].astype(str)
text_df["other_lifestyles"] = text_df["other_lifestyles"].astype(str)
text_df["notes"] = text_df["notes"].astype(str)

# Combine text fields, drop individual text fields
text_df["text"] = text_df["other_moods"] + " " + text_df["other_lifestyles"] + " " + text_df["notes"]
text_df = text_df.drop(columns=['other_moods', 'other_lifestyles', 'notes'])

# Add a column for work related label
text_df['label'] = ''
text_df.shape

(6240, 4)

In [13]:
## Read in saved data
#file_path = '/content/drive/MyDrive/text_df.csv'
#text_df = pd.read_csv(file_path)
#text_df.shape

In [12]:
text_df.head()

Unnamed: 0,ID,userID,text,label
45,219,88,nan nan left si HOT pain,
81,276,113,nan nan cramp maybe triggered by bloating /gas...,
83,278,113,nan nan hung over but paracetamol wasn’t worki...,
89,288,113,nan nan drove to see parents and claire - lots...,
99,299,113,"nan nan feeling just so exhausted, eating to k...",


In [14]:
# filter for rows that have nan in label and classify these rows
text_df_unlabelled = text_df[text_df['label'].isna()]
text_df_unlabelled.shape

(0, 4)

In [15]:
# Load the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [16]:
unlabelled_texts = [
    "I was unable to work due to a severe headache.",
    "headache all day, busy day at work",
    "pain",
    "pain work",
    "I completed all my tasks without any issues.",
    "Feeling very ill and couldn't concentrate on work.",
    "Good day today",
    "Worked today",
    "Felt ill but stayed at work",
    "surgery",
    "Surgery",
    "sick day"
]

candidate_labels = ['yes', 'no']

# Classify each text
for text in unlabelled_texts:
    hypothesis_template = "The following is a subject’s self-report: {}. Are severe symptoms recorded that affected the subject's ability to work?"
    result = classifier(text, candidate_labels, hypothesis_template=hypothesis_template)
    label = result['labels'][0]  # The label with the highest score
    score = round(result['scores'][0], 2)  # The corresponding score
    print(f"Text: {text}\nLabel: {label}, Score:{score}\n")


Text: I was unable to work due to a severe headache.
Label: yes, Score:0.51

Text: headache all day, busy day at work
Label: yes, Score:0.56

Text: pain
Label: yes, Score:0.63

Text: pain work
Label: yes, Score:0.55

Text: I completed all my tasks without any issues.
Label: no, Score:0.65

Text: Feeling very ill and couldn't concentrate on work.
Label: yes, Score:0.55

Text: Good day today
Label: no, Score:0.65

Text: Worked today
Label: no, Score:0.68

Text: Felt ill but stayed at work
Label: no, Score:0.6

Text: surgery
Label: yes, Score:0.56

Text: Surgery
Label: yes, Score:0.53

Text: sick day
Label: yes, Score:0.6



In [None]:
# Loop through each text in 'text' column and update 'label'
for index, text in text_df['text'].items():
    hypothesis_template = "The following is a subject’s self-report: {}. Are severe symptoms recorded that affected the subject's ability to work?"
    result = classifier(text, candidate_labels, hypothesis_template=hypothesis_template)
    label = result['labels'][0]  # The label with the highest score
    print(f"{index} Text: {text}\nLabel: {label}\n")
    text_df.loc[index, 'label'] = label


In [18]:
# Save DataFrame to CSV in Google Drive
file_path = '/content/drive/My Drive/text_df.csv'
text_df.to_csv(file_path, index=False)

In [19]:
text_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
yes,3143
no,3097


In [20]:
label_counts = text_df.groupby('userID')['label'].value_counts()
label_counts = label_counts.reset_index()
label_counts.head()

Unnamed: 0,userID,label,count
0,2,yes,4
1,2,no,3
2,6,yes,1
3,88,yes,1
4,92,yes,2


In [22]:

label_counts_pv = label_counts.pivot_table(index='userID', columns='label', values='count', fill_value=0)
label_counts_pv.reset_index(inplace=True)
label_counts_pv.head()

label,userID,no,yes
0,2,3.0,4.0
1,6,0.0,1.0
2,88,0.0,1.0
3,92,1.0,2.0
4,93,0.0,4.0


In [36]:
# Count the number of users that report symptoms that affected work
yes_freq = label_counts_pv['yes'].value_counts().reset_index()
yes_freq.head()

Unnamed: 0,yes,count
0,0.0,379
1,1.0,251
2,2.0,150
3,3.0,54
4,4.0,44


In [43]:
note_takers = len(label_counts_pv)
print("Total note takers", note_takers)

zero_yes_users = yes_freq.loc[yes_freq['yes'] == 0.0, 'count'].values[0]
print("Users that did not report symptoms that affect work", zero_yes_users)

yes_users = note_takers - zero_yes_users
print("Users that did report symptoms affecting work at least once", yes_users)

print("Percentage of note takers that recorded symptoms that affect  ability to work: ", round(yes_users*100/users_total, 1), "%")

print("Percentage of total users recorded symptoms that affect  ability to work: ", round(yes_users*100/note_takers, 1), "%")


Total note takers 1032
Users that did not report symptoms that affect work 379
Users that did report symptoms affecting work at least once 653
Percentage of note takers that recorded symptoms that affect  ability to work:  33.8 %
Percentage of total users recorded symptoms that affect  ability to work:  63.3 %
