# Environment setup and imports

In [None]:
!pip install evaluate #install eval library

In [None]:
#importing libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
#hugging face
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer,TrainingArguments,pipeline)
from datasets import Dataset
import evaluate #for f1 and accuracy
#evaluation metrics
from sklearn.metrics import (accuracy_score,classification_report,confusion_matrix)

# Dataset loading and merging

In [None]:
# Read the four raw CSV exports straight from Google Drive.
# Drive has to be mounted first: on a fresh Colab runtime /content/drive does
# not exist yet and every read_csv below would raise FileNotFoundError.
from google.colab import drive

drive.mount("/content/drive")

depression = pd.read_csv("/content/drive/MyDrive/depoct21.csv")
anxiety1 = pd.read_csv("/content/drive/MyDrive/anxinov21.csv")
anxiety2 = pd.read_csv("/content/drive/MyDrive/anxioct21.csv")
neutral = pd.read_csv("/content/drive/MyDrive/reddit_wsb.csv")

In [None]:
# Upload the zipped CSV files from the local machine through Colab's upload dialog.
# Nothing is read here: the archives are unpacked and parsed in the next cell.
from google.colab import files
uploaded = files.upload()  # iterated below as (file name, raw bytes) pairs

Saving anxinov21.csv.zip to anxinov21.csv.zip
Saving anxioct21.csv.zip to anxioct21.csv.zip
Saving depoct21.csv.zip to depoct21.csv.zip
Saving reddit_wsb.csv.zip to reddit_wsb.csv.zip


In [None]:
# Load every uploaded zip archive and read the CSV stored inside it.
import io
import zipfile

dataframes = {}  # maps uploaded zip file name -> DataFrame parsed from that archive
for zip_name, zip_bytes in uploaded.items():  # loop through each uploaded file
    # Context managers guarantee the archive and the member handle are closed,
    # even if parsing fails half-way through.
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        # Pick the first real CSV member instead of blindly taking entry 0,
        # which can be a directory or a macOS "__MACOSX/" metadata file.
        csv_members = [
            member for member in archive.namelist()
            if member.lower().endswith(".csv") and not member.startswith("__MACOSX/")
        ]
        if not csv_members:
            raise ValueError(f"No CSV file found inside {zip_name}")
        csv_inside = csv_members[0]
        with archive.open(csv_inside) as csv_file:
            dataframes[zip_name] = pd.read_csv(csv_file)  # read the CSV into a pandas DataFrame
    print("Loaded:", zip_name, "->", csv_inside, "shape:", dataframes[zip_name].shape)


Loaded: anxinov21.csv.zip -> anxinov21.csv shape: (7391, 8)
Loaded: anxioct21.csv.zip -> anxioct21.csv shape: (7680, 8)
Loaded: depoct21.csv.zip -> depoct21.csv shape: (16116, 8)
Loaded: reddit_wsb.csv.zip -> reddit_wsb.csv shape: (53187, 8)


In [None]:
# Look up each frame in the upload dictionary by the name of the zip it came from
depression, anxiety1, anxiety2, neutral = (
    dataframes[zip_name]
    for zip_name in (
        'depoct21.csv.zip',
        'anxinov21.csv.zip',
        'anxioct21.csv.zip',
        'reddit_wsb.csv.zip',
    )
)
# The two anxiety exports are stacked row-wise into one frame;
# ignore_index=True renumbers the rows so index values are not repeated
anxiety_parts = [anxiety1, anxiety2]
anxiety = pd.concat(anxiety_parts, ignore_index=True)


Anxiety combined dataset shape: (15071, 8)
Depression dataset shape: (16116, 8)


# Dataset cleaning and (minimal) preprocessing
This section also includes initial exploratory analysis.

In [None]:
# Report (rows, columns) for the merged anxiety frame and the depression frame
for caption, frame in (
    ("Anxiety combined dataset shape:", anxiety),
    ("Depression dataset shape:", depression),
):
    print(caption, frame.shape)

In [None]:
# Tag every row with its class name; this column is the training target
for frame, class_name in ((depression, "depression"), (anxiety, "anxiety")):
    frame["label"] = class_name

In [None]:
# (rows, columns) of the raw neutral frame before any cleaning
neutral.shape

(53187, 8)

In [None]:
# Preview the raw neutral rows; the post text lives in 'body' and is NaN for several rows
neutral.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56


In [None]:
# Only the reddit post text ('body') is needed from the neutral data
neutral = neutral.loc[:, ['body']]

In [None]:
# Use the same text column name as the depression and anxiety frames
neutral = neutral.rename({'body': 'selftext'}, axis='columns')

In [None]:
# Drop every row that has no usable post text.
neutral = neutral.dropna(subset=['selftext'])  # rows where text is missing
stripped_text = neutral['selftext'].str.strip()
# Blank / whitespace-only bodies are removed, and so are the '[removed]' /
# '[deleted]' placeholders: they are filtered out of the depression and anxiety
# frames as well, so they must not end up labelled as neutral examples.
neutral = neutral[(stripped_text != "") & ~stripped_text.isin(['[removed]', '[deleted]'])]

In [None]:
# Row counts of the depression, anxiety and neutral frames at this point
print(len(depression), len(anxiety), len(neutral))

16116 15071 24738


In [None]:
# Downsample the neutral posts to keep the classes balanced: 13350 rows is close to
# the cleaned depression set (13332 rows per the recorded output further down).
NEUTRAL_SAMPLE_SIZE = 13350
NEUTRAL_SAMPLE_SEED = 42  # fixed seed so the same rows are drawn on every run
neutral_downsampled = neutral.sample(
    n=NEUTRAL_SAMPLE_SIZE,
    random_state=NEUTRAL_SAMPLE_SEED,
)

In [None]:
# NOTE(review): the recorded output shows 2 columns, but at this point the frame only
# holds 'selftext' ('label' is added in the next cell) - the cell was presumably run
# out of order; a fresh top-to-bottom run should show (13350, 1).
neutral_downsampled.shape #confirming the size of downsampled dataset

(13350, 2)

In [None]:
# Give the sampled posts their class name, like the depression and anxiety frames
neutral_downsampled = neutral_downsampled.assign(label='neutral')

In [None]:
# Preview the labelled depression rows; several 'selftext' values are the '[removed]' placeholder
depression.head()

Unnamed: 0.1,Unnamed: 0,author,created_utc,score,selftext,subreddit,title,timestamp,label
0,0,AnotherBluePlatypus,1635684905,1,[removed],depression,What is enthusiasm?,2021-10-31 23:55:05,depression
1,1,paper_samosa97,1635684815,1,i am just too tired…my body has started to res...,depression,pov: it has been 5 years,2021-10-31 23:53:35,depression
2,2,Emotional_Pea2059,1635683923,1,[removed],depression,Trauma left me emotionless help,2021-10-31 23:38:43,depression
3,3,TheMournfulLady,1635683277,1,Warning: ranty rant\n\n\nI was just trying to ...,depression,I couldn’t tell a creepy stranger to f*ck off ...,2021-10-31 23:27:57,depression
4,4,tinydog360,1635683257,1,[removed],depression,Forever alone and single,2021-10-31 23:27:37,depression


In [None]:
# Number of rows whose text is NaN (placeholder strings such as '[removed]' are not counted here)
depression['selftext'].isnull().sum()

np.int64(0)

In [None]:
# Keep only rows that still carry real post text
placeholder_texts = ['[removed]', '[deleted]']  # markers for removed / deleted posts
has_text = depression['selftext'].notna()  # text is not missing
is_placeholder = depression['selftext'].isin(placeholder_texts)
depression = depression[has_text & ~is_placeholder]

In [None]:
# Everything except the post text and its label is dropped
depression = depression.loc[:, ['selftext', 'label']]

In [None]:
# Quick check: only 'selftext' and 'label' should remain, with no '[removed]' rows
depression.head()

Unnamed: 0,selftext,label
1,i am just too tired…my body has started to res...,depression
3,Warning: ranty rant\n\n\nI was just trying to ...,depression
6,It's making me feel depressed and frustrated. ...,depression
8,I cry myself to sleep over a relationship that...,depression
9,I made my ex (39 F) feel like we ware just fri...,depression


In [None]:
# (rows, columns) of the depression frame after filtering and column selection
depression.shape #how many rows remain

(13332, 2)

In [None]:
# Keep only rows that still carry real post text
removed_markers = ['[removed]', '[deleted]']  # markers for removed / deleted posts
keep_row = anxiety['selftext'].notna() & ~anxiety['selftext'].isin(removed_markers)
anxiety = anxiety[keep_row]

In [None]:
# Per-column count of missing values that are left after filtering
anxiety.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
author,0
created_utc,0
score,0
selftext,0
subreddit,0
title,0
timestamp,0
label,0


In [None]:
# Reduce the frame to the post text and its label, then preview the first rows
anxiety = anxiety.loc[:, ['selftext', 'label']]
anxiety.head()

Unnamed: 0,selftext,label
0,"I'm not sure if it is anxiety or what, but lat...",anxiety
1,"Hi:)\n\nI’m not sure if this is anxiety, but I...",anxiety
2,I’m finishing my first semester at college in ...,anxiety
3,I’m just sitting in the airport now waiting fo...,anxiety
4,"I’m really freaking out. I don’t know why, but...",anxiety


In [None]:
# Stack the three labelled frames into a single dataset with a fresh row index
labelled_frames = [depression, anxiety, neutral_downsampled]
dataset = pd.concat(labelled_frames, ignore_index=True)

In [None]:
# Write the merged dataset to disk; index=False leaves the row index out of the file
output_path = 'dataset.csv'
dataset.to_csv(output_path, index=False)