In [1]:
import pandas

## Data Cleaning Utils

In [2]:
def normalize_colnames(df: pandas.DataFrame) -> pandas.DataFrame:
    rename_dict: dict[str, str] = {}

    for col in df.columns:
        if "text" in col:
            rename_dict[col] = "text"
        elif "label" in col:
            rename_dict[col] = "label"

    df = df.rename(columns=rename_dict)
    return df

In [3]:
def clean_colnames(df: pandas.DataFrame) -> pandas.DataFrame:
    clean_cols: list[str] = []

    for col in df.columns:
        clean_cols.append(col.lower().replace(" ", "_"))

    df.columns = clean_cols
    return df

In [4]:
def remove_ids(df: pandas.DataFrame) -> pandas.DataFrame:
    id_cols: list[str] = []

    for col in df.columns:
        if "id" in col:
            id_cols.append(col)

    df = df.drop(columns=id_cols)
    return df


## Colab-specific Utils

In [5]:
from google.colab import files

In [6]:
!mkdir -p data

In [None]:
uploaded: dict[str, bytes] = files.upload("data")

## DEPSEV Dataset

In [8]:
depsev: pandas.DataFrame = pandas.read_csv("data/depsev.csv")

In [9]:
depsev.shape

(3553, 2)

In [10]:
depsev.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3553 entries, 0 to 3552
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3553 non-null   object
 1   label   3553 non-null   object
dtypes: object(2)
memory usage: 2.3 MB


In [11]:
depsev.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
minimum,2587
moderate,394
mild,290
severe,282


In [12]:
minimum: pandas.DataFrame = depsev[depsev.label == "minimum"]
minimum.head()

Unnamed: 0,text,label
1,"Hey there r/assistance, Not sure if this is th...",minimum
2,My mom then hit me with the newspaper and it s...,minimum
6,It was a big company so luckily I didn't have ...,minimum
10,Next week I’ll be flying for our family vacati...,minimum
13,"No place in my city has shelter space for us, ...",minimum


In [13]:
label_map: dict[str, str] = {
    "minimum": "not depression",
    "mild": "moderate"
}

depsev.label = depsev.label.replace(label_map)
depsev.label.unique()

array(['moderate', 'not depression', 'severe'], dtype=object)

In [14]:
depsev.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
not depression,2587
moderate,684
severe,282


## DEPSIGN Dataset

In [15]:
depsign_1: pandas.DataFrame = pandas.read_csv("data/depsign_1.tsv", sep="\t")
depsign_2: pandas.DataFrame = pandas.read_csv("data/depsign_2.tsv", sep="\t")
depsign_3: pandas.DataFrame = pandas.read_csv("data/depsign_3.tsv", sep="\t")

### DEPSIGN 1

In [16]:
depsign_1.shape

(4496, 3)

In [17]:
depsign_1.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4496 entries, 0 to 4495
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   PID        4496 non-null   object
 1   Text data  4496 non-null   object
 2   Label      4496 non-null   object
dtypes: object(3)
memory usage: 6.1 MB


In [18]:
depsign_1.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
moderate,2306
not depression,1830
severe,360


In [19]:
depsign_1 = clean_colnames(depsign_1)
depsign_1.columns

Index(['pid', 'text_data', 'label'], dtype='object')

In [20]:
depsign_1 = normalize_colnames(depsign_1)
depsign_1.columns

Index(['pid', 'text', 'label'], dtype='object')

In [21]:
depsign_1 = remove_ids(depsign_1)
depsign_1.columns

Index(['text', 'label'], dtype='object')

### DEPSIGN 2

In [22]:
depsign_2.shape

(3245, 3)

In [23]:
depsign_2.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3245 entries, 0 to 3244
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Pid           3245 non-null   object
 1   text data     3245 non-null   object
 2   Class labels  3245 non-null   object
dtypes: object(3)
memory usage: 4.4 MB


In [24]:
depsign_2["Class labels"].value_counts()

Unnamed: 0_level_0,count
Class labels,Unnamed: 1_level_1
moderate,2169
not depression,848
severe,228


In [25]:
depsign_2 = clean_colnames(depsign_2)
depsign_2.columns

Index(['pid', 'text_data', 'class_labels'], dtype='object')

In [26]:
depsign_2 = normalize_colnames(depsign_2)
depsign_2.columns

Index(['pid', 'text', 'label'], dtype='object')

In [27]:
depsign_2 = remove_ids(depsign_2)
depsign_2.columns

Index(['text', 'label'], dtype='object')

### DEPSIGN 3

In [28]:
depsign_3.shape

(8891, 3)

In [29]:
depsign_3.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8891 entries, 0 to 8890
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   PID        8891 non-null   object
 1   Text_data  8891 non-null   object
 2   Label      8891 non-null   object
dtypes: object(3)
memory usage: 10.7 MB


In [30]:
depsign_3.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
moderate,6019
not depression,1971
severe,901


In [31]:
depsign_3 = clean_colnames(depsign_3)
depsign_3.columns

Index(['pid', 'text_data', 'label'], dtype='object')

In [32]:
depsign_3 = normalize_colnames(depsign_3)
depsign_3.columns

Index(['pid', 'text', 'label'], dtype='object')

In [33]:
depsign_3 = remove_ids(depsign_3)
depsign_3.columns

Index(['text', 'label'], dtype='object')

### Merge DEPSIGN Datasets

In [34]:
depsign: pandas.DataFrame = pandas.concat([depsign_1, depsign_2, depsign_3], ignore_index=True)

In [35]:
depsign.shape

(16632, 2)

In [36]:
depsign.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16632 entries, 0 to 16631
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16632 non-null  object
 1   label   16632 non-null  object
dtypes: object(2)
memory usage: 20.1 MB


In [37]:
depsign.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
moderate,10494
not depression,4649
severe,1489


In [38]:
depsign.to_csv("data/depsign.csv", index=False)

## Create the Final Dataset

In [39]:
depset: pandas.DataFrame = pandas.concat([depsev, depsign], ignore_index=True)

In [40]:
depset.shape

(20185, 2)

In [41]:
depset.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20185 entries, 0 to 20184
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    20185 non-null  object
 1   label   20185 non-null  object
dtypes: object(2)
memory usage: 22.6 MB


In [42]:
depset.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
moderate,11178
not depression,7236
severe,1771


In [43]:
depset.to_csv("data/depset.csv", index=False)

## Final Cleaning Steps

### Check for NULL Values

In [None]:
depset.isnull().sum()

Unnamed: 0,0
text,0
label,0


### Check for Duplicate Values

In [None]:
depset.duplicated(subset=["text"]).sum()

np.int64(6417)

In [None]:
duplicates = depset[depset.duplicated(subset=["text"])]
duplicates.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
moderate,4740
not depression,1124
severe,553


#### Drop Duplicate Values

In [None]:
depset = depset.drop_duplicates(subset=["text"], keep="first")

In [None]:
depset.duplicated(subset=["text"]).sum()

np.int64(0)

In [None]:
depset.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
moderate,6438
not depression,6112
severe,1218


## Preprocessing

In [None]:
!pip install contractions

In [None]:
import contractions
import re

In [None]:
def preprocessing(text: str) -> str:
    text = text.lower().strip()
    text = contractions.fix(text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub("\n", "", text)
    text = re.sub(r"([?!,+=—&%\'\";:|\(\){}\[\]/])", r" \1 ", text)
    text = re.sub("[ ]{2,}", " ", text).strip()

    return text

In [None]:
depset.text = depset.text.apply(preprocessing)

## Model Training

### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = depset.text
y = depset.label

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19, stratify=y)

### Class Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
encoder = LabelEncoder()

y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

In [None]:
encoder.classes_

array(['moderate', 'not depression', 'severe'], dtype=object)

In [None]:
y_train_encoded[:10]

array([1, 0, 1, 1, 0, 0, 0, 1, 0, 0])

### Cross-Validation Strategy

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=19)

### Class Weights

In [None]:
import numpy as np

In [None]:
from sklearn.utils.class_weight import compute_class_weight

In [None]:
class_weights = compute_class_weight("balanced", classes=np.unique(y_train_encoded), y=y_train_encoded)
class_weights

array([0.71288026, 0.75078391, 3.76933607])

In [None]:
class_weights_dict = dict(zip(np.unique(y_train_encoded), class_weights))
class_weights_dict

{np.int64(0): np.float64(0.7128802588996763),
 np.int64(1): np.float64(0.7507839127471029),
 np.int64(2): np.float64(3.7693360711841204)}

### Machine Learning Models

#### TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), max_df=0.95, min_df=5)

In [None]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

#### Sentence Embeddings

In [None]:
!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
X_train_embeddings = model.encode(X_train.tolist())
X_test_embeddings = model.encode(X_test.tolist())