In [None]:
import pandas

## Data Cleaning Utils

In [None]:
def normalize_colnames(df: pandas.DataFrame) -> pandas.DataFrame:
    """Rename text-like and label-like columns to the shared names "text" / "label".

    A column whose name contains the substring "text" is renamed to "text";
    otherwise, a column whose name contains "label" is renamed to "label".
    The check is case-sensitive, and "text" takes precedence when a name
    contains both substrings. All other columns keep their names.

    Args:
        df: Frame whose columns should be mapped onto the shared names.

    Returns:
        The frame produced by ``df.rename`` with the mapping applied.
    """
    # Build the old-name -> new-name mapping in one pass; only columns that
    # match one of the two substrings get an entry.
    mapping = {
        name: ("text" if "text" in name else "label")
        for name in df.columns
        if "text" in name or "label" in name
    }
    return df.rename(columns=mapping)

In [None]:
def clean_colnames(df: pandas.DataFrame) -> pandas.DataFrame:
    """Return a new frame with lower-cased, underscore-separated column names.

    Each string column label is lower-cased and every space in it is replaced
    by an underscore (``"Text data"`` -> ``"text_data"``). Labels that are not
    strings are left unchanged.

    The input frame is not modified, so a frame that was displayed or is
    reused elsewhere keeps its original headers.

    Args:
        df: Frame whose column labels should be cleaned.

    Returns:
        A new DataFrame with the same data and the cleaned column labels.
    """

    def _clean(label):
        # Only strings have .lower()/.replace(); pass other label types through.
        if isinstance(label, str):
            return label.lower().replace(" ", "_")
        return label

    # rename() produces a new frame; assigning to df.columns instead would
    # also rewrite the headers of the caller's object.
    return df.rename(columns=_clean)

In [None]:
def remove_ids(df: pandas.DataFrame) -> pandas.DataFrame:
    """Drop every column whose name contains the substring "id".

    The match is a plain, case-sensitive substring test, so it removes
    "pid" but also any other name that happens to contain "id"
    (for example "width").

    Args:
        df: Frame to remove identifier columns from.

    Returns:
        The frame produced by ``df.drop`` without the matching columns.
    """
    to_drop = [name for name in df.columns if "id" in name]
    return df.drop(columns=to_drop)


## DEPSEV Dataset

In [None]:
# Load the DEPSEV dataset (comma-separated); the path is relative, so it is
# resolved against the notebook's current working directory.
depsev: pandas.DataFrame = pandas.read_csv("data/depsev.csv")

In [None]:
depsev.shape

(3553, 2)

In [None]:
depsev.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3553 entries, 0 to 3552
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3553 non-null   object
 1   label   3553 non-null   object
dtypes: object(2)
memory usage: 2.3 MB


In [None]:
depsev.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
minimum,2587
moderate,394
mild,290
severe,282


## DEPSIGN Dataset

In [None]:
# Load the three DEPSIGN files. They are tab-separated, hence sep="\t".
# Each one is kept in its own frame so it can be inspected and cleaned
# separately before being merged.
depsign_1: pandas.DataFrame = pandas.read_csv("data/depsign_1.tsv", sep="\t")
depsign_2: pandas.DataFrame = pandas.read_csv("data/depsign_2.tsv", sep="\t")
depsign_3: pandas.DataFrame = pandas.read_csv("data/depsign_3.tsv", sep="\t")

### DEPSIGN 1

In [None]:
depsign_1.shape

(4496, 3)

In [None]:
depsign_1.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4496 entries, 0 to 4495
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   PID        4496 non-null   object
 1   Text data  4496 non-null   object
 2   Label      4496 non-null   object
dtypes: object(3)
memory usage: 6.1 MB


In [None]:
depsign_1.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
moderate,2306
not depression,1830
severe,360


In [None]:
# Lower-case the headers and replace spaces with underscores first:
# normalize_colnames and remove_ids match lowercase substrings
# ("text", "label", "id"), so they would miss the raw headers
# "PID", "Text data" and "Label".
depsign_1 = clean_colnames(depsign_1)
depsign_1.columns

Index(['pid', 'text_data', 'label'], dtype='object')

In [None]:
# Map the dataset-specific header "text_data" onto the shared name "text"
# ("label" already has the shared name).
depsign_1 = normalize_colnames(depsign_1)
depsign_1.columns

Index(['pid', 'text', 'label'], dtype='object')

In [None]:
# Drop the identifier column ("pid") so only "text" and "label" remain.
depsign_1 = remove_ids(depsign_1)
depsign_1.columns

Index(['text', 'label'], dtype='object')

### DEPSIGN 2

In [None]:
depsign_2.shape

(3245, 3)

In [None]:
depsign_2.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3245 entries, 0 to 3244
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Pid           3245 non-null   object
 1   text data     3245 non-null   object
 2   Class labels  3245 non-null   object
dtypes: object(3)
memory usage: 4.4 MB


In [None]:
depsign_2["Class labels"].value_counts()

Unnamed: 0_level_0,count
Class labels,Unnamed: 1_level_1
moderate,2169
not depression,848
severe,228


In [None]:
# Standardize the raw headers ("Pid", "text data", "Class labels") to
# lowercase with underscores, matching the other DEPSIGN frames.
depsign_2 = clean_colnames(depsign_2)
depsign_2.columns

Index(['pid', 'text_data', 'class_labels'], dtype='object')

In [None]:
# Map "text_data" -> "text" and "class_labels" -> "label" so this frame
# uses the same column names as the other DEPSIGN frames.
depsign_2 = normalize_colnames(depsign_2)
depsign_2.columns

Index(['pid', 'text', 'label'], dtype='object')

In [None]:
# Drop the identifier column ("pid") so only "text" and "label" remain.
depsign_2 = remove_ids(depsign_2)
depsign_2.columns

Index(['text', 'label'], dtype='object')

### DEPSIGN 3

In [None]:
depsign_3.shape

(8891, 3)

In [None]:
depsign_3.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8891 entries, 0 to 8890
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   PID        8891 non-null   object
 1   Text_data  8891 non-null   object
 2   Label      8891 non-null   object
dtypes: object(3)
memory usage: 10.7 MB


In [None]:
depsign_3.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
moderate,6019
not depression,1971
severe,901


In [None]:
# Lower-case the headers first: normalize_colnames and remove_ids match
# lowercase substrings ("text", "label", "id"), so they would miss the raw
# headers "PID", "Text_data" and "Label".
depsign_3 = clean_colnames(depsign_3)
depsign_3.columns

Index(['pid', 'text_data', 'label'], dtype='object')

In [None]:
# Map the dataset-specific header "text_data" onto the shared name "text"
# ("label" already has the shared name).
depsign_3 = normalize_colnames(depsign_3)
depsign_3.columns

Index(['pid', 'text', 'label'], dtype='object')

In [None]:
# Drop the identifier column ("pid") so only "text" and "label" remain.
depsign_3 = remove_ids(depsign_3)
depsign_3.columns

Index(['text', 'label'], dtype='object')

### Merge DEPSIGN Datasets

In [None]:
# Stack the three cleaned DEPSIGN frames into one. ignore_index=True gives the
# result a fresh 0..n-1 index; without it each frame keeps its own 0-based
# labels, so the merged index contains duplicates and label-based lookups
# (e.g. depsign.loc[0]) return several rows.
depsign: pandas.DataFrame = pandas.concat(
    [depsign_1, depsign_2, depsign_3], ignore_index=True
)

In [None]:
depsign.shape

(16632, 2)

In [None]:
depsign.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 16632 entries, 0 to 8890
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16632 non-null  object
 1   label   16632 non-null  object
dtypes: object(2)
memory usage: 20.2 MB


In [None]:
depsign.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
moderate,10494
not depression,4649
severe,1489


In [None]:
# Persist the merged DEPSIGN frame; index=False keeps the row index out of
# the CSV so the file holds only the "text" and "label" columns.
depsign.to_csv("data/depsign.csv", index=False)