## Preprocessing V6 Datasets - Re-labelling the target variable

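For Task 14 and Task 15, the target variable, "label", is regenerated from the "Urgency" column using the following criteria:
- Urgency 1, 2, or 3 is reclassified as urgent (label = 1); Urgency 4 or 5 is reclassified as not urgent (label = 0). Rows with any other Urgency value are treated as unlabeled (label = -2).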


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys

In [8]:
# Load the v5 export, rename the id/text columns to the names used in the rest
# of this notebook, and keep only the three columns needed for relabelling.
system_context_all = (
    pd.read_csv("Data/system_context_all_v5.csv")
    .rename(columns={"ID_x": "mid", "contextualized": "text"})
    .loc[:, ["mid", "Urgency", "text"]]
)
system_context_all.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46303 entries, 0 to 46302
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   mid      46303 non-null  int64 
 1   Urgency  46303 non-null  int64 
 2   text     46303 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [16]:
# 1. Reclassify urgent as 1,2,3 labels and not urgent as 4 & 5.
#    Urgency in {1, 2, 3} -> label 1, Urgency in {4, 5} -> label 0,
#    anything else -> sentinel label -2 (picked up later via `label < 0`).

messages_df = system_context_all.assign(
    label=lambda frame: np.select(
        [frame["Urgency"].isin([1, 2, 3]), frame["Urgency"].isin([4, 5])],
        [1, 0],
        default=-2,
    )
)
messages_df["label"].value_counts()

-2    36651
 0     5081
 1     4571
Name: label, dtype: int64

In [19]:
#2. generate datasets

# Partition on the sign of the label: negative labels are the unlabeled rows,
# non-negative labels are the labeled rows. Both partitions are saved.
system_context_unlbl = messages_df.loc[messages_df["label"] < 0]
system_context_lbl = messages_df.loc[messages_df["label"] >= 0]
system_context_unlbl.to_csv("Data/system_context_all_unlbl_v6.csv")
system_context_lbl.to_csv("Data/system_context_all_lbl_v6.csv")

# Stratified split of the labeled rows: 70% goes to train, and the remaining
# 30% is split again 70/30 into test and dev. Both splits use the same seed.
train_df, remain_df = train_test_split(
    system_context_lbl,
    train_size=0.7,
    random_state=42,
    stratify=system_context_lbl["label"].to_numpy(),
)
test_df, dev_df = train_test_split(
    remain_df,
    train_size=0.7,
    random_state=42,
    stratify=remain_df["label"].to_numpy(),
)

train_df.to_csv("Data/train_df_v6.csv")
test_df.to_csv("Data/test_df_v6.csv")
dev_df.to_csv("Data/dev_df_v6.csv")

# Pretraining corpus = unlabeled rows + the labeled training split (test and
# dev rows are not included). It is saved whole, then split 80/20 without
# stratification; only the text column of each part is written out.
system_context_pretrain_df = pd.concat([system_context_unlbl, train_df])
system_context_pretrain_df.to_csv("Data/system_context_pretrain_v6.csv")
system_pretrain_train, system_pretrain_validation = train_test_split(
    system_context_pretrain_df,
    test_size=0.2,
    random_state=42,
)
system_pretrain_train["text"].to_csv("Data/system_pretrain_train_v6.csv")
system_pretrain_validation["text"].to_csv("Data/system_pretrain_dev_v6.csv")



In [20]:
#3. Pretraining sets on labeled data
# Keep every labeled row whose `mid` does not occur in the test split, so the
# test messages are excluded from this labeled pretraining data.
pretrain_lbl_df = system_context_lbl.loc[
    ~system_context_lbl["mid"].isin(test_df["mid"])
]

# 80/20 random split (fixed seed); only the text column of each part is saved.
system_pretrain_train_lbl, system_pretrain_dev_lbl = train_test_split(
    pretrain_lbl_df,
    test_size=0.2,
    random_state=42,
)

system_pretrain_train_lbl["text"].to_csv("Data/system_pretrain_train_lbl_v6.csv")
system_pretrain_dev_lbl["text"].to_csv("Data/system_pretrain_dev_lbl_v6.csv")