In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# notebook can read and write the dataset files stored there.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Base directory holding the emotion-classification CSV files.
# Written as an absolute path so it matches the absolute mount point
# ("/content/drive") and does not depend on the current working directory.
# The trailing slash is required: file names are appended with `+`.
DATA_PATH = '/content/drive/MyDrive/fyp-code/codes/data/emotion_classification/'

In [3]:
# libraries
import pandas as pd
import numpy as np

## Import all three datasets

In [6]:
# Read the cleaned training split of each corpus and keep only the
# 'Text' and 'Label' columns (in that order).
data_toy = pd.read_csv(
    DATA_PATH+'emotion_classification_cleaned_toy_data_train.csv'
).loc[:, ['Text', 'Label']]
data_short = pd.read_csv(
    DATA_PATH+'emotion_classification_cleaned_short_text_train.csv'
).loc[:, ['Text', 'Label']]
data_long = pd.read_csv(
    DATA_PATH+'emotion_classification_cleaned_long_text_train.csv'
).loc[:, ['Text', 'Label']]

## Check the number of samples per class in each of the 3 datasets

In [10]:
# toy dataset: number of rows per class label
toy_labels = data_toy['Label']
toy_labels.value_counts()

1    1809
0    1808
Name: Label, dtype: int64

In [11]:
# short-text dataset: number of rows per class label
short_labels = data_short['Label']
short_labels.value_counts()

1    667
0    667
Name: Label, dtype: int64

In [12]:
# long-text dataset: number of rows per class label
long_labels = data_long['Label']
long_labels.value_counts()

1    1029
0    1028
Name: Label, dtype: int64

## Sample from each dataset
We will sample a quarter (25%) of each class in each of the 3 datasets; these samples will be used to measure inter-annotator agreement.

In [19]:
# Draw a 25% sample of each class from each dataset.
# The draw is seeded so that re-running the notebook selects the same rows;
# without a seed every run would overwrite the annotation files with a
# different sample.
SAMPLE_FRAC = 0.25
SAMPLE_SEED = 42


def sample_label(data, label, frac=SAMPLE_FRAC, seed=SAMPLE_SEED):
    """Return a random fraction of the rows of `data` with the given label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Label' column.
    label : int
        Value of 'Label' to keep before sampling.
    frac : float
        Fraction of the matching rows to draw (without replacement).
    seed : int
        Value passed to `random_state` so the draw is repeatable.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.
    """
    matching_rows = data[data['Label'] == label]
    return matching_rows.sample(frac=frac, replace=False, random_state=seed)


# sampling of the toy dataset
data_toy_sample_pos = sample_label(data_toy, 1)
print(f'Positive toy data sample: {data_toy_sample_pos.shape}')
data_toy_sample_neg = sample_label(data_toy, 0)
print(f'Negative toy data sample: {data_toy_sample_neg.shape}')

# sampling of the short text dataset
data_short_sample_pos = sample_label(data_short, 1)
print(f'Positive short data sample: {data_short_sample_pos.shape}')
data_short_sample_neg = sample_label(data_short, 0)
print(f'Negative short data sample: {data_short_sample_neg.shape}')

# sampling of the long text dataset
data_long_sample_pos = sample_label(data_long, 1)
print(f'Positive long data sample: {data_long_sample_pos.shape}')
data_long_sample_neg = sample_label(data_long, 0)
print(f'Negative long data sample: {data_long_sample_neg.shape}')

Positive toy data sample: (452, 2)
Negative toy data sample: (452, 2)
Positive short data sample: (167, 2)
Negative short data sample: (167, 2)
Positive long data sample: (257, 2)
Negative long data sample: (257, 2)


In [22]:
# merge the positive and negative samples together for each of the datasets
data_sample_toy = pd.concat([data_toy_sample_pos, data_toy_sample_neg])
data_sample_short = pd.concat([data_short_sample_pos, data_short_sample_neg])
data_sample_long = pd.concat([data_long_sample_pos, data_long_sample_neg])

# shuffle the rows so the two classes are interleaved rather than stacked
# (concat places every positive row before the negative ones).
# A fixed random_state keeps the row order identical across re-runs, and
# reset_index(drop=True) replaces the old row labels with 0..n-1.
SHUFFLE_SEED = 42
data_sample_toy = data_sample_toy.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
data_sample_short = data_sample_short.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
data_sample_long = data_sample_long.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

print(f'Toy Sample: {data_sample_toy.shape}')
print(f'Short Sample: {data_sample_short.shape}')
print(f'Long Sample: {data_sample_long.shape}')

Toy Sample: (904, 2)
Short Sample: (334, 2)
Long Sample: (514, 2)


In [23]:
# Toy sample: preview the first five rows
data_sample_toy.head(n=5)

Unnamed: 0,Text,Label
0,Making a present!,0
1,@sids anytime mate,0
2,@Bradleysanborn Depression is a myth,1
3,Lol my depression is hitting me hard,1
4,Ready for my first long run in a month. Nothi...,0


In [24]:
# Short sample: preview the first five rows
data_sample_short.head(n=5)

Unnamed: 0,Text,Label
0,nobody could ever know the hidden pain or the ...,0
1,@MartinPincot Haha @MartinPincot - just sent ...,0
2,When you wished you never woke,1
3,The is no joke as I have suffered from myse...,0
4,I wish the best to those dealing w/ an interna...,1


In [26]:
# Long sample: preview the first five rows
data_sample_long.head(n=5)

Unnamed: 0,Text,Label
0,Trying to repair a lost friendship First time ...,0
1,"haha, hey ya'll, i know i said i wouldnt be ...",0
2,heard about blog for some while... then June ...,0
3,This is the product of idle time. Usually pho...,0
4,"My life is so hollow Stupid self pitying rant,...",1


## Save the final sampled datasets for inter-annotator agreement

In [27]:
# Write each shuffled sample to its own CSV under DATA_PATH.
# index=False keeps the row index out of the file.
for sampled_frame, file_name in (
    (data_sample_toy, 'emotion_classification_toy_data_sampled.csv'),
    (data_sample_short, 'emotion_classification_short_text_sampled.csv'),
    (data_sample_long, 'emotion_classification_long_text_sampled.csv'),
):
    sampled_frame.to_csv(DATA_PATH+file_name, index=False)