## Load the dataset: en (English)

In [None]:
import os
from google.colab import drive

# Make Google Drive available inside the Colab runtime.
drive.mount('/content/drive')

# Work from the English subtask_1 folder so the data files below can be
# referenced by bare relative names.
os.chdir('/content/drive/My Drive/TFG/AuTexTification/subtask_1/en')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the tab-separated train and test files from the current working directory.
# on_bad_lines='warn' drops malformed rows exactly like 'skip' does, but emits a
# warning for each dropped line so that any data loss is visible instead of silent.
train_df = pd.read_csv("train.tsv", sep='\t', on_bad_lines='warn')
test_df = pd.read_csv("test.tsv", sep='\t', on_bad_lines='warn')

# Report (rows, columns) of each frame as a quick sanity check.
print('Train dataset size:', train_df.shape)
print('Test dataset size:', test_df.shape)

Train dataset size: (33845, 6)
Test dataset size: (21832, 6)


In [None]:
# Preview the first rows of the training frame as loaded, before any columns are dropped
train_df.head()

Unnamed: 0,id,prompt,text,label,model,domain
0,12322,you need to stop the,you need to stop the engine and wait until it ...,generated,B,tweets
1,1682,Article 11. On the basis of the information co...,The Commission shall publish the report; an in...,generated,C,legal
2,22592,I have not been tweeting,"I have not been tweeting a lot lately, but I d...",generated,D,tweets
3,17390,NO-PROMPT,I pass my exam and really thankgod for that bu...,human,NO-MODEL,tweets
4,30453,NO-PROMPT,The template will have 3 parts: a mustache sha...,human,NO-MODEL,wiki


In [None]:
# This is a binary classification task, so the id, prompt and model columns are not needed.
# The domain column is kept so that model performance can later be studied per domain.
train_df = train_df.drop(columns=['id', 'prompt', 'model'])
test_df = test_df.drop(columns=['id', 'prompt', 'model'])

In [None]:
# Display the first rows of train_df again to check which columns remain after the drop
train_df.head()

Unnamed: 0,text,label,domain
0,you need to stop the engine and wait until it ...,generated,tweets
1,The Commission shall publish the report; an in...,generated,legal
2,"I have not been tweeting a lot lately, but I d...",generated,tweets
3,I pass my exam and really thankgod for that bu...,human,tweets
4,The template will have 3 parts: a mustache sha...,human,wiki


### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# NOTE(review): to_categorical is not used in this cell; the import is kept in case
# later cells rely on it.
from keras.utils import to_categorical

# Encode the string class labels as integers.
# (The former `df['label'] = df['label'].tolist()` round-trips were removed: they
# wrote the same values back into the same column and had no effect.)
le = LabelEncoder()

# Fit on the training labels only. LabelEncoder sorts the classes, so here
# 'generated' -> 0 and 'human' -> 1.
train_df['label'] = le.fit_transform(train_df['label'])

# Class names, indexed by their encoded integer id.
labels = le.classes_

# Apply the mapping learned on the training set to the test set; a label that was
# not seen during fit makes transform raise instead of being silently mis-encoded.
test_df['label'] = le.transform(test_df['label'])

### Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Carve a stratified 20% validation set out of the training data; the fixed
# random_state keeps the split reproducible.
# NOTE: train_df is overwritten here, so re-running this cell on its own splits the
# already-reduced training frame again.
train_df, validation_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['label'],
)

# Show the first validation example. The index reset is for display only and is
# not stored back into validation_df.
validation_df.reset_index(drop=True).iloc[0]

text      @miss_tattoo LOL! cuuuute! how did the M&amp;G...
label                                                     1
domain                                               tweets
Name: 0, dtype: object

### Create a `DatasetDict` object from our datasets

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Installing collected pac

In [None]:
from datasets import DatasetDict, Dataset

# Convert the pandas DataFrames to datasets.Dataset objects.
# preserve_index=False: train_df and validation_df come out of train_test_split with
# a shuffled, non-default index, which from_pandas would otherwise store as an extra
# '__index_level_0__' column. That column appeared in train/validation but not in
# test, leaving the three splits with inconsistent features.
# NOTE(review): any downstream code that explicitly removes '__index_level_0__'
# no longer needs to (and should not) do so.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
validation_dataset = Dataset.from_pandas(validation_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Bundle the three splits under their conventional names.
dict_dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset,
})

# Print the split names, features and row counts as a sanity check.
print(dict_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'domain', '__index_level_0__'],
        num_rows: 27076
    })
    validation: Dataset({
        features: ['text', 'label', 'domain', '__index_level_0__'],
        num_rows: 6769
    })
    test: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 21832
    })
})


### Save the `DatasetDict` object for future use

In [None]:
import pickle

# Serialize the DatasetDict to dict_dataset.pkl in the current working directory
# so that later notebooks can reload it without repeating the preprocessing.
with open("dict_dataset.pkl", "wb") as pickle_file:
    pickle.dump(dict_dataset, pickle_file)

The labels are:

In [None]:
# `labels` holds the class names taken from the fitted LabelEncoder (le.classes_);
# print them together with the number of classes.
print('Labels:', labels, 'num_labels:', len(labels))

LABELS: ['generated' 'human'] num_labels: 2
