# Load the dataset: es (Spanish)

In [None]:
import os

from google.colab import drive

# Attach the user's Google Drive to the Colab filesystem.
drive.mount('/content/drive')

# Make the Spanish subtask-1 folder the working directory so that the data
# files can be referenced by relative name.
os.chdir('/content/drive/My Drive/TFG/AuTexTification/subtask_1/es')

Mounted at /content/drive


In [None]:
import pandas as pd

# Both splits are tab-separated; rows pandas cannot parse are skipped
# instead of aborting the load.
tsv_options = {'sep': '\t', 'on_bad_lines': 'skip'}
train_df = pd.read_csv("train.tsv", **tsv_options)
test_df = pd.read_csv("test.tsv", **tsv_options)

# Report the (rows, columns) shape of each split.
print('Train dataset size:', train_df.shape)
print('Test dataset size:', test_df.shape)

Train dataset size: (32062, 6)
Test dataset size: (20129, 6)


In [None]:
# Preview the first 5 rows of the raw training split
train_df.head(n=5)

Unnamed: 0,id,prompt,text,label,model,domain
0,5464,NO-PROMPT,Entrada en vigor. La presente Directiva entrar...,human,NO-MODEL,legal
1,30129,"Estos podrían ser preguntas, categorías de inf...",Preguntas: 1. ¿Cuáles son los principales argu...,generated,F,wiki
2,19553,-¿Desea algo? -Póngame una caja,¿Desea algo? Póngame una caja de madera. ¿Qué ...,generated,E,tweets
3,13005,NO-PROMPT,"@victor28088 1665 Tweets no originales, que as...",human,NO-MODEL,tweets
4,16919,NO-PROMPT,De pequeño Dios me dio a elegir entre tener un...,human,NO-MODEL,tweets


In [None]:
# Only the binary classification task is addressed, so the id, prompt and
# model variables are not needed.
# The domain variable is maintained to later study how well the model works
# depending on the domain.
unused_columns = ['id', 'prompt', 'model']
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

In [None]:
# Confirm that only the text, label and domain columns remain
train_df.head(n=5)

Unnamed: 0,text,label,domain
0,Entrada en vigor. La presente Directiva entrar...,human,legal
1,Preguntas: 1. ¿Cuáles son los principales argu...,generated,wiki
2,¿Desea algo? Póngame una caja de madera. ¿Qué ...,generated,tweets
3,"@victor28088 1665 Tweets no originales, que as...",human,tweets
4,De pequeño Dios me dio a elegir entre tener un...,human,tweets


# Label Encoding
Transform the labels *generated* and *human* into 0 and 1, respectively.

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical  # not used in this cell; kept in case later cells rely on it

# Fit the encoder on the training labels only. LabelEncoder orders the classes
# alphabetically, so 'generated' becomes 0 and 'human' becomes 1.
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['label'])

# Class names, positioned by their encoded integer id.
labels = le.classes_

# Apply the mapping learned on the training split to the test split
# (transform, not fit_transform), so both splits share the same encoding.
test_df['label'] = le.transform(test_df['label'])

# Split the data
Divide the data into training (80% of the instances) and validation (remaining 20% of the instances) datasets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data for validation. Stratifying on the label
# keeps the class proportions equal in both parts, and the fixed seed makes the
# split repeatable.
# NOTE: this rebinds train_df to the 80% part, so re-running the cell alone
# would split the already-reduced frame again.
train_df, validation_df = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df['label'],
    random_state=42,
)

# Display the first validation example (index reset only for this preview;
# validation_df itself is left untouched).
validation_df.reset_index(drop=True).iloc[0]

Unnamed: 0,0
text,—¡Alo! —Alo —Por favor con el gringo —¿Cual gr...
label,1
domain,tweets


# Create a DatasetDict object with our datasets

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from datasets import DatasetDict, Dataset

# Convert the pandas DataFrames to datasets.Dataset objects.
# preserve_index=False stops the shuffled pandas index left behind by
# train_test_split from being stored as an extra `__index_level_0__` column,
# so all three splits expose the same features.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
validation_dataset = Dataset.from_pandas(validation_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Group the three splits under their conventional names.
dict_dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset,
})

# Show the features and number of rows of every split.
print(dict_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'domain', '__index_level_0__'],
        num_rows: 25649
    })
    validation: Dataset({
        features: ['text', 'label', 'domain', '__index_level_0__'],
        num_rows: 6413
    })
    test: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 20129
    })
})


# Save the DatasetDict object (as a pickle file) for future use.

In [None]:
import pickle

# Serialize the DatasetDict into the current working directory so that other
# notebooks can reload it without repeating the preprocessing.
with open("es_dict_dataset.pkl", mode="wb") as pkl_file:
    pickle.dump(dict_dataset, pkl_file)

The labels are:

In [None]:
# Print the class names held in `labels` and how many there are.
print(
    'Labels:', labels,
    'num_labels:', len(labels),
)

Labels: ['generated' 'human'] num_labels: 2
