In [16]:
import pathlib
import sklearn
import datasets
import pandas as pd


In [1]:
#! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
#! tar -xf aclImdb_v1.tar.gz

In [10]:
def imdb_pandas(split_dir:str):
    split_dir = pathlib.Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir/label_dir).iterdir():
            if str(text_file).endswith(".txt"):
                texts.append(text_file.read_text())
                labels.append(label_dir)

    
    df = pd.DataFrame({'text':texts , 'label_name':labels})
    return df

In [11]:
df_all = imdb_pandas ('aclImdb/train')
df_test = imdb_pandas ('aclImdb/test')


In [12]:
df_all.head()

Unnamed: 0,text,label_name
0,It's hard to find an outright bad historical d...,pos
1,"Very few so called ""remakes"" can be as good as...",pos
2,The quote I used for my summary occurs about h...,pos
3,"This movie is fun to watch , doesnt have much ...",pos
4,This movie is plain fun.I has nothing to do wi...,pos


In [13]:
df_test.head()

Unnamed: 0,text,label_name
0,"That's what I thought, when I heard about the ...",pos
1,The reason I am reviewing this is that the pre...,pos
2,I had the honor this evening to see a screenin...,pos
3,Fame was released in the U.S. a year before I ...,pos
4,Isabelle Huppert portrays a talented female pi...,pos


In [17]:
label_encoder = sklearn.preprocessing.LabelEncoder()

In [19]:
label_encoder.fit(df_all['label_name'])

LabelEncoder()

In [21]:
df_all['label'] = label_encoder.transform(df_all['label_name'])
df_test['label'] = label_encoder.transform(df_test['label_name'])

In [23]:
df_train, df_val = sklearn.model_selection.train_test_split(df_all, test_size=.2)

In [24]:
dataset_features = datasets.Features({'text': datasets.Value('string'), 'label': datasets.ClassLabel(names=list ( label_encoder.classes_ ))})

In [29]:
dataset_train = datasets.Dataset.from_pandas(df_train[['text','label']],features=dataset_features)
dataset_test = datasets.Dataset.from_pandas(df_test[['text','label']],features=dataset_features)
dataset_validation = datasets.Dataset.from_pandas(df_test[['text','label']],features=dataset_features)

dataset_all = datasets.DatasetDict({
    'train': dataset_train,
    'test': dataset_test,
    'valid': dataset_validation }
)

In [30]:
dataset_all

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [31]:
dataset_all['train'].head()

AttributeError: 'Dataset' object has no attribute 'head'

In [36]:
dataset_path = 'datasets/imdb'

In [37]:
dataset_all.save_to_disk(dataset_path)

In [38]:
datasets.load_from_disk(dataset_path)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})