In [15]:
import pathlib
import sklearn
import datasets
import pandas as pd
import sklearn.preprocessing
import sklearn.model_selection

In [2]:
!pwd

/home/jupyter/tutorials/personal/pydata_bert/notebooks


In [3]:
!ls ../artifacts

dataset_processed  dataset_raw


In [4]:
!ls ../artifacts/dataset_raw

aclImdb  aclImdb_v1.tar.gz


In [None]:
%%bash
mkdir -p ../artifacts/dataset_raw
cd ../artifacts/dataset_raw
wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar -xf aclImdb_v1.tar.gz

In [5]:
def imdb_pandas(split_dir:str):
    split_dir = pathlib.Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir/label_dir).iterdir():
            if str(text_file).endswith(".txt"):
                texts.append(text_file.read_text())
                labels.append(label_dir)

    
    df = pd.DataFrame({'text':texts , 'label_name':labels})
    return df

In [6]:
df_all = imdb_pandas ('../artifacts/dataset_raw/aclImdb/train')
df_test = imdb_pandas ('../artifacts/dataset_raw/aclImdb/test')


In [7]:
df_all.head()

Unnamed: 0,text,label_name
0,It's hard to find an outright bad historical d...,pos
1,"Very few so called ""remakes"" can be as good as...",pos
2,The quote I used for my summary occurs about h...,pos
3,"This movie is fun to watch , doesnt have much ...",pos
4,This movie is plain fun.I has nothing to do wi...,pos


In [8]:
df_test.head()

Unnamed: 0,text,label_name
0,"That's what I thought, when I heard about the ...",pos
1,The reason I am reviewing this is that the pre...,pos
2,I had the honor this evening to see a screenin...,pos
3,Fame was released in the U.S. a year before I ...,pos
4,Isabelle Huppert portrays a talented female pi...,pos


In [11]:
label_encoder = sklearn.preprocessing.LabelEncoder()

In [12]:
label_encoder.fit(df_all['label_name'])

LabelEncoder()

In [16]:
df_all['label'] = label_encoder.transform(df_all['label_name'])
df_test['label'] = label_encoder.transform(df_test['label_name'])

In [17]:
df_train, df_val = sklearn.model_selection.train_test_split(df_all, test_size=.2)

In [18]:
dataset_features = datasets.Features({'text': datasets.Value('string'), 'label': datasets.ClassLabel(names=list ( label_encoder.classes_ ))})

In [19]:
dataset_train = datasets.Dataset.from_pandas(df_train[['text','label']],features=dataset_features)
dataset_test = datasets.Dataset.from_pandas(df_test[['text','label']],features=dataset_features)
dataset_validation = datasets.Dataset.from_pandas(df_test[['text','label']],features=dataset_features)

dataset_all = datasets.DatasetDict({
    'train': dataset_train,
    'test': dataset_test,
    'valid': dataset_validation }
)

In [20]:
dataset_all

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [25]:
dataset_all['train'][0]

{'text': "King of Masks (Bian Lian in China) is a shockingly beautiful and profoundly touching film. Winner of 16 awards from around the world, this film based on a true story centers on Wang Bianlian, a street performer in 1930s China who is growing older but has no heir to pass on his art of face-change opera. He has a unique talent of quickly changing masks in performance, and no one knows how he does it. He has a longing desire to have a grandson, as his art is a family heirloom that can only be passed on to a male heir. We then go to the streets, and see that people are selling their children because they can't afford to take care of them: some are even begging to take their daughters for free, because daughters are not worth much in this society. Wang Bianlian's story goes on from there.<br /><br />The film was so astonishingly good, the acting was amazing, and the issues were so weighty and well-addressed. There is the gender inequality and the depressing fact that in this time 

In [26]:
dataset_path = '../artifacts/dataset_processed/imdb'

In [27]:
dataset_all.save_to_disk(dataset_path)

In [28]:
datasets.load_from_disk(dataset_path)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})