In [59]:
import glob
import os
import random
import shutil
import tarfile
import urllib.request

import numpy as np
import pandas as pd

# Získanie datasetu

In [60]:
# Download the IMDB sentiment archive once; later runs reuse the local copy.
DATASET_URL = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
DATASET_DIR = 'dataset'
DATASET_FILE_PATH = f'{DATASET_DIR}/aclImdb_v1.tar.gz'

if not os.path.isfile(DATASET_FILE_PATH):
    # open(..., 'wb') does not create missing parent directories.
    os.makedirs(DATASET_DIR, exist_ok=True)
    print(f'Downloading dataset into {DATASET_FILE_PATH} ...')
    # Stream into a temporary name first: an interrupted download must not
    # leave a truncated file that the isfile() check above would later accept
    # as a complete archive.
    partial_file_path = f'{DATASET_FILE_PATH}.part'
    with urllib.request.urlopen(DATASET_URL) as response, open(partial_file_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    os.replace(partial_file_path, DATASET_FILE_PATH)
else:
    print('Dataset already downloaded.')

Dataset already downloaded.


In [61]:
# Untar the dataset archive (skipped when the extracted folder already exists)
if not os.path.isdir(f'{DATASET_DIR}/aclImdb'):
    with tarfile.open(DATASET_FILE_PATH) as archive:
        print(f'Extracting "{DATASET_FILE_PATH}" to "{DATASET_DIR}" ...')
        # The archive was fetched over plain HTTP, so treat it as untrusted:
        # the 'data' filter rejects absolute paths, links that point outside
        # the destination and special files. Older interpreters without
        # extraction filters fall back to the unfiltered behaviour.
        if hasattr(tarfile, 'data_filter'):
            archive.extractall(DATASET_DIR, filter='data')
        else:
            archive.extractall(DATASET_DIR)
        print('Extraction finished.')
else:
    print('Dataset already extracted.')


Dataset already extracted.


In [62]:
# Folder layout of the extracted archive, built from DATASET_DIR.
# The last path component of each pos/neg folder is later used as the label.
TEST_FOLDER = f'{DATASET_DIR}/aclImdb/test'
TEST_POSITIVE_FOLDER = f'{TEST_FOLDER}/pos'
TEST_NEGATIVE_FOLDER = f'{TEST_FOLDER}/neg'

TRAIN_FOLDER = f'{DATASET_DIR}/aclImdb/train'
TRAIN_POSITIVE_FOLDER = f'{TRAIN_FOLDER}/pos'
TRAIN_NEGATIVE_FOLDER = f'{TRAIN_FOLDER}/neg'
TRAIN_UNSUPERVISED_FOLDER = f'{TRAIN_FOLDER}/unsup'

# NOTE(review): VOCAB_SIZE is not referenced in the visible cells; presumably consumed later - confirm.
VOCAB_SIZE = 10_000
# Maximum number of space-separated tokens kept from each review when files are read.
MAX_SENTENCE_LEN = 100

In [63]:
# Display the resolved test-set folder path as a quick sanity check.
TEST_FOLDER

'dataset/aclImdb/test'

In [64]:
def create_labeled_dataset_from_files(folders, label_map={'pos':[1, 0], 'neg': [0, 1]}, shuffle=True, max_len=None):
    """Build a lazy dataset of ``[tokens, label]`` pairs from folders of text files.

    Every file directly inside each folder becomes one example. Its label is
    the name of the folder it lives in (the last path component, e.g. ``'pos'``
    for ``'dataset/aclImdb/test/pos'``).

    Args:
        folders: Iterable of folder paths whose files should be loaded.
        label_map: Currently unused; kept only for interface compatibility.
            Labels are returned as folder-name strings, not mapped values.
        shuffle: When true, the examples are shuffled in place with the global
            ``random`` module (seed it beforehand for a reproducible order).
        max_len: Maximum number of space-separated tokens kept per file.
            ``None`` (default) uses the module-level ``MAX_SENTENCE_LEN``.

    Returns:
        A ``map`` iterator yielding ``[tokens, label]`` lists, where ``tokens``
        is the file content split on single spaces and truncated to
        ``max_len`` items. Files are read only when the iterator is consumed,
        and the iterator can be consumed only once.
    """
    if max_len is None:
        # Looked up at call time so the notebook-level constant stays the default.
        max_len = MAX_SENTENCE_LEN

    # Pair every file with the label derived from its folder name.
    labeled_files = []
    for folder in folders:
        # normpath + basename also handles a trailing slash and OS-native
        # separators, which a plain split('/') does not.
        label = os.path.basename(os.path.normpath(folder))
        # glob order is filesystem-dependent; sorting makes the pre-shuffle
        # order (and therefore any seeded shuffle) reproducible.
        for file_path in sorted(glob.glob(os.path.join(folder, '*'))):
            labeled_files.append([file_path, label])

    if shuffle:
        random.shuffle(labeled_files)

    def _read_example(example):
        """Read one file and return its truncated tokens with the label."""
        file_path, label = example
        # The context manager closes the handle right away; the explicit
        # encoding avoids depending on the platform default.
        with open(file_path, encoding='utf-8') as review_file:
            return [review_file.read().split(' ')[:max_len], label]

    # Keep the lazy map return type that callers already consume with list().
    return map(_read_example, labeled_files)

# Build the labeled test and train datasets from their pos/neg folders.
# The returned objects are one-shot iterators: they can be consumed only once.
cls_test_ds = create_labeled_dataset_from_files([TEST_POSITIVE_FOLDER, TEST_NEGATIVE_FOLDER])
cls_train_ds = create_labeled_dataset_from_files([TRAIN_POSITIVE_FOLDER, TRAIN_NEGATIVE_FOLDER])

In [65]:
# Re-join the truncated token lists into plain strings, keeping the label
# next to each text as a [text, label] pair.
reviews_train = [[" ".join(tokens), label] for tokens, label in cls_train_ds]

reviews_test = [[" ".join(tokens), label] for tokens, label in cls_test_ds]

In [66]:
# Build the frames straight from the [text, label] pairs. Going through
# np.array first would allocate a fixed-width unicode array sized by the
# longest review and would fail on an empty list.
test_df = pd.DataFrame(reviews_test, columns=['review_text', 'sentiment'])
train_df = pd.DataFrame(reviews_train, columns=['review_text', 'sentiment'])

# Trénovací a testovací dataset

Datasety pozostávajú z dvoch atribútov - text recenzie a sentiment. Oba atribúty sú v textovej podobe, čiže je nutné ich upraviť do číselnej podoby.

## Ukážka datasetu train_df

In [67]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,review_text,sentiment
0,I was around 7 when I saw this movie first. It...,pos
1,i went into watching this movie knowing it was...,neg
2,This is a direct sequel to 'The Mummy's Hand' ...,neg
3,"I'm not particularly fond of remakes, or to st...",neg
4,I can't see the point in burying a movie like ...,pos


In [68]:
# Summary of both text columns: count, number of unique values, most frequent value and its frequency.
train_df.describe()

Unnamed: 0,review_text,sentiment
count,25000,25000
unique,24901,2
top,How has this piece of crap stayed on TV this l...,pos
freq,3,12500


In [69]:
# Column dtypes of the training frame.
train_df.dtypes

review_text    object
sentiment      object
dtype: object

In [70]:
# Number of training examples per sentiment label (class balance).
train_df['sentiment'].value_counts()

pos    12500
neg    12500
Name: sentiment, dtype: int64

## Ukážka datasetu test_df

In [71]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,review_text,sentiment
0,First of all this is one of the worst soft-cor...,neg
1,This movie was outright painful for me to watc...,neg
2,"I just watched ""return from lonesome dove"" and...",pos
3,I seem to be disagreeing with a lot of folks h...,neg
4,Two of Hollywood's great child stars (Elizabet...,pos


In [72]:
# Summary of both text columns: count, number of unique values, most frequent value and its frequency.
test_df.describe()

Unnamed: 0,review_text,sentiment
count,25000,25000
unique,24795,2
top,Loved today's show!!! It was a variety and not...,pos
freq,5,12500


Z uvedených údajov môžeme vidieť, že v testovacom datasete sa nachádza 24795 unikátnych recenzií, respektíve 205 duplikátov.

In [73]:
# Column dtypes of the test frame.
test_df.dtypes

review_text    object
sentiment      object
dtype: object

In [74]:
# Number of test examples per sentiment label (class balance).
test_df['sentiment'].value_counts()

pos    12500
neg    12500
Name: sentiment, dtype: int64

# Prekryv prvkov z train a test datasetu

In [76]:
# Flag training reviews whose text also occurs anywhere in the test set.
# Passing a DataFrame to isin() only compares cells at matching index and
# column labels, so it cannot detect a review stored at a different row.
# A dict of plain values performs a real per-column membership test; the
# result keeps the frame shape, and 'sentiment' is all False because no
# values are supplied for that column.
overlap = train_df.isin({'review_text': test_df['review_text'].unique()})

In [85]:
# Keep only the rows whose review_text flag is set.
overlap[overlap['review_text']]

Unnamed: 0,review_text,sentiment


Môžeme vidieť, že v našom datasete sa nenachádzajú žiadne recenzie, ktoré sa nachádzajú v trénovacej aj testovacej množine.

# Min, modus, medián – dĺžky recenzií v triedach pos a neg


In [91]:
# Summary statistics of review length in characters for the test set.
# NOTE(review): the texts were cut to the first MAX_SENTENCE_LEN space-separated
# tokens when the files were read, so these are lengths of truncated reviews;
# the statistics are also not split by sentiment.
test_df['review_text'].str.len().describe()

count    25000.000000
mean       539.760840
std         79.903574
min         32.000000
25%        526.000000
50%        555.000000
75%        582.000000
max        806.000000
Name: review_text, dtype: float64

In [92]:
# Summary statistics of review length in characters for the training set.
# NOTE(review): the texts were cut to the first MAX_SENTENCE_LEN space-separated
# tokens when the files were read, so these are lengths of truncated reviews;
# the statistics are also not split by sentiment.
train_df['review_text'].str.len().describe()

count    25000.000000
mean       541.909960
std         77.688289
min         52.000000
25%        527.000000
50%        556.000000
75%        583.000000
max        758.000000
Name: review_text, dtype: float64