## Install and import the required dependencies

In [None]:
! pip install datasets langdetect

In [None]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from langdetect import detect

## Download the IMDB Training dataset

In [None]:
# Load only the "train" split of the "imdb" dataset.
dataset = load_dataset("imdb", split="train")

## Shuffle the data

In [None]:
# Turn the dataset into a DataFrame and shuffle all rows; the fixed seed keeps
# the resulting order reproducible.
df = pd.DataFrame(dataset).sample(frac=1, random_state=42)

## Reduce dataset to 100 records - make sure all of the text is in English

In [None]:
# Take the first 100 shuffled records. The explicit copy makes df_100 an
# independent frame, so later in-place edits to it do not operate on a slice
# of `df` (which triggers pandas' SettingWithCopyWarning).
df_100 = df.head(100).copy()

# Sanity check: every one of the 100 selected texts must be detected as English.
is_english = df_100.apply(lambda row: detect(row['text']) == 'en', axis=1)
assert is_english.sum() == 100, "Expected 100 records detected as English"

## Clean dataset by removing HTML tags

In [None]:
# Show the first 200 characters of the text in the third row (position 2).
df_100.iloc[2]['text'][:200]

In [None]:
import re
# Shortest run of characters enclosed in '<' and '>' ('.' does not match
# newlines, so a match never spans lines).
regex = re.compile('<.*?>')


def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span matched by ``regex`` removed."""
    return regex.sub('', raw_html)

In [None]:
# Overwrite the 'text' column with the result of running cleanhtml on each value.
df_100.loc[:, ['text']] = df_100['text'].apply(cleanhtml)

In [None]:
# Show the full text of the third row (position 2) to inspect the result.
df_100.iloc[2]['text']

## Split dataset into train, validation, and test datasets

In [None]:
# Fractions of the records used for training and validation; whatever is left
# over becomes the test set.
train_ratio = 0.8
val_ratio = 0.1

# Row positions at which the shuffled frame is cut.
train_end = int(train_ratio*len(df_100))
val_end = int((train_ratio+val_ratio)*len(df_100))

# Shuffle with a fixed seed so the same rows land in each split on every run,
# then slice by position. Plain iloc slicing avoids np.split on a DataFrame,
# which relies on the deprecated DataFrame.swapaxes in recent pandas releases.
shuffled_df = df_100.sample(frac=1, random_state=42)
train_df = shuffled_df.iloc[:train_end]
val_df = shuffled_df.iloc[train_end:val_end]
test_df = shuffled_df.iloc[val_end:]

## Save datasets to S3

In [None]:
import boto3
import sagemaker

# Use the default bucket of the current SageMaker session as the destination.
sess = sagemaker.Session()
bucket = sess.default_bucket()

# One entry per split: (frame, local CSV file name, destination S3 key).
split_files = [
    (train_df, 'train.csv', 'sm-nlp-data-aug/data/train/train.csv'),
    (val_df, 'validation.csv', 'sm-nlp-data-aug/data/validation/validation.csv'),
    (test_df, 'test.csv', 'sm-nlp-data-aug/data/test/test.csv'),
]

# Write every split to a local CSV first; the index is stored in an "ID" column.
for split_df, local_file, _ in split_files:
    split_df.to_csv(local_file, index_label="ID")

# Then upload each local CSV to its key in the bucket.
s3_resource = boto3.resource('s3')
for _, local_file, s3_key in split_files:
    s3_resource.Object(bucket, s3_key).upload_file(local_file)