## Install and import the required dependencies

In [None]:
!pip install datasets langdetect

In [8]:
from datasets import load_dataset
import pandas as pd
from langdetect import detect

## Download the IMDB Training dataset

In [None]:
dataset = load_dataset("imdb", split="train")

## Shuffle the data

In [75]:
df = pd.DataFrame(dataset)
df = df.sample(frac=1, random_state=42)

## Reduce dataset to 100 records - make sure all of the text is in English

In [76]:
df_100 = df.head(100)
assert len(df_100[df_100.apply(lambda x: detect(x['text']) == 'en', axis=1)]) == 100

## Clean dataset by removing HTML tags

In [78]:
df_100.iloc[2]['text'][:200]

'After watching this movie I was honestly disappointed - not because of the actors, story or directing - I was disappointed by this film advertisements.<br /><br />The trailers were suggesting that the'

In [62]:
import re
regex = re.compile('<.*?>') 

def cleanhtml(raw_html):
    cleantext = re.sub(regex, '', raw_html)
    return cleantext

In [63]:
df_100.loc[:, ['text']] = df_100['text'].apply(cleanhtml)

In [64]:
df_100.iloc[2]['text']

'After watching this movie I was honestly disappointed - not because of the actors, story or directing - I was disappointed by this film advertisements.The trailers were suggesting that the battalion "have chosen the third way out" other than surrender or die (Polish infos were even misguiding that they had the choice between being killed by own artillery or German guns, they even translated the title wrong as "misplaced battalion"). This have tickled the right spot and I bought the movie.The disappointment started when I realized that the third way is to just sit down and count dead bodies followed by sitting down and counting dead bodies... Then I began to think "hey, this story can\'t be that simple... I bet this clever officer will find some cunning way to save what left of his troops". Well, he didn\'t, they were just sitting and waiting for something to happen. And so was I.The story was based on real events of World War I, so the writers couldn\'t make much use of their imaginat

## Save dataset to S3

In [None]:
from io import StringIO
import boto3
import sagemaker

sess = sagemaker.Session()
bucket = sess.default_bucket()

csv_buffer = StringIO()
df_100.to_csv(csv_buffer, index_label="Text ID")
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'imdb.csv').put(Body=csv_buffer.getvalue())