In [2]:
import os
import glob
import s3fs

In [3]:
#data_dir= './aclImdb_v1/aclImdb'
data_dir= 's3://sagemaker-ap-south-1-831906679170/aclImdb/'
folders= ['train', 'test']
fs = s3fs.S3FileSystem()

def read_data(path):
    #srch_path= os.path.join(path, "*.txt")
    #print("Searching for "+ srch_path)
    #files= glob.glob(srch_path)
    files= fs.ls(path)
    print("number of files: "+ str(len(files)))
    data= []
    for file in files:
        with fs.open(file, 'r') as fp:
            content= fp.read()
            data.append(content)
    
    return data

In [4]:
d='s3://sagemaker-ap-south-1-831906679170/aclImdb/train/pos/'
fs.ls(d)[:5]

['sagemaker-ap-south-1-831906679170/aclImdb/train/pos/0_9.txt',
 'sagemaker-ap-south-1-831906679170/aclImdb/train/pos/10000_8.txt',
 'sagemaker-ap-south-1-831906679170/aclImdb/train/pos/10001_10.txt',
 'sagemaker-ap-south-1-831906679170/aclImdb/train/pos/10002_7.txt',
 'sagemaker-ap-south-1-831906679170/aclImdb/train/pos/10003_8.txt']

In [5]:
train_pos_reviews= read_data(os.path.join(data_dir,'train','pos'))
train_neg_reviews= read_data(os.path.join(data_dir,'train','neg'))

number of files: 12500
number of files: 12500


In [6]:
from sklearn.utils import shuffle

In [7]:
train_reviews= train_pos_reviews + train_neg_reviews
train_labels= [1]* len(train_pos_reviews) + [0] * len(train_neg_reviews)

train_reviews, train_labels= shuffle(train_reviews, train_labels, random_state= 711)

In [13]:
import pickle

with open("./train_reviews.pkl",'wb') as fp:
    pickle.dump(train_reviews, fp)
    
with open("./train_labels.pkl", "wb") as fp:
    pickle.dump(train_labels, fp)

In [8]:
train_labels[:10]

[1, 1, 1, 1, 0, 1, 1, 0, 0, 1]

In [1]:
!pip install -qq transformers

In [3]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
#import torch
import numpy as np

In [4]:
MODEL_NAME = 'bert-base-uncased'
tokenizer= BertTokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [5]:
sentences= ["I like it :) ", "Go baby GO BABY it is a great movie"]
for sent in sentences:
    tokens= tokenizer.tokenize(sent)
    print(tokens)

['i', 'like', 'it', ':', ')']
['go', 'baby', 'go', 'baby', 'it', 'is', 'a', 'great', 'movie']


In [6]:
tokenizer(sentences,
         padding= 'max_length',
         max_length= 15,
         return_tensors= 'pt',
         return_token_type_ids= False
         )

ImportError: Unable to convert output to PyTorch tensors format, PyTorch is not installed.

In [11]:
print(tokenizer.pad_token_id)
print(tokenizer.sep_token_id)
print(tokenizer.cls_token_id)

0
102
101


In [12]:
from torch.utils.data import Dataset, DataLoader
from bs4 import BeautifulSoup

In [13]:
class ReviewDataset(Dataset):
    
    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews= reviews
        self.labels= labels
        self.tokenizer= tokenizer
        self.max_len= max_len
        
    def __len__(self):
        return len(self.reviews)
    
    def __getitem__(self, item):
        review= self.reviews[item]
        review= BeautifulSoup(review, "html.parser").get_text()
        label= self.labels[item]
        
        encodings= tokenizer(
                review,
                padding= 'max_length',
                max_length= self.max_len,
                truncation= True,
                return_tensors= 'pt'
                )
        
        return {
            #'review': review,
            'encoding': encodings,
            'label': torch.tensor(label, dtype=torch.long)
        }

In [14]:
chk_data= ReviewDataset(sentences, [1,0], tokenizer, 5)

In [15]:
chk_data[1]

{'encoding': {'input_ids': tensor([[  101,  3414,  2963, 27157,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])},
 'label': tensor(0)}

In [16]:
def createDataLoader(dataset, tokenizer, max_len= 150, batch_size= 32, num_workers=0):
    '''
    dataset is a dictionary with 2 keys 'data', 'labels'
    '''
    encoded_dataset= ReviewDataset(dataset['data'], dataset['labels'], tokenizer, max_len)
    
    return DataLoader(encoded_dataset, batch_size= batch_size, num_workers= num_workers)

In [17]:
train_dataset= {'data': train_reviews, 'labels':train_labels}
train_dataloader= createDataLoader(train_dataset, tokenizer)

In [18]:
dl_item= next(iter(train_dataloader))
print(len(dl_item))

2


In [19]:
dl_item

{'encoding': {'input_ids': tensor([[[  101,  1135,   112,  ...,     0,     0,     0]],
  
          [[  101,   107,  1109,  ...,  1341, 11679,   102]],
  
          [[  101,  1109,  5855,  ...,     0,     0,     0]],
  
          ...,
  
          [[  101,  9913,  5098,  ...,   188,   107,   102]],
  
          [[  101,  6963,  2029,  ...,  1145,  9178,   102]],
  
          [[  101,  1448,  1104,  ...,   112,   188,   102]]]),
  'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0]],
  
          [[0, 0, 0,  ..., 0, 0, 0]],
  
          [[0, 0, 0,  ..., 0, 0, 0]],
  
          ...,
  
          [[0, 0, 0,  ..., 0, 0, 0]],
  
          [[0, 0, 0,  ..., 0, 0, 0]],
  
          [[0, 0, 0,  ..., 0, 0, 0]]]),
  'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],
  
          [[1, 1, 1,  ..., 1, 1, 1]],
  
          [[1, 1, 1,  ..., 0, 0, 0]],
  
          ...,
  
          [[1, 1, 1,  ..., 1, 1, 1]],
  
          [[1, 1, 1,  ..., 1, 1, 1]],
  
          [[1, 1, 1,  ..., 1, 1, 1]]])},
 'lab

In [20]:
ip_tensor= dl_item['encoding']['input_ids']
ip_tensor.shape

torch.Size([32, 1, 150])

In [21]:
dl_item['label'].shape

torch.Size([32])

In [23]:
bert_model = BertModel.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [24]:
ip_id_tensor= dl_item['encoding']['input_ids']
attention_mask_tensor= dl_item['encoding']['attention_mask']
print(ip_id_tensor.shape)
print(attention_mask_tensor.shape)

torch.Size([32, 1, 150])
torch.Size([32, 1, 150])


In [25]:
ip_id_tensor= ip_id_tensor.squeeze()
attention_mask_tensor= attention_mask_tensor.squeeze()
print(ip_id_tensor.shape)
print(attention_mask_tensor.shape)

torch.Size([32, 150])
torch.Size([32, 150])


In [None]:
last_hidden_state, pooled_output = bert_model(
  input_ids= ip_id_tensor, 
  attention_mask= attention_mask_tensor
)

In [1]:
import boto3

In [2]:
import sagemaker 

In [3]:
sagemaker_session= sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()
print(bucket)

sagemaker-ap-south-1-831906679170


In [11]:
s3= boto3.client('s3')
s3.download_fileobj(bucket, 'aclImdb','train')

ValueError: Fileobj must implement write

In [10]:
help(s3.download_fileobj)

Help on method download_fileobj in module boto3.s3.inject:

download_fileobj(Bucket, Key, Fileobj, ExtraArgs=None, Callback=None, Config=None) method of botocore.client.S3 instance
    Download an object from S3 to a file-like object.
    
    The file-like object must be in binary mode.
    
    This is a managed transfer which will perform a multipart download in
    multiple threads if necessary.
    
    Usage::
    
        import boto3
        s3 = boto3.client('s3')
    
        with open('filename', 'wb') as data:
            s3.download_fileobj('mybucket', 'mykey', data)
    
    :type Bucket: str
    :param Bucket: The name of the bucket to download from.
    
    :type Key: str
    :param Key: The name of the key to download from.
    
    :type Fileobj: a file-like object
    :param Fileobj: A file-like object to download into. At a minimum, it must
        implement the `write` method and must accept bytes.
    
    :type ExtraArgs: dict
    :param ExtraArgs: Extra argumen