In [2]:
import numpy as np
import pandas as pd

### Reading and Splitting the Data

In [2]:
# Load the IMDB reviews CSV into a DataFrame and preview its first rows.
data = pd.read_csv("IMDB_Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Binary target column: 1 where the sentiment string equals "positive", 0 otherwise.
# The comparison is done on the whole column at once instead of row by row.
data["label"] = data["sentiment"].eq("positive").astype("int64")
data.head()

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [4]:
# Keep the original text label under the name "label_str", next to the
# integer "label" column, then preview the result.
data = data.rename(columns={"sentiment": "label_str"})
data.head()

Unnamed: 0,review,label_str,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [5]:
# Print the DataFrame summary: columns, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   label_str  50000 non-null  object
 2   label      50000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [6]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of the rows; the remaining 80% is the training set.
data_train, holdout = train_test_split(data, random_state=271, test_size=0.2)

# Stage 2: cut the hold-out in half to obtain the validation and test sets.
data_val, data_test = train_test_split(holdout, random_state=314, test_size=0.5)
del holdout  # temporary only; keep the notebook namespace unchanged

print(*(split.shape for split in (data_train, data_val, data_test)))

(40000, 3) (5000, 3) (5000, 3)


In [7]:
# Write each split to a CSV file in the current working directory,
# leaving the DataFrame index out of the file.
for csv_path, split_frame in (
    ("./imdb_train.csv", data_train),
    ("./imdb_val.csv", data_val),
    ("./imdb_test.csv", data_test),
):
    split_frame.to_csv(csv_path, index=False)

### Processing, Tokenizing and Padding Datasets

In [6]:
import sys

# Put the project checkout on the import path so the NLP_utils package resolves.
sys.path.append("/Users/emrebaloglu/Documents/RL/basic_reinforcement_learning")
import NLP_utils.preprocessing as nlp_processing

data_path = "/Users/emrebaloglu/Documents/RL/basic_reinforcement_learning/NLP_datasets/IMDB_reviews"

# Read the three CSV splits from the dataset folder.
# From here on `data` refers to the training split only.
data = pd.read_csv(f"{data_path}/imdb_train.csv")
data_val = pd.read_csv(f"{data_path}/imdb_val.csv")
data_test = pd.read_csv(f"{data_path}/imdb_test.csv")
print(data.columns)

Index(['review', 'label_str', 'label'], dtype='object')


#### **Process with default tokenizer** (skip this part if you prefer BERT tokenization)

In [4]:
import json

# --- Training split: build the tokenizer and find the padded length -------
# No tokenizer is passed here, so tokenize_data returns a new one.
data, tokenizer = nlp_processing.tokenize_data(data, ["review"], preprocess=True)
print("n_words_1: ", len(tokenizer.word_index) + 1)
data, max_len = nlp_processing.pad_tokenized_data(data, ["review_tokenized"])
print("max_len_1: ", max_len)

# --- Validation split: reuse the tokenizer, pad to the training length ----
# NOTE(review): the recorded output shows the vocabulary size growing after
# the validation and test calls, so tokenize_data appears to extend the
# tokenizer with words from those splits. Confirm this is intended, because
# the vocab_size written to data_info.json below is the final, largest value.
data_val, tokenizer = nlp_processing.tokenize_data(data_val, ["review"], preprocess=True, tokenizer=tokenizer)
print("n_words_2: ", len(tokenizer.word_index) + 1)
data_val, _ = nlp_processing.pad_tokenized_data(data_val, ["review_tokenized"], max_len=max_len)

# --- Test split: same treatment as the validation split -------------------
data_test, tokenizer = nlp_processing.tokenize_data(data_test, ["review"], preprocess=True, tokenizer=tokenizer)
# Labelled n_words_3 so the test figure is not confused with the validation one.
print("n_words_3: ", len(tokenizer.word_index) + 1)
data_test, _ = nlp_processing.pad_tokenized_data(data_test, ["review_tokenized"], max_len=max_len)
print(np.stack(data_test["review_tokenized"].values).shape)

# Save the processed data in pickle files.
nlp_processing.storeDf2Pickle(data, data_path + "/imdb-train.pkl")
nlp_processing.storeDf2Pickle(data_val, data_path + "/imdb-val.pkl")
nlp_processing.storeDf2Pickle(data_test, data_path + "/imdb-test.pkl")

# Record the dataset location, padded length and vocabulary size alongside the pickles.
data_info = {"path": data_path, "max_len": max_len, "vocab_size": len(tokenizer.word_index) + 1}
with open(data_path + "/data_info.json", "w") as out:
    json.dump(data_info, out)

# Sanity check: reload the test pickle and inspect a few rows.
data_test = nlp_processing.openDfFromPickle(data_path + "/imdb-test.pkl")
print(data_test.sample(5))
samples = np.stack(data_test["review_tokenized"].values)
print(samples.shape)

# Split every padded sequence into two halves. np.array_split is used instead
# of np.split because np.split raises ValueError when the sequence length is
# odd; for even lengths both return the same two equal halves.
ss = [np.array_split(row, 2) for row in samples]

print(len(ss), len(ss[0]))
print(ss[0][0].shape)

Vocabulary size: 115956.


100%|██████████| 1/1 [00:04<00:00,  4.19s/it]


n_words_1:  115956
On column ['review_tokenized'], maximum sentence length is 2388.
max_len_1:  2388
Vocabulary size: 122058.


100%|██████████| 1/1 [00:00<00:00,  1.50it/s]


n_words_2:  122058
On column ['review_tokenized'], maximum sentence length is 2388.
Vocabulary size: 127568.


100%|██████████| 1/1 [00:00<00:00,  1.83it/s]


n_words_2:  127568
On column ['review_tokenized'], maximum sentence length is 2388.
(5000, 2388)
                                                 review label_str  label  \
2773  National Lampoon Class Reunion is classic come...  positive      1   
2791  Sondra Locke stinks in this film but then she ...  negative      0   
3114  I wouldn go so far as to not recommend this mo...  positive      1   
2692   Gone With The Wind is one of the most overrat...  negative      0   
774   You ve been fouled and beaten up in submission...  negative      0   

                                       review_tokenized  
2773  [3147, 8965, 9237, 16970, 5, 367, 211, 17, 34,...  
2791  [11366, 8580, 5125, 7, 10, 17, 18, 106, 63, 11...  
3114  [131, 605, 147, 37, 236, 14, 4, 22, 372, 10, 1...  
2692  [4338, 539, 12, 4106, 5, 26, 3, 1, 87, 4554, 9...  
774   [193, 138, 72, 45875, 2, 3620, 50, 7, 12686, 2...  
(5000, 2388)
5000 2
(1194,)


#### **Process with pretrained bert tokenizer**

In [9]:
data_path = "/Users/emrebaloglu/Documents/RL/basic_reinforcement_learning/NLP_datasets/IMDB_reviews"
data = pd.read_csv(data_path + "/imdb_train.csv")

# Largest number of single-space characters found in any training review.
# This is the same quantity as len(review.split(" ")) - 1.
max_len = max(review.count(" ") for review in data["review"])
print(max_len)

2469


In [10]:
# json is imported in this cell so it still runs on a fresh kernel when the
# default-tokenizer section (which also imports json) has been skipped.
import json

data_path = "/Users/emrebaloglu/Documents/RL/basic_reinforcement_learning/NLP_datasets/IMDB_reviews"

# Start again from the CSV splits so this section does not depend on the
# DataFrames produced by the default-tokenizer section.
data = pd.read_csv(data_path + "/imdb_train.csv")
print(data.columns)
data_val = pd.read_csv(data_path + "/imdb_val.csv")
data_test = pd.read_csv(data_path + "/imdb_test.csv")

# Apply nlp_processing.process_df_texts to the "review" column of every split.
data = nlp_processing.process_df_texts(data, ["review"])
data_val = nlp_processing.process_df_texts(data_val, ["review"])
data_test = nlp_processing.process_df_texts(data_test, ["review"])

# NOTE(review): 512 is presumably the BERT model's maximum input length - confirm.
max_len = 512

# The first call passes None as the tokenizer and keeps the one it returns;
# the validation and test calls reuse that tokenizer.
data, tokenizer = nlp_processing.bert_tokenize_data(data, None, ["review"], max_len=max_len)
data_val, tokenizer = nlp_processing.bert_tokenize_data(data_val, tokenizer, ["review"], max_len=max_len)
data_test, tokenizer = nlp_processing.bert_tokenize_data(data_test, tokenizer, ["review"], max_len=max_len)

# Record the dataset location, sequence length and tokenizer vocabulary size.
data_info_bert = {"path": data_path, "max_len": max_len, "vocab_size": tokenizer.vocab_size}
with open(data_path + "/data_info_bert.json", "w") as out:
    json.dump(data_info_bert, out)

print(data.columns)

# Save the BERT-tokenized splits in pickle files.
nlp_processing.storeDf2Pickle(data, data_path + "/imdb-train-bert.pkl")
nlp_processing.storeDf2Pickle(data_val, data_path + "/imdb-val-bert.pkl")
nlp_processing.storeDf2Pickle(data_test, data_path + "/imdb-test-bert.pkl")

# Sanity check: reload the training pickle, look at the first row, and decode
# its input ids back to text with the tokenizer.
data = nlp_processing.openDfFromPickle(data_path + "/imdb-train-bert.pkl")
print(data.columns)
print(data.head())
dd = data.iloc[0,:]
print(dd)
print(data["review_bert_input_ids"].iloc[0])
print(tokenizer.decode(data["review_bert_input_ids"].iloc[0]))

Index(['review', 'label_str', 'label'], dtype='object')


Applying bert-tokenization on review.: 100%|██████████| 40000/40000 [03:03<00:00, 218.37it/s]
Applying bert-tokenization on review.: 100%|██████████| 5000/5000 [00:23<00:00, 214.20it/s]
Applying bert-tokenization on review.: 100%|██████████| 5000/5000 [00:23<00:00, 214.14it/s]


Index(['review', 'label_str', 'label', 'review_bert_input_ids',
       'review_bert_token_type_ids', 'review_bert_attention_mask'],
      dtype='object')
Index(['review', 'label_str', 'label', 'review_bert_input_ids',
       'review_bert_token_type_ids', 'review_bert_attention_mask'],
      dtype='object')
                                              review label_str  label  \
0  Once again fell for it in my roots crave fun a...  negative      0   
1  Of all the movies in the history of movies can...  negative      0   
2  Like most other reviewers really enjoyed this ...  positive      1   
3  What waste of time ve tried to sit through Sky...  negative      0   
4  Not only is this very interesting exploration ...  positive      1   

                               review_bert_input_ids  \
0  [101, 2857, 1254, 2204, 1111, 1122, 1107, 1139...   
1  [101, 2096, 1155, 1103, 5558, 1107, 1103, 1607...   
2  [101, 2409, 1211, 1168, 19475, 1541, 4927, 114...   
3  [101, 1327, 5671, 1104, 11

In [12]:
def foo(a, b, c):
    """Return the three arguments, in order, as a single tuple."""
    packed = (a, b, c)
    return packed


# Starred unpacking: `a` takes the first returned value and `b` collects
# all remaining values into a list.
a, *b = foo(1, 2, 3)
a

1

In [13]:
# Display what the starred target `*b` collected from foo(1, 2, 3).
b

[2, 3]