# Training a BERT model for Sentiment Prediction

Project for a Udemy course.

In [None]:
# Install ktrain into the runtime. No version is pinned, so pip picks whichever release is current.
!pip install ktrain

## Get the IMDB Movie Reviews dataset

In [3]:
# Fetch the repository holding train.xlsx / test.xlsx; git clones into the current working directory.
!git clone https://github.com/laxmimerit/IMDB-Movie-Reviews-Large-Dataset-50k.git

Cloning into 'IMDB-Movie-Reviews-Large-Dataset-50k'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 10 (delta 1), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (10/10), done.


In [4]:
import numpy as np
import pandas as pd
import ktrain
from ktrain import text
import tensorflow as tf

In [5]:
# Folder produced by the `git clone` cell. That command clones into the current
# working directory, so the folder is resolved relative to it as well instead of
# assuming the working directory is /content.
DATA_DIR = 'IMDB-Movie-Reviews-Large-Dataset-50k'

# dtype=str loads every column as text, so reviews and labels are never
# reinterpreted as numbers or dates by the Excel reader.
data_train = pd.read_excel(f'{DATA_DIR}/train.xlsx', dtype=str)
data_test = pd.read_excel(f'{DATA_DIR}/test.xlsx', dtype=str)

In [6]:
# Preview the first rows of the training split (review text and its sentiment label).
data_train.head()

Unnamed: 0,Reviews,Sentiment
0,"When I first tuned in on this morning news, I ...",neg
1,"Mere thoughts of ""Going Overboard"" (aka ""Babes...",neg
2,Why does this movie fall WELL below standards?...,neg
3,Wow and I thought that any Steven Segal movie ...,neg
4,"The story is seen before, but that does'n matt...",neg


In [7]:
# Preview the first rows of the test split (review text and its sentiment label).
data_test.head()

Unnamed: 0,Reviews,Sentiment
0,Who would have thought that a movie about a ma...,pos
1,After realizing what is going on around us ......,pos
2,I grew up watching the original Disney Cindere...,neg
3,David Mamet wrote the screenplay and made his ...,pos
4,"Admittedly, I didn't have high expectations of...",neg


In [8]:
# (rows, columns) of the training split and of the test split.
data_train.shape, data_test.shape

((25000, 2), (25000, 2))

## Preprocessing using ktrain

In [9]:
# Run ktrain's BERT preprocessing over both frames.
# - data_train supplies the training examples; data_test is passed as the
#   validation frame, so X_test / y_test are what the learner validates on.
# - 'Reviews' holds the text and 'Sentiment' the label.
# - maxlen=500 is the sequence length handed to the preprocessor.
# `preproc` is returned alongside the arrays and is needed later to build the
# model and the predictor.
(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(
    train_df=data_train,
    val_df=data_test,
    text_column='Reviews',
    label_columns='Sentiment',
    maxlen=500,
    preprocess_mode='bert',
)

['neg', 'pos']
   neg  pos
0  1.0  0.0
1  1.0  0.0
2  1.0  0.0
3  1.0  0.0
4  1.0  0.0
['neg', 'pos']
   neg  pos
0  0.0  1.0
1  0.0  1.0
2  1.0  0.0
3  0.0  1.0
4  1.0  0.0
downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


In [10]:
# Build the 'bert' text classifier from the preprocessed training arrays and
# the preprocessor that produced them.
model = text.text_classifier(
    name='bert',
    train_data=(X_train, y_train),
    preproc=preproc,
)

Is Multi-Label? False
maxlen is 500
done.


In [23]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the classifier.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 500)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 500)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 500, 768), ( 23440896    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 500, 768)     1536        Input-Segment[0][0]              
____________________________________________________________________________________________

In [11]:
# Wrap the model with its training and validation data in a ktrain learner.
# Batches of 6 examples are used for training.
learner = ktrain.get_learner(
    model=model,
    train_data=(X_train, y_train),
    val_data=(X_test, y_test),
    batch_size=6,
)

In [27]:
# Echo the learner to confirm which ktrain learner class was created.
learner

<ktrain.text.learner.BERTTextClassLearner at 0x7f542ed18810>

In [12]:

# Takes a few days to find the best learning rate, so the search is left commented out
#learner.lr_find()
#learner.lr_plot()

## Training the model 

In [21]:
# Train for a single epoch with fit_onecycle at lr=2e-5.
# Expect a long wait: this took roughly 1.5-2 hours when the notebook was run.
learner.fit_onecycle(lr=2e-5, epochs=1)

In [28]:
# Pair the learner's model with the preprocessor so raw review strings can be
# classified directly.
# Alternative: restore a previously saved predictor instead of using the
# in-memory model:
#   predictor = ktrain.load_predictor('/content/bert')
predictor = ktrain.get_predictor(learner.model, preproc)
predictor

<ktrain.text.predictor.TextPredictor at 0x7f53eba7bdd0>

## Test on some sample strings

In [15]:
# Hand-written sample reviews used to sanity-check the predictor.
data = [
    'This movie was terrible, acting was bad',
    'it was a great movie',
    'What a fantastic movie we loved it ',
]

In [16]:
# Predict a sentiment label for each sample review.
predictor.predict(data)

['neg', 'pos', 'pos']

In [17]:
# Same samples, but request per-class probabilities instead of labels.
# NOTE(review): columns presumably follow the order of predictor.get_classes() - confirm.
predictor.predict(data, return_proba=True)

array([[0.9974025 , 0.00259753],
       [0.00459951, 0.9954005 ],
       [0.0054377 , 0.9945623 ]], dtype=float32)

In [18]:
# List the class labels known to the predictor.
predictor.get_classes()

['neg', 'pos']

## Save the model 

In [19]:
# Write the predictor to the /content/bert directory so it can be reloaded later.
predictor.save('/content/bert')

In [20]:
# Recursively archive the saved predictor directory into /content/bert.zip.
!zip -r /content/bert.zip /content/bert
# Next step: download the zip file.

  adding: content/bert/ (stored 0%)
  adding: content/bert/tf_model.h5 (deflated 11%)
  adding: content/bert/tf_model.preproc (deflated 52%)


## Load the model

In [22]:
# Reload the predictor from the directory written by predictor.save().
predictor_load = ktrain.load_predictor('/content/bert')

In [26]:
# Echo the reloaded object to confirm that loading produced a predictor.
predictor_load

<ktrain.text.predictor.TextPredictor at 0x7f542e38ddd0>