In [0]:
!pip install transformers

In [0]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [3]:
# Load the SST-2 training file: a tab-separated file without a header row,
# so the columns are addressed by position (0 and 1).
df = pd.read_csv(
    'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
    sep='\t',
    header=None,
)
df.head()


Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


**Data Pre-processing (Raw data --> Features)**

In [0]:
#Utility functions

def tokenize_sentence(sentence):
  """Encode one sentence into token ids with the module-level ``tokenizer``.

  Args:
    sentence: Raw text of a single sentence.

  Returns:
    The result of ``tokenizer.encode(sentence, add_special_tokens=True)``.
  """
  # Encode the argument itself rather than a module-level variable, so the
  # result never depends on how the caller named its loop variable.
  tokenized = tokenizer.encode(sentence, add_special_tokens=True)
  return tokenized

def max_len_for_padding(tokenized_sentences):
  """Return the length of the longest tokenized sentence, or 0 if there are none."""
  return max((len(token_ids) for token_ids in tokenized_sentences), default=0)

def padding_tokenized_sentences(tokenized_sentences, pad_token_id=0):
  """Right-pad tokenized sentences to a common length and build the attention mask.

  Args:
    tokenized_sentences: Sequence of token-id lists, possibly of different lengths.
    pad_token_id: Id written into the padding positions. Defaults to 0.

  Returns:
    A ``(padded, attention_mask)`` pair of integer arrays, both shaped
    ``(len(tokenized_sentences), max_len)``. ``attention_mask`` is 1 over each
    sentence's own tokens and 0 over the padding.
  """
  max_len = max_len_for_padding(tokenized_sentences)
  shape = (len(tokenized_sentences), max_len)

  padded_rows = []
  mask_rows = []
  for token_sent in tokenized_sentences:
    n_pad = max_len - len(token_sent)
    padded_rows.append(list(token_sent) + [pad_token_id]*n_pad)
    # The mask comes from the sentence length, not from comparing ids with the
    # pad value, so a real token that happens to equal the pad id stays visible.
    mask_rows.append([1]*len(token_sent) + [0]*n_pad)

  # The explicit reshape keeps the 2-D shape even when the input is empty.
  padded = np.reshape(np.array(padded_rows, dtype=np.int64), shape)
  attention_mask = np.reshape(np.array(mask_rows, dtype=np.int64), shape)
  return padded,attention_mask


def get_features(padded,attention_mask,model):
  """Run ``model`` on a padded batch and return position 0 of its first output.

  Args:
    padded: Array of token ids, converted to a tensor and passed as the
      model's first positional argument.
    attention_mask: Array converted to a tensor and passed as the model's
      ``attention_mask`` keyword argument.
    model: Callable invoked as ``model(input_ids, attention_mask=...)``. The
      first element of its result is indexed as ``[:, 0, :]``.

  Returns:
    NumPy array holding, for every row, position 0 of the model's first output.
  """
  ids_tensor = torch.tensor(padded)
  mask_tensor = torch.tensor(attention_mask)

  # Inference only, so gradient tracking is switched off.
  with torch.no_grad():
      model_output = model(ids_tensor, attention_mask=mask_tensor)

  first_output = model_output[0]
  first_position = first_output[:, 0, :]
  return first_position.numpy()

In [5]:
# Pretrained model: choose the model class, tokenizer class and checkpoint name.
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel,ppb.BertTokenizer,'bert-base-uncased')
# Alternative (disabled): DistilBERT instead of BERT.
#model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
# Load the pretrained tokenizer and model for the selected checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)


# Pre-processing: tokenize each sentence, pad to a common length, extract features.
tokenized_sentences = []

for review in df[0][:2000]: # only the first 2000 sentences are used, for performance reasons
  tokenized_sentences.append(tokenize_sentence(review))

# Length of the longest tokenized sentence among those just encoded.
max_len = max_len_for_padding(tokenized_sentences)
print(len(tokenized_sentences))

# Both arrays have one row per sentence and one column per token position.
padded,attention_mask = padding_tokenized_sentences(tokenized_sentences)
print(padded.shape,attention_mask.shape)

# One feature vector per sentence, produced by the pretrained model.
features = get_features(padded,attention_mask,model)
print(features.shape)


2000
(2000, 59) (2000, 59)
(2000, 768)


**Classification Algorithm - Scikit-learn**


In [6]:
labels = df[1][:2000] # target variable for the reviews in the dataset (same 2000 rows used for the features)
# A fixed seed makes the train/test partition, and so the reported accuracy,
# reproducible across runs.
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, random_state=42)
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)
# Mean accuracy on the held-out split, displayed as the cell's output.
lr_clf.score(test_features, test_labels)



0.842

## **Predict the Sentiment**




In [0]:
def feature_extraction(review):
  """Turn one raw review into the feature row the classifier consumes.

  The review is encoded with the module-level ``tokenizer`` and sent, as a
  batch of one, through the module-level ``model`` via ``get_features``.

  Args:
    review: Raw text of a single review.

  Returns:
    The array returned by ``get_features`` for this one-row batch.
  """
  tokenized = tokenizer.encode(review, add_special_tokens=True)
  # A batch of one needs no padding: every position is a real token, so the
  # mask is all ones. Padding to a fixed global length would make any review
  # longer than that length fail when reshaping.
  input_ids = np.array([tokenized])
  attention_mask = np.array([[1]*len(tokenized)])
  return get_features(input_ids, attention_mask, model)

def predict_sentiment(review,model):
  """Classify a single review as 'Positive' or 'Negative'.

  Args:
    review: Raw text of the review; passed to ``feature_extraction``.
    model: Fitted classifier exposing ``predict(features)``.

  Returns:
    'Positive' when the first predicted label equals 1, otherwise 'Negative'.
  """
  predicted_label = model.predict(feature_extraction(review))[0]
  return 'Positive' if predicted_label == 1 else 'Negative'


In [8]:
# Run the whole pipeline (feature extraction, then the fitted classifier) on one sentence.
review= "I'm glad that I have watched this movie"
sentiment = predict_sentiment(review,lr_clf)
print(sentiment)

Positive


In [9]:
# Same sentence with the verb negated ("haven't"), to compare the prediction.
review= "I'm glad that I haven't watched this movie"
sentiment = predict_sentiment(review,lr_clf)
print(sentiment)

Negative
