# Preparing dependencies...

In [None]:
!pip install transformers
!pip install torch

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
% pip install sentencepiece

In [None]:
# Build and install the SentencePiece C++ library from source.
# Every `!` line runs in its own subshell, so a bare `!cd` does not carry over to
# the next line. The steps are chained with `&&` so they run inside the intended
# directory and stop at the first failure.
!git clone https://github.com/google/sentencepiece.git
!cd sentencepiece && mkdir -p build && cd build && cmake .. && make -j $(nproc) && sudo make install && sudo ldconfig -v

In [None]:
# Install SentencePiece through vcpkg.
# `!cd` only affects its own subshell, so the vcpkg commands are chained on one
# line to make sure they execute inside the cloned `vcpkg` directory.
!git clone https://github.com/Microsoft/vcpkg.git
!cd vcpkg && ./bootstrap-vcpkg.sh && ./vcpkg integrate install && ./vcpkg install sentencepiece

In [None]:
!pip install -q -U watermark

In [None]:
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

In [None]:
!pip install numpy git+https://github.com/makcedward/nlpaug.git

In [None]:
# Version specifiers are quoted: an unquoted `>` is a shell redirection, which
# would drop the constraint and write pip's output to a file named `=1.6.0`.
!pip install "torch>=1.6.0" "transformers>=4.0.0" sentencepiece

In [None]:
# Quoted so the shell does not treat `>` as an output redirection.
!pip install "nltk>=3.4.5"

In [None]:
!pip install textaugment

In [None]:
!brew install wget

!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
!gzip -d GoogleNews-vectors-negative300.bin.gz

In [None]:
# Quoted so the shell does not treat `>` as an output redirection.
!pip install "librosa>=0.7.1" matplotlib

In [None]:
!pip install wget

In [None]:
!pip install -U sentence-transformers


# Read Dataset

In [None]:
import transformers
from transformers import XLNetTokenizer, XLNetModel, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn import metrics
from collections import defaultdict
from textwrap import wrap
from pylab import rcParams

from torch import nn, optim
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset,RandomSampler,SequentialSampler
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.utils import shuffle
import re
from transformers import XLNetTokenizer, XLNetModel

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action
from nltk.corpus import stopwords
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import word_tokenize
from sklearn import metrics
from transformers import XLNetForSequenceClassification
import sklearn
from sentence_transformers import SentenceTransformer, util
import graphviz
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

nltk.download('punkt')
nltk.download('sentiwordnet')
nltk.download('stopwords')
stop_words=stopwords.words('english')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [None]:
path_to_data = "resources/imdb.csv"
df = pd.read_csv(path_to_data)
df.head()

In [None]:
df = shuffle(df)

In [None]:
df = df[:40000]
len(df)

40000

In [None]:
def clean_text(text):
    """Normalise a raw review string before tokenisation.

    Steps, in order: replace @mentions and http(s) URLs with a space, replace
    every character that is not an ASCII letter, a digit or one of ``. ! ? '``
    with a space, then collapse runs of spaces into a single space.

    Args:
        text: The raw review text (``str``).

    Returns:
        The cleaned string. A single leading or trailing space is not stripped.
    """
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    # The class must be spelled A-Za-z: the range A-z also spans the ASCII
    # punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which would survive cleaning.
    # Tabs and newlines are not in the allowed set, so they become spaces here too.
    text = re.sub(r"[^a-zA-Z.!?'0-9]", ' ', text)
    text = re.sub(r" +", ' ', text)
    return text

In [None]:
df['review'] = df['review'].apply(clean_text)

In [None]:
def sentiment2label(sentiment):
    """Map a sentiment string to its integer class id.

    Returns 1 when ``sentiment`` equals "positive" and 0 for any other value.
    """
    return 1 if sentiment == "positive" else 0

df['sentiment'] = df['sentiment'].apply(sentiment2label)

In [None]:
df['sentiment'].value_counts()

1    20004
0    19996
Name: sentiment, dtype: int64

In [None]:
class_names = ['negative', 'positive']

In [None]:
PRE_TRAINED_MODEL_NAME = 'xlnet-base-cased'
tokenizer = XLNetTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1382015.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




In [None]:
# Token count of every cleaned review, capped at 512 tokens.
# `truncation=True` makes the cap explicit instead of relying on the tokenizer's
# implicit fallback (which emits a warning for every call).
token_lens = [
  len(tokenizer.encode(txt, max_length=512, truncation=True))
  for txt in df['review']
]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
MAX_LEN = 512

In [None]:
class ImdbDataset(Dataset):
    """Torch dataset that tokenises one review per item and fits it to ``max_len``.

    Args:
        reviews: Indexable collection of review texts.
        targets: Indexable collection of labels aligned with ``reviews``.
        tokenizer: Object exposing ``encode_plus`` (an ``XLNetTokenizer`` in this notebook).
        max_len: Length every encoded sequence is truncated / right-padded to.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def _fit_to_max_len(self, tensor):
        """Cut a ``(1, L)`` tensor on the right, then right-pad with zeros, to ``(1, max_len)``."""
        tensor = tensor[:, :self.max_len].to(torch.long)
        return F.pad(tensor, (0, self.max_len - tensor.shape[1]), value=0)

    def __getitem__(self, item):
        """Return the encoded review at position ``item``.

        Returns:
            dict with
              * ``review_text``: the review as ``str``
              * ``input_ids``: int64 tensor of shape ``(1, max_len)``
              * ``attention_mask``: int64 tensor of shape ``(max_len,)``
              * ``targets``: int64 scalar tensor holding the label
        """
        review = str(self.reviews[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,  # explicit, instead of the implicit fallback that warns
            return_token_type_ids=False,
            padding=False,  # padding is applied below so it is always on the right
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Pad to the length this dataset was configured with rather than to a
        # module-level constant, so `max_len` is honoured consistently.
        input_ids = self._fit_to_max_len(encoding['input_ids'])
        attention_mask = self._fit_to_max_len(encoding['attention_mask'])

        return {
            'review_text': review,
            'input_ids': input_ids,
            'attention_mask': attention_mask.flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [None]:
df_train, df_test = train_test_split(df, test_size=0.5, random_state=101)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=101)

In [None]:
df_train.shape, df_val.shape, df_test.shape

In [None]:
test_text=[]
for txt in df_test['review']:
  if len(txt.split(' '))<30:
    test_text.append(txt)
print(len(test_text))
print(test_text)

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a review DataFrame in an ``ImdbDataset`` and return a ``DataLoader``.

  Args:
    df: DataFrame with a ``review`` column (text) and a ``sentiment`` column (label).
    tokenizer: Tokenizer handed to ``ImdbDataset``.
    max_len: Sequence length handed to ``ImdbDataset``.
    batch_size: Number of samples per batch.
    shuffle: Whether the loader reshuffles the data every epoch. Defaults to
      ``False``, which keeps the original fixed-order behaviour.
    num_workers: Number of loader worker processes. Defaults to 4, the
      previously hard-coded value.

  Returns:
    A ``torch.utils.data.DataLoader`` over the dataset.
  """
  ds = ImdbDataset(
    reviews=df.review.to_numpy(),
    targets=df.sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
BATCH_SIZE = 4

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels = 2)
model = model.to(device)

In [None]:
model

# Training The XLNET Model

In [None]:
EPOCHS = 3

# Split the parameters into two groups: weights that receive weight decay, and
# biases / layer-normalisation parameters that do not.
param_optimizer = list(model.named_parameters())
# 'LayerNorm.*' is the BERT-style naming. Hugging Face's XLNet implementation
# names its normalisation layers `layer_norm`, so that pattern is listed too;
# otherwise the layer-norm weights would silently end up in the decayed group.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.weight']
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01,
    },
    {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_data_loader) * EPOCHS

# Linear decay of the learning rate to zero, without warm-up.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)


In [None]:
data = next(iter(val_data_loader))
data.keys()

In [None]:
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., token_type_ids=None,
            attention_mask=..., labels=...)`` whose output is indexable as
            ``(loss, logits)``.
        data_loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``targets`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Unused; kept so existing call sites keep working.

    Returns:
        ``(accuracy, mean_loss)``: the fraction of correctly classified samples
        and the mean of the per-batch losses. ``(0.0, nan)`` for an empty loader.
    """
    model = model.train()
    losses = []
    correct = 0
    seen = 0

    for d in data_loader:
        # Flatten to (batch, seq_len) without assuming a batch size or sequence
        # length, so a smaller final batch or a different MAX_LEN does not crash.
        input_ids = d["input_ids"]
        input_ids = input_ids.reshape(-1, input_ids.size(-1)).to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=targets)
        loss = outputs[0]
        logits = outputs[1]

        # Count hits per sample so batches of different sizes are weighted correctly.
        _, prediction = torch.max(logits, dim=1)
        correct += (prediction == targets).sum().item()
        seen += targets.size(0)
        losses.append(loss.item())

        loss.backward()

        # Clip before stepping to keep the update bounded.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    if seen == 0:
        return 0.0, float('nan')
    return correct / seen, float(np.mean(losses))

In [None]:
def eval_model(model, data_loader, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids=..., token_type_ids=None,
            attention_mask=..., labels=...)`` whose output is indexable as
            ``(loss, logits)``.
        data_loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``targets`` tensors.
        device: Device the batch tensors are moved to.
        n_examples: Unused; kept so existing call sites keep working.

    Returns:
        ``(accuracy, mean_loss)``: the fraction of correctly classified samples
        and the mean of the per-batch losses. ``(0.0, nan)`` for an empty loader.
    """
    model = model.eval()
    losses = []
    correct = 0
    seen = 0

    with torch.no_grad():
        for d in data_loader:
            # Flatten to (batch, seq_len) without assuming a batch size or
            # sequence length, so a smaller final batch does not crash.
            input_ids = d["input_ids"]
            input_ids = input_ids.reshape(-1, input_ids.size(-1)).to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=targets)
            loss = outputs[0]
            logits = outputs[1]

            # Count hits per sample so batches of different sizes are weighted correctly.
            _, prediction = torch.max(logits, dim=1)
            correct += (prediction == targets).sum().item()
            seen += targets.size(0)
            losses.append(loss.item())

    if seen == 0:
        return 0.0, float('nan')
    return correct / seen, float(np.mean(losses))

In [None]:
%%time
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,     
        optimizer, 
        device, 
        scheduler, 
        len(df_train)
    )

    print(f'Train loss {train_loss} Train accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader, 
        device, 
        len(df_val)
    )

    print(f'Val loss {val_loss} Val accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'models/xlnet_model.bin')
        best_accuracy = val_acc

Epoch 1/3
----------


  cpuset_checked))


Train loss 0.38821425964966766 Train accuracy 0.91075
Val loss 0.304517989745643 Val accuracy 0.9383

Epoch 2/3
----------
Train loss 0.20119765334959958 Train accuracy 0.9595
Val loss 0.30087769410003673 Val accuracy 0.9417

Epoch 3/3
----------
Train loss 0.08985227801068103 Train accuracy 0.9839
Val loss 0.31702735697345924 Val accuracy 0.9479

CPU times: user 1h 55min 20s, sys: 2min 27s, total: 1h 57min 48s
Wall time: 1h 57min 45s


In [None]:
model.load_state_dict(torch.load('models/xlnet_model.bin'))

In [None]:
model = model.to(device)

In [None]:
test_acc, test_loss = eval_model(
  model,
  test_data_loader,
  device,
  len(df_test)
)

print('Test Accuracy :', test_acc)
print('Test Loss :', test_loss)

Test Accuracy : 0.9478
Test Loss : 0.3218533544650476


In [None]:
def get_predictions(model, data_loader):
    """Collect texts, predictions, class probabilities and labels for a loader.

    Args:
        model: Module called as ``model(input_ids=..., token_type_ids=None,
            attention_mask=..., labels=...)`` whose output is indexable as
            ``(loss, logits)``. It must own at least one parameter; batches are
            moved to the device of its first parameter.
        data_loader: Iterable of batches with ``review_text``, ``input_ids``,
            ``attention_mask`` and ``targets`` entries. Must yield at least one batch.

    Returns:
        ``(review_texts, predictions, prediction_probs, real_values)`` where
        ``review_texts`` is a list of strings and the other three are CPU
        tensors with one row per sample (``prediction_probs`` has one column per class).
    """
    model = model.eval()
    # Follow the model's own device instead of a notebook-level global.
    device = next(model.parameters()).device

    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d["review_text"]
            # Flatten to (batch, seq_len) without assuming a batch size or
            # sequence length, so a smaller final batch does not crash.
            input_ids = d["input_ids"]
            input_ids = input_ids.reshape(-1, input_ids.size(-1)).to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=targets)
            logits = outputs[1]

            _, preds = torch.max(logits, dim=1)
            probs = F.softmax(logits, dim=1)

            review_texts.extend(texts)
            # Store whole batches on the CPU; they are concatenated once at the end.
            predictions.append(preds.cpu())
            prediction_probs.append(probs.cpu())
            real_values.append(targets.cpu())

    predictions = torch.cat(predictions)
    prediction_probs = torch.cat(prediction_probs)
    real_values = torch.cat(real_values)
    return review_texts, predictions, prediction_probs, real_values

In [None]:
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
  model,
  test_data_loader
)

In [None]:
print(classification_report(y_test, y_pred, target_names=class_names))

              precision    recall  f1-score   support

    negative       0.96      0.94      0.95      5004
    positive       0.94      0.96      0.95      4996

    accuracy                           0.95     10000
   macro avg       0.95      0.95      0.95     10000
weighted avg       0.95      0.95      0.95     10000



In [None]:
def predict_sentiment(text, verbose=True):
    """Classify a single review with the fine-tuned notebook-level ``model``.

    Relies on the notebook-level ``tokenizer``, ``model``, ``device``,
    ``MAX_LEN`` and ``class_names``.

    Args:
        text: The review text to classify.
        verbose: When ``True`` (the default) print the prediction, both class
            scores and the review text. Pass ``False`` when scoring many texts
            in a loop.

    Returns:
        A 0-dim tensor holding the predicted class index.

    Note:
        Puts ``model`` into evaluation mode as a side effect.
    """
    review_text = text

    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=MAX_LEN,
        add_special_tokens=True,
        truncation=True,  # explicit, instead of the implicit fallback that warns
        return_token_type_ids=False,
        padding=False,  # padding is applied below so it is always on the right
        return_attention_mask=True,
        return_tensors='pt',
    )

    def _fit_to_max_len(tensor):
        # Cut on the right, then right-pad with zeros, to shape (1, MAX_LEN).
        tensor = tensor[:, :MAX_LEN].to(torch.long)
        return F.pad(tensor, (0, MAX_LEN - tensor.shape[1]), value=0)

    input_ids = _fit_to_max_len(encoded_review['input_ids']).to(device)
    attention_mask = _fit_to_max_len(encoded_review['attention_mask']).to(device)

    # Inference only: disable dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # No labels were passed, so the first output holds the logits; take the single row.
    logits = outputs[0][0].cpu().detach()

    probs = F.softmax(logits, dim=-1).numpy().tolist()
    _, prediction = torch.max(logits, dim=-1)

    if verbose:
        print(prediction)
        print("Positive score:", probs[1])
        print("Negative score:", probs[0])
        print(f'Review text: {review_text}')
        print(f'Sentiment  : {class_names[prediction]}')
    return prediction



In [None]:
text = "Movie is the worst one I have ever seen!! The story has no meaning at all"
predict_sentiment(text)

In [None]:
text = "This is the best movie I have ever seen!! The story is such a motivation"
predict_sentiment(text)

# ALIME And tree-ALIME

In [None]:
def data_augment_text(text,n,stop_words):
  """Generate perturbed variants of ``text`` with a fixed mix of nlpaug augmenters.

  The mix is, in order: contextual insertion (BERT), contextual substitution
  (BERT, DistilBERT, RoBERTa), WordNet synonyms, antonyms and random word
  augmentation.

  Args:
    text: The sentence to perturb.
    n: Target number of variants. The first five shares are n/7, n/7, n/7,
      n/3.5 and n/3.5 (rounded down), antonyms get n/28, and random word
      augmentation only gets whatever is left of ``n``. Because the first six
      shares can already add up to more than ``n``, the result may contain
      slightly more than ``n`` items and the last share may be empty.
    stop_words: Words the synonym, antonym and random augmenters must leave untouched.

  Returns:
    A list of augmented strings.
  """
  augmenters = [
    naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert", aug_p=1.0, aug_min=5),
    naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute", aug_p=1.0, aug_min=5),
    naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute", aug_p=1.0, aug_min=5),
    naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute", aug_p=1.0, aug_min=5),
    naw.SynonymAug(aug_src='wordnet', stopwords=stop_words, aug_p=1.0, aug_min=5),
    naw.AntonymAug(stopwords=stop_words, aug_p=1.0, aug_min=5),
    naw.RandomWordAug(stopwords=stop_words, aug_p=1.0, aug_min=5),
  ]

  one_seventh = int(n / 7)
  two_sevenths = int(n / 3.5)
  quotas = [one_seventh, one_seventh, one_seventh, two_sevenths, two_sevenths, int(n / 28)]
  # The remainder goes to the random augmenter; it is zero or negative whenever
  # the six shares above already reach n (e.g. n=150 gives 152).
  quotas.append(n - sum(quotas))

  result = []
  for augmenter, quota in zip(augmenters, quotas):
    # Never forward a zero/negative sample count to nlpaug.
    if quota <= 0:
      continue
    augmented = augmenter.augment(text, quota)
    # NOTE(review): some nlpaug releases appear to return a bare string for a
    # single sample; wrap it so `+=` does not splice characters into the list.
    if isinstance(augmented, str):
      augmented = [augmented]
    result += augmented
  return result

In [None]:
model_sim = SentenceTransformer('paraphrase-mpnet-base-v2')

In [None]:
def calc_sims(data , x):
  """Cosine similarity between every text in ``data`` and the reference text ``x``.

  Uses the notebook-level sentence encoder ``model_sim``.

  Args:
    data: Sequence of strings.
    x: The reference string.

  Returns:
    A float64 NumPy array of length ``len(data)``; entry ``i`` is the cosine
    similarity between the embeddings of ``data[i]`` and ``x``.
  """
  if len(data) == 0:
    return np.zeros(0)
  # The reference embedding does not depend on the loop, so it is computed once,
  # and all candidate texts are encoded in a single batched call.
  reference_embedding = model_sim.encode(x)
  data_embeddings = model_sim.encode(list(data))
  similarities = util.pytorch_cos_sim(data_embeddings, reference_embedding)
  return similarities.reshape(-1).cpu().numpy().astype(np.float64)

In [None]:
def model_predict(dataset):
  """Label every text in ``dataset`` with ``predict_sentiment``.

  Returns a float NumPy array with one predicted class index per text.
  """
  predicted = np.zeros(len(dataset))
  position = 0
  while position < len(dataset):
    predicted[position] = predict_sentiment(dataset[position])
    position += 1
  return predicted

In [None]:
def extract_features(data):
  """Fit a TF-IDF vectorizer on ``data`` and return its feature matrix.

  Args:
    data: Iterable of text documents.

  Returns:
    ``(feature_set, tfidf_vectorizer)``: the sparse TF-IDF matrix of ``data``
    and the fitted vectorizer (terms present in more than 95% of the documents
    are dropped via ``max_df=0.95``).
  """
  tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)
  # fit_transform already returns the matrix for `data`; a second transform
  # pass over the same documents would only repeat the work.
  feature_set = tfidf_vectorizer.fit_transform(data)
  return feature_set, tfidf_vectorizer

In [None]:
# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))
def preprocess_text(text):
  """Clean, lower-case and tokenise ``text``, then drop stop words.

  Uses ``clean_text`` and the notebook-level ``stop_words`` collection as it is
  defined at call time.

  Args:
    text: Raw input string.

  Returns:
    The remaining tokens joined by single spaces.
  """
  text = clean_text(text).lower()
  # Full stops become separators, and the contraction suffix "n't" is expanded to " not".
  text = text.replace('.', ' ').replace('n\'t', ' not')
  # A set gives constant-time membership tests instead of scanning a list per token.
  excluded = set(stop_words)
  kept_tokens = [word for word in word_tokenize(text) if word not in excluded]
  return " ".join(kept_tokens)

In [None]:
 def get_num_of_pos_and_neg_words(text):
   """Count the tokens of ``text`` that lean positive or negative in SentiWordNet.

   A token counts as positive when more of its SentiWordNet senses have a
   positive score above their negative score than the other way round, and as
   negative in the mirrored case. Tokens with a tie (or no senses) count as neither.

   Returns:
     ``(num_pos_words, num_neg_words)``.
   """
   positive_words = 0
   negative_words = 0
   for token in word_tokenize(text):
     # Materialise the senses once so both tallies see the same entries.
     senses = list(swn.senti_synsets(token))
     pos_votes = sum(1 for sense in senses if sense.pos_score() > sense.neg_score())
     neg_votes = sum(1 for sense in senses if sense.pos_score() < sense.neg_score())
     if pos_votes > neg_votes:
       positive_words += 1
     elif neg_votes > pos_votes:
       negative_words += 1
   return positive_words, negative_words

In [None]:
num_pos_words,num_neg_words=get_num_of_pos_and_neg_words(preprocess_text(text))

In [None]:
x='This movie is the worst one I have ever seen!! The story has no meaning at all'

In [None]:
def ALIME_main_and_tree(x, n , model):
  """Fit local surrogate models (linear and decision tree) around one text.

  The text is augmented into a local neighbourhood, every neighbour is labelled
  by ``model_predict`` and weighted by ``exp(-(1 - similarity))`` to the
  original text, and two surrogates are fitted on that weighted data.

  Args:
    x: The text to explain.
    n: Number of augmented samples requested from ``data_augment_text``.
    model: Unused; labelling goes through ``model_predict``. Kept so existing
      call sites keep working.

  Returns:
    ``(l_model, tree_model, predict, feature_set, tfidf_vectorizer)``:
      * ``l_model``: logistic regression fitted on the TF-IDF features, or
        ``None`` when every neighbour received the same label.
      * ``tree_model``: depth-5 decision tree fitted on
        ``[positive word count, negative word count, TF-IDF features...]``
        (in that column order), or ``None`` in the same single-label case.
      * ``predict``: the shared label (0.0 or 1.0) in the single-label case,
        otherwise ``None``.
      * ``feature_set``: sparse TF-IDF matrix of the neighbourhood.
      * ``tfidf_vectorizer``: the fitted vectorizer.
  """
  # Imported explicitly: `import sklearn` alone does not guarantee that the
  # `linear_model` and `tree` submodules are loaded.
  from sklearn.linear_model import LogisticRegression
  from sklearn.tree import DecisionTreeClassifier

  prep_x = preprocess_text(x)
  prep_data = [preprocess_text(sample) for sample in data_augment_text(x, n, stop_words)]

  # Neighbours closer to the original text get a weight closer to 1.
  sim = calc_sims(prep_data, prep_x)
  weights = np.exp(-(1 - sim))

  local_dataset = list(prep_data)
  labels = model_predict(local_dataset)
  n_positive = np.sum(labels)

  feature_set, tfidf_vectorizer = extract_features(local_dataset)

  l_model = None
  tree_model = None
  predict = None

  if n_positive == 0 or n_positive == len(labels):
    # Only one class in the neighbourhood: nothing to fit, report that class.
    predict = n_positive / len(labels)
  else:
    word_counts = [get_num_of_pos_and_neg_words(sample) for sample in prep_data]
    pos_counts = np.asarray([pos for pos, _ in word_counts]).reshape(-1, 1)
    neg_counts = np.asarray([neg for _, neg in word_counts]).reshape(-1, 1)
    # Column order is positive count, negative count, then the TF-IDF columns;
    # any feature-name list used with the tree must follow this order.
    new_feature_set = np.concatenate((pos_counts, neg_counts, feature_set.toarray()), axis=1)

    l_model = LogisticRegression(max_iter=150, solver='liblinear', random_state=0)
    l_model.fit(feature_set, labels, sample_weight=weights)

    tree_model = DecisionTreeClassifier(random_state=0, max_depth=5)
    tree_model.fit(new_feature_set, labels, sample_weight=weights)

  return l_model, tree_model, predict, feature_set, tfidf_vectorizer


In [None]:
# Load the custom stop-word list (one word per line); this replaces the NLTK
# list assigned to `stop_words` earlier. The `with` block closes the file.
with open('stop_words.txt') as file_stop:
  stop_words = [line.replace('\n', '') for line in file_stop]

In [None]:
import os
from sklearn.tree import export_graphviz

# Make sure the output folders exist before anything is written into them.
os.makedirs('text/linear_model', exist_ok=True)
os.makedirs('text/tree_model', exist_ok=True)

# Read all test texts up front so the file handle is closed before the long loop.
with open('text_data_test.txt') as f:
  test_reviews = f.readlines()

for count, x in enumerate(test_reviews):
  # Blank lines carry no text to explain; `count` stays the line index so the
  # output file names still map back to lines of the input file.
  if not x.strip():
    continue
  print(count)
  prep_text = x
  l_model, tree_model, predict, feature_set, tfidf_vectorizer = ALIME_main_and_tree(prep_text, 150, model)

  # Both surrogates are None when every neighbour received the same label.
  if l_model is not None and tree_model is not None:
    # Newer scikit-learn exposes get_feature_names_out; older releases only
    # provide get_feature_names.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
      feature_names = list(tfidf_vectorizer.get_feature_names_out())
    else:
      feature_names = list(tfidf_vectorizer.get_feature_names())

    coefs_row = l_model.coef_.reshape(-1)
    # Kept under its own name so the dataset frame `df` is not overwritten.
    coef_df = pd.DataFrame({'coefs': coefs_row,
                            'feature_name': feature_names}).sort_values(by='coefs')
    # Five most negative and five most positive coefficients.
    select_df = pd.concat([coef_df.head(5), coef_df.tail(5)])

    print(coefs_row.shape)

    fig = plt.figure(figsize=(10, 10))
    plt.barh(select_df.feature_name, select_df.coefs)
    fig.savefig('text/linear_model/{}.png'.format(count))
    # Release the figure; otherwise one open figure accumulates per text.
    plt.close(fig)

    # The tree was trained on [positive count, negative count, TF-IDF...], so
    # the names must be listed in that same order.
    tree_features = ['positive_word_count', 'negative_word_count'] + feature_names
    dot_data = export_graphviz(tree_model, out_file=None,
                               feature_names=tree_features,
                               class_names=class_names,
                               filled=True, impurity=False)
    graph = graphviz.Source(dot_data, format="png")
    graph.render('text/tree_model/{}'.format(count))

In [None]:
# Archive the relative `text/` output folder written by the explanation loop
# (the absolute path `/text` points at the filesystem root instead).
!zip -r text_test.zip text

In [None]:
print(tfidf_vectorizer.get_feature_names())

In [None]:
l_model.coef_