# Demonstrating Hugging Face Modin Interoperability
## All the examples in this section are taken or adapted from https://www.kaggle.com/code/satyampd/imdb-sentiment-analysis-using-bert-w-huggingface/notebook

In [1]:
import modin.pandas as pd
import numpy as np # linear algebra

In [2]:
import tensorflow as tf
import sklearn
from tqdm import tqdm

In [3]:
import urllib.request
# CSV copy of the IMDB review dataset, served from the modin-datasets testing path.
url_path = "https://modin-datasets.intel.com/testing/IMDB_Dataset.csv"
# Save it as "imdb.csv" in the working directory. There is no existence check,
# so the file is fetched again on every run. The (filename, headers) tuple
# returned by urlretrieve is what this cell displays as its output.
urllib.request.urlretrieve(url_path, "imdb.csv")

('imdb.csv', <http.client.HTTPMessage at 0x7fb765106610>)

In [4]:
%%time
# `pd` is modin.pandas in this notebook, so the result is a Modin DataFrame.
# The recorded output shows a local Ray instance being started during this
# call, which presumably contributes to the wall time reported by %%time.
modin_df = pd.read_csv("imdb.csv")


    import ray
    ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}})

2023-04-11 10:27:18,363	INFO worker.py:1553 -- Started a local Ray instance.


CPU times: user 575 ms, sys: 261 ms, total: 836 ms
Wall time: 8.58 s


In [5]:
modin_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
type(modin_df)

modin.pandas.dataframe.DataFrame

In [7]:
modin_df.sample()

Unnamed: 0,review,sentiment
30204,Jack Lemmon was one of our great actors. His p...,negative


In [8]:
from transformers import BertTokenizer, TFBertForSequenceClassification

In [9]:
# Loading the BERT Classifier and Tokenizer along with Input module
from transformers import InputExample, InputFeatures

# The model and its tokenizer are loaded from the same "bert-base-uncased"
# checkpoint so that the two stay consistent with each other.
# The load message below reports that the 'classifier' layer is not part of the
# checkpoint and is newly initialized, so the model has to be fine-tuned before
# its predictions are meaningful.
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

2023-04-11 10:27:24.824712: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [11]:
# changing positive and negative into numeric values

def cat2num(value):
    """Map a sentiment label to its numeric class.

    Returns 1 when ``value`` equals the string ``'positive'`` and 0 for
    every other input.
    """
    return 1 if value == 'positive' else 0
    
# Number of leading rows used for training; the remaining rows form the test split.
TRAIN_ROWS = 45000

# Encode the labels only while they are still text. Without this guard,
# re-running the cell would pass the already-encoded 0/1 values through
# cat2num again, and because neither of them equals 'positive' every label
# would silently become 0.
if not modin_df['sentiment'].isin([0, 1]).all():
    modin_df['sentiment'] = modin_df['sentiment'].apply(cat2num)

# Positional split in file order (no shuffling before the split).
train = modin_df.iloc[:TRAIN_ROWS]
test = modin_df.iloc[TRAIN_ROWS:]

In [12]:
# But first, see some BERT tokenizer examples and other required pieces!

example='In this Kaggle notebook, I will do sentiment analysis using BERT with Huggingface'
# Split the sentence into the tokenizer's sub-word tokens; in the printed output
# continuation pieces carry a '##' prefix (e.g. 'ka', '##ggle').
tokens=tokenizer.tokenize(example)
# Look up the integer vocabulary id of every token.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(token_ids)

['in', 'this', 'ka', '##ggle', 'notebook', ',', 'i', 'will', 'do', 'sentiment', 'analysis', 'using', 'bert', 'with', 'hugging', '##face']
[1999, 2023, 10556, 24679, 14960, 1010, 1045, 2097, 2079, 15792, 4106, 2478, 14324, 2007, 17662, 12172]


In [13]:
type(train)

modin.pandas.dataframe.DataFrame

In [14]:
def convert_data_to_examples(train, test, review, sentiment):
    """Wrap every row of the train and test frames in an ``InputExample``.

    Args:
        train: frame holding the training rows.
        test: frame holding the validation rows.
        review: name of the column containing the review text.
        sentiment: name of the column containing the label.

    Returns:
        A ``(train_InputExamples, validation_InputExamples)`` pair, each the
        result of a row-wise ``apply`` over the corresponding frame.
    """
    def _row_to_example(row):
        # guid is a globally unique ID for bookkeeping; it is unused in this case.
        return InputExample(guid=None, text_a=row[review], label=row[sentiment])

    def _frame_to_examples(frame):
        # axis=1 hands each row (not each column) to the converter.
        return frame.apply(_row_to_example, axis=1)

    return _frame_to_examples(train), _frame_to_examples(test)

# The column names are passed as string literals here; the DATA_COLUMN /
# LABEL_COLUMN constants are only defined in a later cell.
train_InputExamples, validation_InputExamples = convert_data_to_examples(train,  test, 'review',  'sentiment')

In [15]:
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as a ``tf.data.Dataset``.

    Args:
        examples: iterable of objects exposing ``text_a`` (the text to encode)
            and ``label`` attributes, e.g. ``InputExample`` instances.
        tokenizer: tokenizer providing ``encode_plus``.
        max_length: every sequence is truncated or padded to exactly this
            many tokens.

    Returns:
        An unbatched dataset yielding ``(inputs, label)`` pairs, where
        ``inputs`` is a dict with int32 vectors under the keys
        ``"input_ids"``, ``"attention_mask"`` and ``"token_type_ids"``, and
        ``label`` is an int64 scalar.
    """
    features = []  # -> will hold InputFeatures to be converted later

    for e in tqdm(examples):
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,     # Add 'CLS' and 'SEP'
            max_length=max_length,       # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            # `pad_to_max_length=True` is deprecated; padding="max_length" is
            # the supported spelling of the same behaviour.
            padding="max_length",
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Replays the pre-tokenized features in their original order.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    # `output_signature` replaces the deprecated output_types/output_shapes
    # arguments; the dtypes and shapes declared here are the same as before.
    token_spec = tf.TensorSpec(shape=(None,), dtype=tf.int32)
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                "input_ids": token_spec,
                "attention_mask": token_spec,
                "token_type_ids": token_spec,
            },
            tf.TensorSpec(shape=(), dtype=tf.int64),
        ),
    )


# Names of the review-text column and the label column in the IMDB frame.
# NOTE(review): none of the cells shown in this notebook reference these
# constants; the call to convert_data_to_examples passes the literals 'review'
# and 'sentiment' directly. Consider using them there or removing them.
DATA_COLUMN = 'review'
LABEL_COLUMN = 'sentiment'

In [16]:
train_InputExamples

0        InputExample(guid=None, text_a="One of the oth...
1        InputExample(guid=None, text_a='A wonderful li...
2        InputExample(guid=None, text_a='I thought this...
3        InputExample(guid=None, text_a="Basically ther...
4        InputExample(guid=None, text_a='Petter Mattei\...
                               ...                        
44995    InputExample(guid=None, text_a="I watched this...
44996    InputExample(guid=None, text_a="I am a sucker ...
44997    InputExample(guid=None, text_a="I am a college...
44998    InputExample(guid=None, text_a="huge Ramones f...
44999    InputExample(guid=None, text_a='I rented this ...
Length: 45000, dtype: object

In [17]:
# Materialize the result of the Modin apply into a plain Python list before tokenizing.
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
# Shuffle through a 100-element buffer, group into batches of 32, then repeat the stream twice.
# NOTE(review): together with epochs=2 in model.fit this presumably results in
# four passes over the training data — confirm that this is intended.
train_data = train_data.shuffle(100).batch(32).repeat(2)

100%|██████████| 45000/45000 [03:38<00:00, 205.86it/s]


In [18]:
# The validation examples go through the same tokenization as the training examples.
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
# Only batched: unlike the training pipeline there is no shuffle or repeat.
validation_data = validation_data.batch(32)

100%|██████████| 5000/5000 [00:22<00:00, 221.29it/s]


In [19]:
# The labels are the integer classes 0/1 produced by cat2num, hence the sparse
# loss and metric. from_logits=True because the loss receives the model's raw
# outputs; the inference cell applies softmax to them explicitly.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

In [20]:
# NOTE: in the recorded run this call was interrupted during epoch 1 (see the
# KeyboardInterrupt in the output), so the cells that follow were executed
# with a partially trained model.
model.fit(train_data, epochs=2, validation_data=validation_data)

Epoch 1/2
    318/Unknown - 5811s 18s/step - loss: 0.3633 - accuracy: 0.8348

KeyboardInterrupt: 

In [21]:
# Two hand-written reviews, one negative and one positive, used to try out the model.
pred_sentences = ['worst movie of my life, will never watch movies from this series', 
                  'Wow, blew my mind, what a movie by Marvel, animation and story is amazing']

In [22]:
# Tokenize the sentences before sending them into our trained model.
tf_batch = tokenizer(
    pred_sentences,
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors='tf',
)
tf_outputs = model(tf_batch)
# Softmax over the last axis normalizes the first model output per sentence.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative','Positive']
# argmax along axis 1 picks, for every sentence, the index of the highest
# score; that index is then used to look up the name in `labels`.
label = tf.argmax(tf_predictions, axis=1).numpy()
for sentence, class_idx in zip(pred_sentences, label):
    print(sentence, ": ", labels[class_idx])

worst movie of my life, will never watch movies from this series :  Negative
Wow, blew my mind, what a movie by Marvel, animation and story is amazing :  Positive
