In [1]:
import tensorflow as tf


In [2]:
num_gpus_available = len(tf.config.experimental.list_physical_devices('GPU'))
print("Num GPUs Available: ", num_gpus_available)
assert num_gpus_available > 0

Num GPUs Available:  1


In [3]:
!pip install transformers



In [4]:
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
import pandas as pd
import numpy as np

In [5]:
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Load training data
df = pd.read_excel('/content/trynow.xlsx')

aspect_mapping = {'P"': 'P' ,'N"':'N' , 'NEU"': 'NEU','P': 'P','NEU':'NEU','N':'N'}

df['Polarity'] = df['Polarity'].map(aspect_mapping)

In [8]:
df.head()

Unnamed: 0,"Comment,Aspect,Polarity;;;",Aspect,Polarity,True,"Comment,Aspect,Polarity",Comment
0,end of course project was challenging and fun....,C,P,True,end of course project was challenging and fun....,end of course project was challenging and fun....
1,teaches you how to use gdb and debug code effe...,C,NEU,True,teaches you how to use gdb and debug code effe...,teaches you how to use gdb and debug code effe...
2,wow! i took my first course that was heavy in ...,C,P,True,wow! i took my first course that was heavy in ...,wow! i took my first course that was heavy in ...
3,"difficult project but it was worth it,S,NEU;;;",S,NEU,True,"difficult project but it was worth it,S,NEU",difficult project but it was worth it
4,"""the course requires 100% perfection in order ...",S,N,True,these flaws could be overlooked if auditing th...,these flaws could be overlooked if auditing th...


In [9]:
aspect_mapping = {'P': 0 ,'N':1 , 'NEU': 2}

df['Polarity_label'] = df['Polarity'].map(aspect_mapping)

In [11]:
df = df[['Polarity_label','Comment']]

In [10]:
index = df.index
number_of_rows = len(index)
print(number_of_rows)

21940


In [12]:
df.tail()

Unnamed: 0,Polarity_label,Comment
21935,1,no instructor participates in discussion. no o...
21936,1,"bad useless garbage,this is the most bad onlin..."
21937,1,poor quality and even worse presentation by th...
21938,1,need more (understandable) subtitles. peer-rev...
21939,1,the instructor looks nerves when he talking.


In [13]:
df.head()

Unnamed: 0,Polarity_label,Comment
0,0,end of course project was challenging and fun....
1,2,teaches you how to use gdb and debug code effe...
2,0,wow! i took my first course that was heavy in ...
3,2,difficult project but it was worth it
4,1,these flaws could be overlooked if auditing th...


In [14]:
reviews = df['Comment'].values.tolist()
labels = df['Polarity_label'].tolist()

In [15]:
print(reviews[:2])
print(labels[:2])

['end of course project was challenging and fun. lots of opportunity to learn how to debug memory issues with valgrind.', 'teaches you how to use gdb and debug code effectively. challenging and engaging homework.']
[0, 2]


In [16]:
from sklearn.model_selection import train_test_split
training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(reviews, labels, test_size=.2)

validation_sentences, test_sentences, validation_labels, test_labels = train_test_split(validation_sentences, validation_labels, test_size=.1)

In [17]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [18]:
tokenizer([training_sentences[0]], truncation=True,
                            padding=True, max_length=512)

{'input_ids': [[101, 1045, 7303, 2005, 1037, 25416, 8630, 1998, 1045, 2106, 2025, 4374, 2009, 1012, 1045, 3984, 1045, 2180, 1005, 1056, 2131, 2009, 2127, 2055, 1016, 3134, 1012, 1045, 2215, 1037, 25416, 8630, 2138, 2009, 1005, 1055, 2205, 3733, 1010, 2023, 2003, 4933, 1045, 4342, 1999, 10452, 7366, 1012, 2009, 1005, 1055, 3835, 2008, 2009, 4107, 4219, 1998, 6971, 2174, 1010, 1045, 2359, 2000, 2202, 1037, 2607, 1998, 2131, 1037, 8196, 1999, 2028, 2012, 2026, 2504, 2000, 6011, 2008, 1045, 2064, 4553, 3458, 1037, 2152, 2082, 7366, 2504, 1012, 1998, 1996, 2607, 2758, 2009, 1005, 1055, 2005, 19156, 2021, 2009, 2987, 1005, 1056, 2175, 2046, 2172, 6987, 2006, 2129, 2000, 2941, 6570, 1996, 2592, 1012, 1045, 3984, 2023, 2003, 2204, 2065, 2017, 2215, 2000, 2022, 1037, 7681, 2012, 1037, 2690, 2082, 2030, 2152, 2082, 1010, 2021, 2009, 1005, 1055, 2025, 2005, 2033, 1012, 1045, 2001, 2559, 2005, 2242, 2008, 2001, 2062, 2061, 2012, 1037, 2267, 2504, 1012, 2036, 2006, 1996, 2590, 2592, 1010, 2027, 406

In [20]:
train_encodings = tokenizer(training_sentences,
                            truncation=True,
                            padding=True)
val_encodings = tokenizer(validation_sentences,
                            truncation=True,
                            padding=True)

In [24]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    training_labels
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    validation_labels
))

In [25]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=3)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_transform', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [26]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
model.fit(train_dataset.shuffle(100).batch(16),
          epochs=2,
          batch_size=16,
          validation_data=val_dataset.shuffle(100).batch(16))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f57f5acba90>

In [27]:
model.save_pretrained("./sentiment")

In [28]:
loaded_model = TFDistilBertForSequenceClassification.from_pretrained("./sentiment")

Some layers from the model checkpoint at ./sentiment were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at ./sentiment and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
test_sentence = "I absolutely loved the lecturer, George Siedel. He presents in a way where I wanted to listen and I feel like I learned a lot from him just with the way that he spoke. I definitely think that this course is worth doing and I am really happy that did it. The fact that the course was online, didn't feel like it was a barrier to me at all. In fact, I preferred it because I was able to go through the work in my own pace and was thus able to complete the entire course in one week. I really enjoyed this course, and definitely think that the lecturer, George Siedel, played a big part in it. I haven't done many other online courses before, but I'm definitely motivated to try more of Coursera's courses due to the great experience that I had taking this course. Due to the introduction of the University of Michigan made, I am now also strongly considering to apply for the MBA course when I am able."


predict_input = tokenizer.encode(test_sentence,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")

tf_output = loaded_model.predict(predict_input)[0]


tf_prediction = tf.nn.softmax(tf_output, axis=1)
label = tf.argmax(tf_prediction, axis=1)
label = label.numpy()
print(label)

[0]


In [36]:
test_labels[0]

0

In [45]:
predict_input = []
y_hat = []
predictions = []

for x in test_sentences:
   predict_input.append(tokenizer.encode(x,truncation=True,padding=True,return_tensors="tf"))

for x in predict_input:
  y_hat.append(loaded_model.predict(x)[0])


for j in y_hat:
  tf_prediction = tf.nn.softmax(j, axis=1)
  label = tf.argmax(tf_prediction, axis=1)
  label = label.numpy()
  predictions.append(label)






In [43]:
label

array([0])

In [46]:
from sklearn.metrics import classification_report



print('Classification Report')
print(classification_report(test_labels,predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       375
           1       0.73      0.84      0.78        45
           2       0.47      0.37      0.41        19

    accuracy                           0.92       439
   macro avg       0.72      0.73      0.72       439
weighted avg       0.92      0.92      0.92       439

