In [1]:
import tensorflow as tf


In [2]:
!pip install transformers



In [3]:
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
import pandas as pd
import numpy as np

from scipy.stats import norm
from scipy import stats

import warnings
warnings.filterwarnings('ignore')

#for displaying 500 results in pandas dataframe
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


from sklearn.model_selection import train_test_split

import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
from collections import defaultdict,Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import string
nltk.download('stopwords')


stop=set(stopwords.words('english'))



[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)
     |████████████████████████████████| 242 kB 896 kB/s            
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.9


In [6]:
# Load training data
df = pd.read_excel('../input/coursera21k/trynow.xlsx')

aspect_mapping = {'P"': 'P' ,'N"':'N' , 'NEU"': 'NEU','P': 'P','NEU':'NEU','N':'N'}

df['Polarity'] = df['Polarity'].map(aspect_mapping)

In [7]:
df.head()

Unnamed: 0,"Comment,Aspect,Polarity;;;",Aspect,Polarity,True,"Comment,Aspect,Polarity",Comment
0,end of course project was challenging and fun....,C,P,True,end of course project was challenging and fun....,end of course project was challenging and fun....
1,teaches you how to use gdb and debug code effe...,C,NEU,True,teaches you how to use gdb and debug code effe...,teaches you how to use gdb and debug code effe...
2,wow! i took my first course that was heavy in ...,C,P,True,wow! i took my first course that was heavy in ...,wow! i took my first course that was heavy in ...
3,"difficult project but it was worth it,S,NEU;;;",S,NEU,True,"difficult project but it was worth it,S,NEU",difficult project but it was worth it
4,"""the course requires 100% perfection in order ...",S,N,True,these flaws could be overlooked if auditing th...,these flaws could be overlooked if auditing th...


In [8]:
aspect_mapping = {'P': 0 ,'N':1 , 'NEU': 2}

df['Polarity_label'] = df['Polarity'].map(aspect_mapping)

In [9]:
df = df[['Polarity_label','Comment']]

In [10]:
index = df.index
number_of_rows = len(index)
print(number_of_rows)

21940


In [11]:
df.tail()

Unnamed: 0,Polarity_label,Comment
21935,1,no instructor participates in discussion. no o...
21936,1,"bad useless garbage,this is the most bad onlin..."
21937,1,poor quality and even worse presentation by th...
21938,1,need more (understandable) subtitles. peer-rev...
21939,1,the instructor looks nerves when he talking.


In [12]:
df.head()

Unnamed: 0,Polarity_label,Comment
0,0,end of course project was challenging and fun....
1,2,teaches you how to use gdb and debug code effe...
2,0,wow! i took my first course that was heavy in ...
3,2,difficult project but it was worth it
4,1,these flaws could be overlooked if auditing th...


## Preprocessing

In [13]:
#Remove Urls and HTML links
def remove_urls(text):
    url_remove = re.compile(r'https?://\S+|www\.\S+')
    return url_remove.sub(r'', text)
df['comment_new']=df['Comment'].apply(lambda x:remove_urls(x))

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)
df['comment_new']=df['comment_new'].apply(lambda x:remove_html(x))

In [34]:
# Lower casing
def lower(text):
    low_text= text.lower()
    return low_text
df['comment_new']=df['comment_new'].apply(lambda x:lower(x))


# Number removal
def remove_num(text):
    remove= re.sub(r'\d+', '', text)
    return remove
df['comment_new']=df['comment_new'].apply(lambda x:remove_num(x))

In [35]:
#Remove stopwords & Punctuations
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))
STOPWORDS = set(stopwords.words('english'))

def punct_remove(text):
    punct = re.sub(r"[^\w\s\d]","", text)
    return punct
df['comment_new']=df['comment_new'].apply(lambda x:punct_remove(x))



def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
df['comment_new']=df['comment_new'].apply(lambda x:remove_stopwords(x))

In [36]:
#Remove extra white space left while removing stuff
def remove_space(text):
    space_remove = re.sub(r"\s+"," ",text).strip()
    return space_remove
df['comment_new']=df['comment_new'].apply(lambda x:remove_space(x))

In [37]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

def lemmatize_words(text):
    wnl = WordNetLemmatizer()
    lem = ' '.join([wnl.lemmatize(word) for word in text.split()])    
    return lem

df['comment_new'] = df['comment_new'].apply(lemmatize_words)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [38]:
reviews = df['comment_new'].values.tolist()
labels = df['Polarity_label'].tolist()

In [39]:
print(reviews[:2])
print(labels[:2])

['end course project challenging fun lot opportunity learn debug memory issue valgrind', 'teach use gdb debug code effectively challenging engaging homework']
[0, 2]


In [40]:
from sklearn.model_selection import train_test_split
training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(reviews, labels, test_size=.2,random_state = 23)

validation_sentences, test_sentences, validation_labels, test_labels = train_test_split(validation_sentences, validation_labels, test_size=.5,random_state = 23)

In [41]:
len(training_sentences)

17552

In [17]:
len(validation_sentences)

2194

In [18]:
len(test_sentences)

2194

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
  
tfidf = TfidfVectorizer(min_df = 2, max_df = 0.5, ngram_range = (1, 2))

#training_sentences_Tf = tfidf.fit_transform(training_sentences)

#test_sentences_Tf = tfidf.transform(test_sentences)

DISTILBERT 

In [19]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [20]:
tokenizer([training_sentences[0]], truncation=True,
                            padding=True, max_length=512)

{'input_ids': [[101, 2026, 2034, 3784, 2607, 1010, 2428, 5379, 2005, 2047, 2272, 2869, 1012, 20014, 28533, 2075, 1998, 12367, 8082, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [21]:
train_encodings = tokenizer(training_sentences,
                            truncation=True,
                            padding=True)
val_encodings = tokenizer(validation_sentences,
                            truncation=True,
                            padding=True)

In [22]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    training_labels
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    validation_labels
))

2022-02-04 08:01:14.938427: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-04 08:01:14.939517: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-04 08:01:14.940173: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-04 08:01:14.942254: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [23]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=3)

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

2022-02-04 08:02:01.588780: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint 

In [24]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
model.fit(train_dataset.shuffle(100).batch(16),
          epochs=5,
          batch_size=16,
          validation_data=val_dataset.shuffle(100).batch(16))

Epoch 1/5


2022-02-04 08:02:12.895658: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7effc88a0610>

In [25]:
model.save_pretrained("./sentiment")

In [26]:
loaded_model = TFDistilBertForSequenceClassification.from_pretrained("./sentiment")

Some layers from the model checkpoint at ./sentiment were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at ./sentiment and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
test_sentence = "I absolutely loved the lecturer, George Siedel. He presents in a way where I wanted to listen and I feel like I learned a lot from him just with the way that he spoke. I definitely think that this course is worth doing and I am really happy that did it. The fact that the course was online, didn't feel like it was a barrier to me at all. In fact, I preferred it because I was able to go through the work in my own pace and was thus able to complete the entire course in one week. I really enjoyed this course, and definitely think that the lecturer, George Siedel, played a big part in it. I haven't done many other online courses before, but I'm definitely motivated to try more of Coursera's courses due to the great experience that I had taking this course. Due to the introduction of the University of Michigan made, I am now also strongly considering to apply for the MBA course when I am able."


predict_input = tokenizer.encode(test_sentence,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")

tf_output = loaded_model.predict(predict_input)[0]


tf_prediction = tf.nn.softmax(tf_output, axis=1)
label = tf.argmax(tf_prediction, axis=1)
label = label.numpy()
print(label)

[0]


In [28]:
test_labels[0]

0

In [29]:
predict_input = []
y_hat = []
predictions = []

for x in test_sentences:
   predict_input.append(tokenizer.encode(x,truncation=True,padding=True,return_tensors="tf"))

for x in predict_input:
  y_hat.append(loaded_model.predict(x)[0])


for j in y_hat:
  tf_prediction = tf.nn.softmax(j, axis=1)
  label = tf.argmax(tf_prediction, axis=1)
  label = label.numpy()
  predictions.append(label)


In [30]:
tokenizer.save_pretrained("./sentiment")

('./sentiment/tokenizer_config.json',
 './sentiment/special_tokens_map.json',
 './sentiment/vocab.txt',
 './sentiment/added_tokens.json',
 './sentiment/tokenizer.json')

In [31]:
label

array([0])

In [32]:
from sklearn.metrics import classification_report

print('Classification Report')
print(classification_report(test_labels,predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1836
           1       0.79      0.86      0.82       238
           2       0.43      0.45      0.44       120

    accuracy                           0.92      2194
   macro avg       0.73      0.75      0.74      2194
weighted avg       0.92      0.92      0.92      2194



In [33]:
# demonstration of calculating metrics for a neural network model using sklearn
from sklearn.datasets import make_circles
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score

precision = precision_score(test_labels, predictions,average='weighted')
print('Precision: %f' % precision)
recall = recall_score(test_labels, predictions,average='weighted')
print('Recall: %f' % recall)
f1 = f1_score(test_labels, predictions,average='weighted')
print('F1 Score: %f' % f1)


Precision: 0.920300
Recall: 0.917046
F1 Score: 0.918481


## NAIVE BAYES

In [43]:
# Model 1 - default parameter 
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline

nb_classifier1 = MultinomialNB()

# define the stages of the pipeline
pipeline = Pipeline(steps= [('tfidf', tfidf),
                            ('model',nb_classifier1) ])

pipeline.fit(training_sentences,training_labels)


pred1 = pipeline.predict(test_sentences)

print(classification_report(test_labels,pred1, target_names = ['Positive','Negative','Neutral']))

              precision    recall  f1-score   support

    Positive       0.84      1.00      0.91      1836
    Negative       1.00      0.06      0.12       238
     Neutral       0.00      0.00      0.00       120

    accuracy                           0.84      2194
   macro avg       0.61      0.35      0.34      2194
weighted avg       0.81      0.84      0.78      2194



In [44]:
precision = precision_score(test_labels, pred1,average='weighted')
print('Precision: %f' % precision)
recall = recall_score(test_labels, pred1,average='weighted')
print('Recall: %f' % recall)
f1 = f1_score(test_labels, pred1,average='weighted')
print('F1 Score: %f' % f1)


Precision: 0.813579
Recall: 0.843665
F1 Score: 0.778201


In [45]:
# 1
from sklearn.svm import SVC

svc_model1 = SVC(C=1, kernel='linear', gamma= 1)

# define the stages of the pipeline
pipeline = Pipeline(steps= [('tfidf', tfidf),
                            ('model',svc_model1) ])

pipeline.fit(training_sentences,training_labels)

pred1 = pipeline.predict(test_sentences)

print(classification_report(test_labels, pred1, target_names = ['Positive','Negative','Neutral']))

              precision    recall  f1-score   support

    Positive       0.91      0.99      0.95      1836
    Negative       0.81      0.64      0.71       238
     Neutral       0.00      0.00      0.00       120

    accuracy                           0.90      2194
   macro avg       0.57      0.54      0.55      2194
weighted avg       0.85      0.90      0.87      2194



In [46]:
precision = precision_score(test_labels, pred1,average='weighted')
print('Precision: %f' % precision)
recall = recall_score(test_labels, pred1,average='weighted')
print('Recall: %f' % recall)
f1 = f1_score(test_labels, pred1,average='weighted')
print('F1 Score: %f' % f1)


Precision: 0.846407
Recall: 0.896992
F1 Score: 0.869117


In [47]:
# 1
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
# define the stages of the pipeline
pipeline = Pipeline(steps= [('tfidf', tfidf),
                            ('model',rf) ])

pipeline.fit(training_sentences,training_labels)

prediction1 = pipeline.predict(test_sentences)

print(classification_report(test_labels, prediction1, target_names = ['Positive','Negative','Neutral']))

              precision    recall  f1-score   support

    Positive       0.84      1.00      0.91      1836
    Negative       0.00      0.00      0.00       238
     Neutral       0.00      0.00      0.00       120

    accuracy                           0.84      2194
   macro avg       0.28      0.33      0.30      2194
weighted avg       0.70      0.84      0.76      2194



In [48]:
precision = precision_score(test_labels, prediction1,average='weighted')
print('Precision: %f' % precision)
recall = recall_score(test_labels, prediction1,average='weighted')
print('Recall: %f' % recall)
f1 = f1_score(test_labels, prediction1,average='weighted')
print('F1 Score: %f' % f1)


Precision: 0.700281
Recall: 0.836828
F1 Score: 0.762489


In [49]:
!pip install catboost
from catboost import CatBoostClassifier

clfs = CatBoostClassifier(
    iterations=5, 
    learning_rate=0.1, 
    #loss_function='CrossEntropy'
)

# define the stages of the pipeline
pipeline = Pipeline(steps= [('tfidf', tfidf),
                            ('model',clfs) ])

pipeline.fit(training_sentences,training_labels)

pred1 = pipeline.predict(test_sentences)

print(classification_report(test_labels, prediction1, target_names = ['Positive','Negative','Neutral']))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0:	learn: 0.9868453	total: 912ms	remaining: 3.65s
1:	learn: 0.9009445	total: 1.5s	remaining: 2.26s
2:	learn: 0.8326727	total: 2.1s	remaining: 1.4s
3:	learn: 0.7800074	total: 2.69s	remaining: 671ms
4:	learn: 0.7355812	total: 3.29s	remaining: 0us
              precision    recall  f1-score   support

    Positive       0.84      1.00      0.91      1836
    Negative       0.00      0.00      0.00       238
     Neutral       0.00      0.00      0.00       120

    accuracy                           0.84      2194
   macro avg       0.28      0.33      0.30      2194
weighted avg       0.70      0.84      0.76      2194



In [50]:
precision = precision_score(test_labels, prediction1,average='weighted')
print('Precision: %f' % precision)
recall = recall_score(test_labels, prediction1,average='weighted')
print('Recall: %f' % recall)
f1 = f1_score(test_labels, prediction1,average='weighted')
print('F1 Score: %f' % f1)


Precision: 0.700281
Recall: 0.836828
F1 Score: 0.762489


In [51]:
!pip install AdaBoost
from sklearn.ensemble import AdaBoostClassifier

abc = AdaBoostClassifier()

# define the stages of the pipeline
pipeline = Pipeline(steps= [('tfidf', tfidf),
                            ('model',abc) ])

pipeline.fit(training_sentences,training_labels)

pred1 = pipeline.predict(test_sentences)

print(classification_report(test_labels, prediction1, target_names = ['Positive','Negative','Neutral']))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[31mERROR: Could not find a version that satisfies the requirement AdaBoost (from versions: none)[0m
[31mERROR: No matching distribution found for AdaBoost[0m
              precision    recall  f1-score   support

    Positive       0.84      1.00      0.91      1836
    Negative       0.00      0.00      0.00       238
     Neutral       0.00      0.00      0.00       120

    accuracy                           0.84      2194
   macro avg       0.28      0.33      0.30      2194
weighted avg       0.70      0.84      0.76      2194



In [52]:
precision = precision_score(test_labels, prediction1,average='weighted')
print('Precision: %f' % precision)
recall = recall_score(test_labels, prediction1,average='weighted')
print('Recall: %f' % recall)
f1 = f1_score(test_labels, prediction1,average='weighted')
print('F1 Score: %f' % f1)


Precision: 0.700281
Recall: 0.836828
F1 Score: 0.762489
