# Natural Language Processing

## Original Dataset

This notebook applies Natural Language Processing to a dataset of statements, each labelled with exactly one emotion: fear, anger or joy.

Check if the dataset is being read properly and if values match, after removing duplicate data:

In [None]:
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Emotions.csv', encoding='cp1252')
dataset.drop_duplicates(subset="Statement", keep='first', inplace=True)
# drop_duplicates keeps the original row labels, which leaves gaps in the
# index. Renumber the rows 0..n-1 so that a row's label equals its position
# in any list that is built by iterating over the dataset.
dataset.reset_index(drop=True, inplace=True)

# Printing the statements
for statement in dataset['Statement']:
    print(statement)

# Check if everything looks alright
print('\nDataset size:\n')
print(dataset['Statement'].size)

# For each emotion: print the share of rows that do (True) / do not (False)
# carry that label, and keep the matching rows in their own DataFrame.
print('\nFear count:\n')
is_fear = dataset['Emotion'] == 'fear'
print(is_fear.value_counts(normalize=True))
dataset_fear = dataset[is_fear]

print('\nAnger count:\n')
is_anger = dataset['Emotion'] == 'anger'
print(is_anger.value_counts(normalize=True))
dataset_anger = dataset[is_anger]

print('\nJoy count:\n')
is_joy = dataset['Emotion'] == 'joy'
print(is_joy.value_counts(normalize=True))
dataset_joy = dataset[is_joy]

## Clean Up and Normalization

Since only around 4% of the words are in upper case, we consider letter case not worth keeping for training the model:

In [None]:
import re

# Measure which percentage of the alphabetic words is written fully in
# upper case.
upper_counter = 0
full_counter = 0

for statement in dataset['Statement']:
    # replace asterisk for empty
    review = re.sub(r'\*', '', statement)
    # remove non alpha chars
    review = re.sub(r'[^a-zA-Z]', ' ', review)
    words = review.split()
    upper_counter += sum(1 for w in words if w.isupper())
    full_counter += len(words)

# Guard against a dataset that contains no words at all (division by zero).
if full_counter:
    print((upper_counter / full_counter) * 100)
else:
    print(0.0)

Removing non alpha chars from the statements, lowercasing, stopword removal and stemming:


In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Start from NLTK's English stop word list, de-duplicated.
stopwords = list(set(stopwords.words('english')))

# Add word
stopwords.append('amp')

# Remove negations
# Build a new list instead of calling remove() while looping over the same
# list: removing during iteration shifts the remaining items, so the element
# right after each removed one is skipped and some "...'t" contractions would
# stay in the list.
negation_suffix = "'t"
stopwords = [word for word in stopwords if negation_suffix not in word]
for negation in ('not', 'no'):
    stopwords.remove(negation)
# NOTE(review): if apostrophes are stripped from the text before this list is
# applied, contractions arrive as fragments such as "don" + "t" - confirm
# whether those fragments should be kept out of the stop word list as well.

print(stopwords)
# Build the cleaned corpus: one normalised string per dataset row, in row
# order.
corpus = []
ps = PorterStemmer()
# A set gives constant-time membership tests; the list would be scanned once
# per token.
stopword_set = set(stopwords)
for statement in dataset['Statement']:
    # replace asterisk for empty
    review = re.sub(r'\*', '', statement)
    # remove non alpha chars
    review = re.sub(r'[^a-zA-Z]', ' ', review)
    # to lower case
    review = review.lower()
    # split into tokens, apply stemming and remove stop words
    review = ' '.join(ps.stem(w) for w in review.split() if w not in stopword_set)
    corpus.append(review)

print(corpus)
print(len(corpus))

## Wordclouds

Generating the global wordcloud:

In [None]:
%pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Join every cleaned statement into one text and build a single word cloud.
all_text = " ".join(corpus)
wordcloud = WordCloud().generate(all_text)

# Render the cloud as an image, without axes.
plt.figure()
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

Generating wordcloud for fear:

In [None]:
# corpus is built by iterating over the dataset rows in order, so pair each
# cleaned text with its label by position. Indexing corpus with the
# DataFrame's row label is wrong whenever the labels have gaps (for example
# after duplicate rows were dropped).
corpus_fear = [
    text
    for text, emotion in zip(corpus, dataset['Emotion'])
    if emotion == 'fear'
]

wordcloud = WordCloud().generate(" ".join(corpus_fear))

plt.figure()
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

Generating wordcloud for anger:

In [None]:
# corpus is built by iterating over the dataset rows in order, so pair each
# cleaned text with its label by position. Indexing corpus with the
# DataFrame's row label is wrong whenever the labels have gaps (for example
# after duplicate rows were dropped).
corpus_anger = [
    text
    for text, emotion in zip(corpus, dataset['Emotion'])
    if emotion == 'anger'
]

wordcloud = WordCloud().generate(" ".join(corpus_anger))

plt.figure()
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

Generating wordcloud for joy:

In [None]:
# corpus is built by iterating over the dataset rows in order, so pair each
# cleaned text with its label by position. This replaces the label-based
# corpus[index] lookup, whose "index < size" guard silently dropped every
# row whose label was beyond the number of rows.
corpus_joy = [
    text
    for text, emotion in zip(corpus, dataset['Emotion'])
    if emotion == 'joy'
]

wordcloud = WordCloud().generate(" ".join(corpus_joy))

plt.figure()
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

## Generating a Dataset 

We need to transform the data into a dataset that can be used by machine learning models.

We can choose scikit-learn's Bag of Words:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words: learn the vocabulary from the corpus and count, per
# statement, how often each term occurs.
vectorizer = CountVectorizer()
term_counts = vectorizer.fit_transform(corpus)
# Expand the result into a dense array.
X = term_counts.toarray()

Or scikit-learn's TF-IDF:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for every statement of the corpus.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=corpus)

We can also use N-grams, which is useful for negations:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words over single words and pairs of consecutive words.
ngram_bounds = (1, 2)
vectorizer = CountVectorizer(ngram_range=ngram_bounds)
ngram_counts = vectorizer.fit_transform(corpus)
# Expand the result into a dense array.
X = ngram_counts.toarray()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over single words and pairs of consecutive words.
ngram_bounds = (1, 2)
vectorizer = TfidfVectorizer(ngram_range=ngram_bounds)
X = vectorizer.fit_transform(raw_documents=corpus)

Look at shape and features we've got:

In [None]:
print(X.shape)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back to the old method only on
# releases that do not have it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    # tolist() turns the NumPy array into a plain list of Python strings.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

Compare the contents of one review with its representation vector following the bag-of-words model:

In [None]:
# Position of the example to inspect; clamped to the last row so the cell
# also works when the dataset has fewer than 1000 statements.
sample_position = min(999, len(corpus) - 1)
# corpus and X are indexed by position, so read the statement by position
# too (.iloc) instead of by row label, which may not exist or may point to a
# different row when the DataFrame index has gaps.
print(dataset['Statement'].iloc[sample_position])
print(corpus[sample_position])
print(X[sample_position])

Obtaining the classes:

In [None]:
# Target vector: the emotion label of every statement.
label_column = 'Emotion'
y = dataset[label_column]
# Show both shapes side by side to compare the number of rows.
print(X.shape, y.shape)

## Training Classifiers

In [None]:
# Number of statements per emotion label.
print("\nLabel distribution in the training set:")
label_counts = y.value_counts()
print(label_counts)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes trained on the full feature matrix and labels.
clf = MultinomialNB()
clf.fit(X=X, y=y)

### Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed, trained on the full feature matrix and
# labels.
tree_seed = 0
clf = DecisionTreeClassifier(random_state=tree_seed)
clf.fit(X=X, y=y)

### Support Vector Classification

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings.
# can add probability=True but it will take longer
clf = SVC()
clf.fit(X=X, y=y)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed, trained on the full feature matrix and
# labels.
forest_seed = 0
clf = RandomForestClassifier(random_state=forest_seed)
clf.fit(X=X, y=y)

## Testing

### Cross Validation

In [None]:
from sklearn.model_selection import cross_validate

# Metrics computed for every fold (macro = unweighted mean over the classes).
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

# can increase the cv parameter but it will take longer
scores = cross_validate(
    clf,
    X,
    y,
    scoring=metrics,
    cv=3,
    return_train_score=True,
)

print(scores)
# Average accuracy over the held-out folds.
mean_test_accuracy = scores['test_accuracy'].mean()
print(mean_test_accuracy)

## Test Set

Building the test set:

In [None]:
test_set = pd.read_csv('Testingdata.csv', encoding='cp1252')

# Clean the test statements with the same steps used for the training corpus.
# A set gives constant-time membership tests; the list would be scanned once
# per token.
stopword_lookup = set(stopwords)
test_corpus = []
for statement in test_set['Statement']:
    # replace asterisk for empty
    review = re.sub(r'\*', '', statement)
    # remove non alpha chars
    review = re.sub(r'[^a-zA-Z]', ' ', review)
    # to lower case
    review = review.lower()
    # split into tokens, apply stemming and remove stop words
    review = ' '.join(ps.stem(w) for w in review.split() if w not in stopword_lookup)
    test_corpus.append(review)

# transform (not fit_transform): reuse the vocabulary learned on the training
# corpus so the test features line up with the training features.
X_test = vectorizer.transform(test_corpus).toarray()
y_test = test_set['Emotion']
print(X_test.shape, y_test.shape)
print("\nLabel distribution in the testing set:")
print(y_test.value_counts())

Let's see the model's output on the test set:

In [None]:
%pip install colorama
from colorama import Fore

from colorama import Style

y_pred = clf.predict(X_test)

# Print every prediction: green when it matches the true label, red otherwise.
# Pairing by position (zip) does not depend on the row labels of y_test.
for predicted, actual in zip(y_pred, y_test):
    colour = Fore.GREEN if predicted == actual else Fore.RED
    # Reset after each label so the colour does not leak into later output.
    print(colour + predicted + Style.RESET_ALL)

Assess the performance of our model by looking at different metrics:

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# confusion matrix
labels = ["anger", "fear", "joy"]
# Pass labels explicitly so the row/column order of the matrix is guaranteed
# to match display_labels, even when a class is missing from y_test or y_pred.
cm = confusion_matrix(y_test, y_pred, labels=labels)
cmd_obj = ConfusionMatrixDisplay(cm, display_labels=labels)
cmd_obj.plot()
cmd_obj.ax_.set(
                title='Confusion Matrix', 
                xlabel='Predicted Emotion', 
                ylabel='Actual Emotion')
print("Confusion matrix:")
plt.show()

# accuracy
print("\nAccuracy:") 
print(accuracy_score(y_test, y_pred))

# The per-class scores below are reported in the order given by labels.

# precision
print("\nPrecision:")
print(precision_score(y_test, y_pred, labels=labels, average=None)) 

# recall
print("\nRecall:")
print(recall_score(y_test, y_pred, labels=labels, average=None)) 

# f1
print("\nf1:")
print(f1_score(y_test, y_pred, labels=labels, average=None)) 

## Try it Yourself

In [None]:
your_statement = "I am happy" 

# Clean the statement with the same steps used for the training corpus.
# replace asterisk for empty
review = re.sub(r'\*', '', your_statement)
# remove non alpha chars
review = re.sub(r'[^a-zA-Z]', ' ', review)
# to lower case
review = review.lower()
# split into tokens, apply stemming and remove stop words
review = ' '.join(ps.stem(w) for w in review.split() if w not in stopwords)

# Encode the single statement with the vocabulary the vectorizer has learned.
V = vectorizer.transform([review]).toarray()

print(V.shape)
print(V)
print(clf.predict(V))

If the classifier supports it, you can also check the predicted probability of each class:

In [None]:
print(clf.classes_)
# Not every classifier exposes probability estimates (for example SVC
# created without probability=True), so check before calling.
if hasattr(clf, 'predict_proba'):
    print(clf.predict_proba(V))
else:
    print("This classifier does not provide probability estimates.")