# Tutorial 2: Edit Distance, Sentiment Analysis
First, we need to import required modules:

In [66]:
import pandas as pd
import nltk
from tqdm import tqdm

## Load IMDB Dataset
Next, we load our dataset like in the first tutorial:


In [67]:
import tensorflow_datasets as tfds
imbd_dataset = tfds.load('imdb_reviews')

Get the train subset texts:

In [68]:
imdb_texts = [ a["text"].numpy().decode("utf-8") for a in imbd_dataset["train"] ]

In [69]:
imdb_labels = [ a["label"].numpy() for a in imbd_dataset["train"] ]

In [70]:
df_imdb = pd.DataFrame({"texts": imdb_texts, "labels": imdb_labels})

In [71]:
df_imdb

Unnamed: 0,texts,labels
0,This was an absolutely terrible movie. Don't b...,0
1,"I have been known to fall asleep during films,...",0
2,Mann photographs the Alberta Rocky Mountains i...,0
3,This is the kind of film for a snowy Sunday af...,1
4,"As others have mentioned, all the women that g...",1
...,...,...
24995,"I have a severe problem with this show, severa...",0
24996,"The year is 1964. Ernesto ""Che"" Guevara, havin...",1
24997,Okay. So I just got back. Before I start my re...,0
24998,When I saw this trailer on TV I was surprised....,0


### Tokenization
We reuse the tokenization techniques from the last tutorial:

In [72]:
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download("words")


# for texts with emojis etc.:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
def tweet_tokenize(text):
  """Split ``text`` into a list of tokens using the shared ``tknzr`` instance."""
  tokens = tknzr.tokenize(text)
  return tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [73]:
tweet_tokenize("Ein coole Vorlesung :-)!")

['Ein', 'coole', 'Vorlesung', ':-)', '!']

In [74]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'[A-Za-z0-9`´\'\\*]+')

In [75]:
# Tokenize every review text with four different tokenizers.
# NLTK sent_tokenize: one list of sentences per review.
imdb_tokens_sent_tok = [sent_tokenize(text) for text in imdb_texts]
# NLTK word_tokenize: one list of word/punctuation tokens per review.
imdb_tokens_word_tok = [word_tokenize(text) for text in imdb_texts]
# TweetTokenizer wrapper (tweet_tokenize).
imdb_tweet_tokens = [tweet_tokenize(text) for text in imdb_texts]
# RegexpTokenizer (tokenizer): keeps only runs of the characters allowed by its pattern.
imdb_tokens = [tokenizer.tokenize(text) for text in imdb_texts]
# Store each tokenization as its own column of df_imdb.
df_imdb["tokenized_sent-tokenizer"] = imdb_tokens_sent_tok
df_imdb["tokenized_word-tokenizer"] = imdb_tokens_word_tok
df_imdb["tokenized_tweet"] = imdb_tweet_tokens
df_imdb["tokenized"] = imdb_tokens

In [76]:
df_imdb

Unnamed: 0,texts,labels,tokenized_sent-tokenizer,tokenized_word-tokenizer,tokenized_tweet,tokenized
0,This was an absolutely terrible movie. Don't b...,0,"[This was an absolutely terrible movie., Don't...","[This, was, an, absolutely, terrible, movie, ....","[This, was, an, absolutely, terrible, movie, ....","[This, was, an, absolutely, terrible, movie, D..."
1,"I have been known to fall asleep during films,...",0,[I have been known to fall asleep during films...,"[I, have, been, known, to, fall, asleep, durin...","[I, have, been, known, to, fall, asleep, durin...","[I, have, been, known, to, fall, asleep, durin..."
2,Mann photographs the Alberta Rocky Mountains i...,0,[Mann photographs the Alberta Rocky Mountains ...,"[Mann, photographs, the, Alberta, Rocky, Mount...","[Mann, photographs, the, Alberta, Rocky, Mount...","[Mann, photographs, the, Alberta, Rocky, Mount..."
3,This is the kind of film for a snowy Sunday af...,1,[This is the kind of film for a snowy Sunday a...,"[This, is, the, kind, of, film, for, a, snowy,...","[This, is, the, kind, of, film, for, a, snowy,...","[This, is, the, kind, of, film, for, a, snowy,..."
4,"As others have mentioned, all the women that g...",1,"[As others have mentioned, all the women that ...","[As, others, have, mentioned, ,, all, the, wom...","[As, others, have, mentioned, ,, all, the, wom...","[As, others, have, mentioned, all, the, women,..."
...,...,...,...,...,...,...
24995,"I have a severe problem with this show, severa...",0,"[I have a severe problem with this show, sever...","[I, have, a, severe, problem, with, this, show...","[I, have, a, severe, problem, with, this, show...","[I, have, a, severe, problem, with, this, show..."
24996,"The year is 1964. Ernesto ""Che"" Guevara, havin...",1,"[The year is 1964., Ernesto ""Che"" Guevara, hav...","[The, year, is, 1964, ., Ernesto, ``, Che, '',...","[The, year, is, 1964, ., Ernesto, "", Che, "", G...","[The, year, is, 1964, Ernesto, Che, Guevara, h..."
24997,Okay. So I just got back. Before I start my re...,0,"[Okay., So I just got back., Before I start my...","[Okay, ., So, I, just, got, back, ., Before, I...","[Okay, ., So, I, just, got, back, ., Before, I...","[Okay, So, I, just, got, back, Before, I, star..."
24998,When I saw this trailer on TV I was surprised....,0,[When I saw this trailer on TV I was surprised...,"[When, I, saw, this, trailer, on, TV, I, was, ...","[When, I, saw, this, trailer, on, TV, I, was, ...","[When, I, saw, this, trailer, on, TV, I, was, ..."


## Edit Distance
Now we can look at the edit distance, a measure for how similar two words/texts are.

In [77]:
# a list of english words, we first need to download it:
nltk.download("words")

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [78]:
from nltk.corpus import words

In [79]:
len(words.words())

236736

In [80]:
"netral" in words.words()

False

In [81]:
# let's import some distance metrics; check the NLTK documentation for more information
from nltk.metrics import binary_distance, edit_distance

In [82]:
binary_distance("Hallo", "Hello")

1.0

In [83]:
binary_distance("Hallo", "Hola")

1.0

In [84]:
edit_distance("Hallo", "Hello")

1

In [85]:
edit_distance("Hallo", "Hola")

3

In [86]:
edit_distance("Hallo", "Halol", transpositions=True)

1

In [87]:
satz = "Hi mum, I am lerning computer scyence!"

In [88]:
tweet_tokenize(satz)

['Hi', 'mum', ',', 'I', 'am', 'lerning', 'computer', 'scyence', '!']

## TASK 1:
Baue einen Spellchecker: Die Methode nimmt einen String oder List of Tokens und


1.   schaut, welche Wörter "falsch" sind (nicht in words.words() )
2.   findet, welche Wörter aus words.words() die geringste Edit-Distanz zu den falschen Wörtern haben.

BONUS: Wortlänge berücksichtigen!



In [89]:
from tqdm import tqdm

In [90]:
# Fetch the dictionary once instead of calling words.words() for every token,
# and keep a set next to the list so the "is this word known?" test does not
# scan all entries.
word_list = words.words()
known_words = set(word_list)

for word in tweet_tokenize(satz):
  # Single-character tokens (punctuation, "I", ...) are not checked.
  if len(word) <= 1:
    continue

  # The same lowercased form is used for the lookup and for the distance, so a
  # capitalised typo is compared against the dictionary in the same case.
  token = word.lower()
  if token in known_words:
    continue

  print(word)

  # Edit distance from the unknown token to every dictionary word.
  w_dict = {}
  for w in tqdm(word_list):
    w_dict[w] = edit_distance(w, token)

  # Report every dictionary word that shares the smallest distance.
  minval = min(w_dict.values())
  res = [k for k, v in w_dict.items() if v == minval]
  print(res)



lerning


100%|██████████| 236736/236736 [00:19<00:00, 11962.91it/s]


['leaning', 'learning']
scyence


100%|██████████| 236736/236736 [00:21<00:00, 10810.57it/s]

['science']





In [91]:
# some words are not in words...
"processing" in words.words()

False

In [92]:
# a better spellchecker:
def spellchecker(text):
  """Print spelling suggestions for every unknown token in ``text``.

  The text is split with the module-level ``tokenizer``. For each token only
  dictionary words whose length differs from the token's length by at most one
  character are used as candidates, so far fewer edit distances are computed
  than when scanning all of ``words.words()``.

  A token counts as known when the token itself or its lowercased form is
  among the candidates. For unknown tokens the lowercased form is compared
  against the candidates.

  Args:
    text: The string to check.

  Returns:
    None. Each unknown token is printed, followed by the list of candidate
    words that share the smallest edit distance to it (an empty list if no
    candidate of a suitable length exists).
  """
  # Index the dictionary by word length (rebuilt on every call).
  words_by_len = {}
  for w in tqdm(words.words()):
    words_by_len.setdefault(len(w), []).append(w)

  for w in tokenizer.tokenize(text):
    # Build a fresh candidate list for this token. Extending the bucket stored
    # in words_by_len would grow the index itself and leak words of other
    # lengths into later lookups. Lengths without a bucket contribute nothing
    # instead of raising KeyError.
    checklist = []
    for length in (len(w), len(w) + 1, len(w) - 1):
      checklist.extend(words_by_len.get(length, []))

    token = w.lower()
    known = set(checklist)
    if token in known or w in known:
      continue

    print(w)

    # min() on an empty sequence raises, so report "no suggestion" explicitly.
    if not checklist:
      print([])
      continue

    w_dic = {}
    for word in tqdm(checklist):
      w_dic[word] = edit_distance(token, word)

    minval = min(w_dic.values())
    res = [k for k, v in w_dic.items() if v == minval]
    print(res)

# Sentiment Analysis
Next, we want to perform some sentiment analysis. We start with VADER, a lexicon-based approach.

In [93]:
!pip install vaderSentiment



In [94]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [95]:
# VADER returns three scores (positive, negative, and neutral) that roughly
# represent how much of the sentence has the corresponding attribute.
# It also returns a compound score that reflects the polarity of the text on a -1 to 1 scale.
analyzer.polarity_scores("this is amazing!")

{'neg': 0.0, 'neu': 0.328, 'pos': 0.672, 'compound': 0.6239}

In [96]:
analyzer.polarity_scores(df_imdb["texts"][11])

{'neg': 0.041, 'neu': 0.738, 'pos': 0.222, 'compound': 0.9957}

In [97]:
df_imdb["texts"][11]

'Cute film about three lively sisters from Switzerland (often seen running about in matching outfits) who want to get their parents back together (seems mom is still carrying the torch for dad) - so they sail off to New York to stop the dad from marrying a blonde gold-digger he calls "Precious". Dad hasn\'t seen his daughters in ten years, they (oddly enough) don\'t seem to mind and think he\'s wonderful, and meanwhile Precious seems to lead a life mainly run by her overbearing mother (Alice Brady), a woman who just wants to see to it her daughter marries a rich man. The sisters get the idea of pushing Precious into the path of a drunken Hungarian count, tricking the two gold-digging women into thinking he is one of the richest men in Europe. But a case of mistaken identity makes the girls think the count is good-looking Ray Milland, who goes along with the scheme \'cause he has a crush on sister Kay.<br /><br />This film is enjoyable, light fare. Barbara Read as Kay comes across as sw

In [98]:
df_imdb["labels"][11]

1

## Naive Bayes
Next, we implement Naive Bayes (NB) classification for sentiment classification. Note that NB is NOT a good approach for sentiment analysis, because it disregards context, syntax, and semantics and is based on single words rather than n-grams.

In [99]:
import random

In [100]:
# Review texts split by label: 1 -> poss, 0 -> negs.
# Row mask and column are selected in a single .loc call.
poss = df_imdb.loc[df_imdb["labels"] == 1, "texts"]
negs = df_imdb.loc[df_imdb["labels"] == 0, "texts"]

In [101]:
type(poss)

pandas.core.series.Series

In [102]:
data_train = ([(pos, "positive") for pos in poss.to_list()[:10]]+
              [(neg, "negative") for neg in negs.to_list()[:10]])

In [103]:
type(data_train[2])

tuple

In [104]:
random.shuffle(data_train)

In [105]:
from nltk import NaiveBayesClassifier as nbc
from itertools import chain

In [106]:
# first, we build our vocabulary
vocabulary = set( chain(*[tweet_tokenize(i[0].lower()) for i in data_train ] ) )

In [107]:
# Distinct lowercased tokens over all training reviews, as a list.
vocabulary = list(set(chain.from_iterable(
    tweet_tokenize(text.lower()) for text, _ in data_train
)))

In [108]:
len(vocabulary)

1567

In [109]:
# we now build the feature set: for every review a dict that maps each
# vocabulary word to True/False (does the word occur in the review?), paired
# with the review's tag.
# Each review is tokenized exactly once into a set, so the membership test per
# vocabulary word does not re-run the tokenizer.
feature_set = [
    ({i: (i in tokens) for i in vocabulary}, tag)
    for tokens, tag in (
        (set(tweet_tokenize(sentence.lower())), tag) for sentence, tag in data_train
    )
]

In [110]:
classifier = nbc.train(feature_set)

In [111]:
def featurize_sentence(sentence):
  """Turn ``sentence`` into a feature dict over the module-level ``vocabulary``.

  Args:
    sentence: The text to featurize; it is lowercased before tokenization.

  Returns:
    A dict mapping every entry of ``vocabulary`` to True if it occurs among
    the sentence's tokens, otherwise False.
  """
  # Tokenize once and keep the tokens in a set; the tokenizer is not re-run
  # for every vocabulary entry.
  tokens = set(tweet_tokenize(sentence.lower()))
  return {i: (i in tokens) for i in vocabulary}

classifier.classify(featurize_sentence("the movie is not bad!"))

'negative'

In [112]:
neg_vocab = list(chain(*[tweet_tokenize(neg) for neg in negs.to_list()[:10]]))

In [113]:
nltk.FreqDist(neg_vocab)

FreqDist({'.': 107, 'the': 77, ',': 72, '<': 66, 'br': 66, '/': 66, '>': 66, 'of': 48, '"': 48, 'and': 42, ...})

We can improve that a little bit by removing all of these characters and stopwords.

In [114]:
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [115]:
stopwords.words("english")[:20]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [116]:
def tokenize(sentence):
  """Tokenize ``sentence`` and drop English stopwords and some punctuation.

  Args:
    sentence: The text to tokenize.

  Returns:
    The tokens produced by ``tweet_tokenize`` in their original case, without
    those whose lowercased form is an English stopword or one of the
    punctuation marks . , " ; :
  """
  # Build the exclusion set once per call instead of once per token.
  excluded = set(stopwords.words("english")) | {".", ",", "\"", ";", ":"}
  return [w for w in tweet_tokenize(sentence) if w.lower() not in excluded]

In [117]:
tokenize("I am learning NLP.")

['learning', 'NLP']

In [118]:
vocabulary2 = set( chain(*[tokenize(i[0].lower()) for i in data_train ] ) )

In [119]:
# Feature set over vocabulary2: for every review a dict that maps each
# vocabulary word to True/False, paired with the review's tag.
# Each review is tokenized exactly once into a set, so the membership test per
# vocabulary word does not re-run the tokenizer.
feature_set2 = [
    ({i: (i in tokens) for i in vocabulary2}, tag)
    for tokens, tag in (
        (set(tokenize(sentence.lower())), tag) for sentence, tag in data_train
    )
]

In [120]:
neg_vocab = list(chain(*[tokenize(neg) for neg in negs.to_list()[:10]]))
nltk.FreqDist(neg_vocab)

FreqDist({'<': 66, 'br': 66, '/': 66, '>': 66, '(': 22, '!': 22, ')': 22, 'film': 11, '-': 11, 'movie': 8, ...})

## Bonus: A good Sentiment analysis solution: Transformer based Language Models

In [121]:
!pip install -q transformers

In [122]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [123]:
# Print the dataset label followed by the pipeline's prediction for reviews 1-19.
# NOTE(review): range(1, 20) skips the review at index 0 - confirm this is intended.
for i in range(1,20):
  print(df_imdb["labels"][i])
  # Only the first 500 characters of each review are passed to the pipeline
  # (presumably to stay below the model's maximum input length - TODO confirm).
  print(sentiment_pipeline(df_imdb["texts"][i][:500]))

0
[{'label': 'NEGATIVE', 'score': 0.9995928406715393}]
0
[{'label': 'POSITIVE', 'score': 0.9809615015983582}]
1
[{'label': 'POSITIVE', 'score': 0.9997604489326477}]
1
[{'label': 'POSITIVE', 'score': 0.999556839466095}]
1
[{'label': 'POSITIVE', 'score': 0.9997072815895081}]
0
[{'label': 'NEGATIVE', 'score': 0.9919043183326721}]
0
[{'label': 'POSITIVE', 'score': 0.9856621623039246}]
0
[{'label': 'NEGATIVE', 'score': 0.9877058863639832}]
0
[{'label': 'NEGATIVE', 'score': 0.9952402114868164}]
0
[{'label': 'NEGATIVE', 'score': 0.998923122882843}]
1
[{'label': 'POSITIVE', 'score': 0.9072058796882629}]
1
[{'label': 'POSITIVE', 'score': 0.9997244477272034}]
0
[{'label': 'NEGATIVE', 'score': 0.997840404510498}]
1
[{'label': 'POSITIVE', 'score': 0.9996960163116455}]
0
[{'label': 'NEGATIVE', 'score': 0.9644811153411865}]
1
[{'label': 'POSITIVE', 'score': 0.9983195662498474}]
1
[{'label': 'NEGATIVE', 'score': 0.9901657700538635}]
1
[{'label': 'NEGATIVE', 'score': 0.9934014678001404}]
0
[{'label': 