# Sentiment Analysis Bag-of-Words

In [90]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## 1. Data Loading

In [91]:
import random
data = pd.read_csv ("train.tsv", sep = '\t')

# Teilsätze rausschmeißen
data = data.groupby('SentenceId').first().reset_index()
data.head()


# so Daten laden, wenn das test_set Sentiment Labels hätte:
#train_set = pd.read_csv ("train.tsv", sep = '\t')
#test_set= pd.read_csv ("test.tsv", sep = '\t')
#pd.set_option('display.max_colwidth', None)
#test_set.head(10)

Unnamed: 0,SentenceId,PhraseId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,64,"This quiet , introspective and entertaining in...",4
2,3,82,"Even fans of Ismail Merchant 's work , I suspe...",1
3,4,117,A positively thrilling combination of ethnogra...,3
4,5,157,Aggressive self-glorification and a manipulati...,1


In [92]:
data.Sentiment.value_counts()

# Daten ausbalancieren im Trainings-Set? 
# Jeder Sentiment Value sollte gleiche Anzahl an Samples haben 

3    2321
1    2200
2    1655
4    1281
0    1072
Name: Sentiment, dtype: int64

## 2. Data cleaning

In [93]:
import nltk
# nltk.download('stopwords')
# pip install nltk
import re


# Für Sentimentanalyse zählen nur Wörter  
def keep_only_letters(text):
    text=re.sub(r'[^a-zA-Z\s]','',text)
    return text
 
# Groß- und Kleinschreibung egal 
def convert_to_lowercase(text):
    return text.lower()
 
def clean_reviews(text):
    text = keep_only_letters(text)
    text = convert_to_lowercase(text)
    return text

data['Phrase'] = data['Phrase'].apply(clean_reviews)
# train_set['Phrase'] = train_set['Phrase'].apply(clean_reviews)
# test_set['Phrase'] = test_set['Phrase'].apply(clean_reviews)

# Stop Words definition
english_stop_words = nltk.corpus.stopwords.words('english')
print(len(english_stop_words))
print (english_stop_words[:20])

# Stop Words removal
def remove_stop_words(text):
    for stopword in english_stop_words:
        stopword = ' ' + stopword + ' '
        text = text.replace(stopword, ' ')
    return text
 

data['Phrase'] = data['Phrase'].apply(remove_stop_words) 
#train_set['Phrase'] = train_set['Phrase'].apply(remove_stop_words)
#test_set['Phrase'] = test_set['Phrase'].apply(remove_stop_words)


# Stemming
def text_stemming(text):
    stemmer = nltk.porter.PorterStemmer()
    stemmed = ' '.join([stemmer.stem(token) for token in text.split()])
    return stemmed

data['Phrase'] = data['Phrase'].apply(text_stemming) 
#train_set['Phrase'] = train_set['Phrase'].apply(text_stemming)
#test_set['Phrase'] = test_set['Phrase'].apply(text_stemming)

#train_set.head(10)
#train_set.shape
#test_set.shape

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [94]:
# Train Test Split
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data, test_size=0.2)
train_set.Sentiment.value_counts()

3    1839
1    1752
2    1314
4    1047
0     871
Name: Sentiment, dtype: int64

In [95]:
# Daten balancieren

Sentiment_groups = train_set.Sentiment.value_counts()
for i in range(5):
  while Sentiment_groups[i] < Sentiment_groups[3]:
    
    sent_i_rows = train_set.loc[train_set['Sentiment'] == i]
    adding_row = sent_i_rows.sample()
    train_set = train_set.append(adding_row, ignore_index=True)

    Sentiment_groups = train_set.Sentiment.value_counts()

train_set.Sentiment.value_counts()

  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_set = train_set.append(adding_row, ignore_index=True)
  train_

1    1839
0    1839
4    1839
2    1839
3    1839
Name: Sentiment, dtype: int64

## 3. Text Vectorization
CountVectorizer aus sklearn um bag-of-words Darstellung von unserem Trainings- und Testset zu erhalten
naive bag-of-words text vectorization

Nur Trainingsdatensatz zur Definition des Vokabulars heranziehen und 
das gleiche Vokabular zur Darstellung des Test-Datensatzes verwenden
-> Vektorizer an Trainingsdaten anpassen und zur Transformation der Testdaten verwenden

weighted version of BOW ausprobieren?

### N-Grams

Unigramme: Alle eindeutigen Wörter in einem Dokument

BiGramme: Alle Permutationen von zwei aufeinanderfolgenden Wörtern in einem Dokument

TriGrams: Alle Permutationen von drei aufeinanderfolgenden Wörtern in einem Dokument

In [96]:
import sklearn
# convert text data to numeric

# Argument binary=False: Vocabel-Vector mit term-frequency füllen
# binary = True: Vocabel-Vector mit Vorhandensein der Token füllen (1 vorhanden, 0 nicht vorhanden) 
# ngram _range = Unigram: (1,1); Bigram: (1,2); Trigram: (1,3)

vectorizer_uni = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,1))
vectorizer_bi = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,2))
vectorizer_tri = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,3))

In [97]:
uni_features_train = vectorizer_uni.fit_transform(train_set['Phrase'])
uni_features_test = vectorizer_uni.transform(test_set['Phrase'])
#print (uni_features_train.shape, uni_features_test.shape)

bi_features_train = vectorizer_bi.fit_transform(train_set['Phrase'])
bi_features_test = vectorizer_bi.transform(test_set['Phrase'])
#print (bi_features_train.shape, bi_features_test.shape)

tri_features_train = vectorizer_tri.fit_transform(train_set['Phrase'])
tri_features_test = vectorizer_tri.transform(test_set['Phrase'])
#print (tri_features_train.shape, tri_features_test.shape)

Unigram:
(156060, 10998) (66292, 10998): 10998 einzigartige englische token in unserem Vokabular (abgeleitet aus Trainingsdatensatz)
Jeder Token wird durch eine Spalte im Datensatz repräsentiert
Für jedes Review im Datensatz wird die Frequency der Token (term-frequency) durch Vokabel-Vector der Größe 10998 dargestellt.
= Daher haben wir 156060 solcher Vektoren in unserem Trainings-Datensatz und 66292 in unserem Test-Datensatz = Anzahl der Reviews 

In [98]:
train_labels = train_set['Sentiment']
test_labels = test_set['Sentiment']

## 4. Klassifizierungsmodelle trainieren
### 4.1.  Naive Bayes
#### 4.1.1. Unigram (Logistic Regression classifier on unigram features)

In [99]:
from sklearn.naive_bayes import MultinomialNB
 
vectorizer_uni = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,1))
uni_features_train = vectorizer_uni.fit_transform(train_set['Phrase'])
uni_features_test = vectorizer_uni.transform(test_set['Phrase'])

uni_nb = MultinomialNB()
uni_nb.fit(uni_features_train, train_labels)
 
predictions = uni_nb.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))


              precision    recall  f1-score   support

           0       0.25      0.31      0.28       201
           1       0.38      0.35      0.36       448
           2       0.30      0.24      0.27       341
           3       0.43      0.39      0.41       482
           4       0.35      0.50      0.41       234

    accuracy                           0.35      1706
   macro avg       0.34      0.36      0.35      1706
weighted avg       0.36      0.35      0.35      1706

[[ 63  83  26  20   9]
 [115 156  80  70  27]
 [ 38  97  83  80  43]
 [ 27  56  76 187 136]
 [ 11  18  15  74 116]]


#### 4.1.2 Naive Bayes: Unigram + Bigram

In [100]:
#vectorizer_bi = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,2))
#bi_features_train = vectorizer_bi.fit_transform(train_set['Phrase'])
#bi_features_test = vectorizer_bi.transform(test_set['Phrase'])

bi_nb = MultinomialNB()
bi_nb.fit(bi_features_train, train_labels)
 
predictions = bi_nb.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))


              precision    recall  f1-score   support

           0       0.23      0.28      0.25       201
           1       0.37      0.36      0.36       448
           2       0.31      0.23      0.26       341
           3       0.45      0.42      0.43       482
           4       0.34      0.47      0.40       234

    accuracy                           0.36      1706
   macro avg       0.34      0.35      0.34      1706
weighted avg       0.36      0.36      0.35      1706

[[ 57  89  23  20  12]
 [111 160  78  67  32]
 [ 42 101  78  79  41]
 [ 25  65  60 201 131]
 [ 15  16  11  81 111]]


#### 4.1.3 Naive Bayes: Unigram + Bigram + Trigram

In [101]:
tri_nb = MultinomialNB()
tri_nb.fit(tri_features_train, train_labels)
 
predictions = tri_nb.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))


              precision    recall  f1-score   support

           0       0.24      0.29      0.26       201
           1       0.37      0.36      0.36       448
           2       0.31      0.23      0.26       341
           3       0.44      0.42      0.43       482
           4       0.34      0.47      0.40       234

    accuracy                           0.36      1706
   macro avg       0.34      0.35      0.34      1706
weighted avg       0.36      0.36      0.35      1706

[[ 59  87  21  23  11]
 [109 160  80  68  31]
 [ 40 104  77  79  41]
 [ 25  65  59 201 132]
 [ 14  15  12  82 111]]


### 4.2 Logistische Regression
#### 4.2.1. Unigram

In [102]:
uni_logreg = sklearn.linear_model.LogisticRegression(max_iter=10000)
uni_logreg.fit(uni_features_train, train_labels)
#print (uni_logreg)

""" default state of the classifier: 
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""

# Predictions für unseren Test-Datensatz, Accuracy, Confusion-Matrix berechnen: 

predictions = uni_logreg.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))


              precision    recall  f1-score   support

           0       0.30      0.29      0.29       201
           1       0.37      0.37      0.37       448
           2       0.31      0.32      0.31       341
           3       0.45      0.43      0.44       482
           4       0.35      0.38      0.36       234

    accuracy                           0.37      1706
   macro avg       0.35      0.36      0.36      1706
weighted avg       0.37      0.37      0.37      1706

[[ 58  89  31  17   6]
 [ 88 164 104  68  24]
 [ 27  99 108  76  31]
 [ 16  71  87 208 100]
 [  5  25  24  92  88]]


#### 4.2.2. Log Regression: Unigrams + Bigrams

In [103]:
bi_logreg = sklearn.linear_model.LogisticRegression(max_iter=10000)
bi_logreg.fit(bi_features_train, train_labels)
#print (bi_logreg)

predictions = bi_logreg.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))
# Feature-Set Größe erhöht sich weil auch Bi-Gramme berücksichtigt 

              precision    recall  f1-score   support

           0       0.33      0.25      0.29       201
           1       0.40      0.45      0.42       448
           2       0.32      0.29      0.31       341
           3       0.46      0.49      0.48       482
           4       0.34      0.33      0.34       234

    accuracy                           0.39      1706
   macro avg       0.37      0.36      0.37      1706
weighted avg       0.39      0.39      0.39      1706

[[ 51  88  32  19  11]
 [ 65 201  97  63  22]
 [ 16 112 100  89  24]
 [ 17  77  62 236  90]
 [  4  27  22 104  77]]


#### 4.2.3. Log Regression: Unigrams + Bigrams + Trigrams

In [104]:
tri_logreg = sklearn.linear_model.LogisticRegression(max_iter=10000)
tri_logreg.fit(tri_features_train, train_labels)
#print (tri_logreg)

predictions = tri_logreg.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))

              precision    recall  f1-score   support

           0       0.37      0.23      0.29       201
           1       0.41      0.47      0.44       448
           2       0.33      0.31      0.32       341
           3       0.46      0.50      0.48       482
           4       0.35      0.30      0.33       234

    accuracy                           0.40      1706
   macro avg       0.38      0.37      0.37      1706
weighted avg       0.39      0.40      0.39      1706

[[ 47  89  33  21  11]
 [ 48 212 100  70  18]
 [ 15 109 106  89  22]
 [ 13  83  63 243  80]
 [  4  29  24 106  71]]


### 4.3 Linear Support Vector Machine (LSVM) 
#### 4.3.1. Unigram 

In [None]:
uni_lsvm = sklearn.svm.LinearSVC()
uni_lsvm.fit(uni_features_train, train_labels)
 
predictions = uni_lsvm.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))

              precision    recall  f1-score   support

           0       0.36      0.29      0.32       227
           1       0.37      0.37      0.37       432
           2       0.23      0.24      0.23       313
           3       0.40      0.41      0.40       461
           4       0.41      0.43      0.42       273

    accuracy                           0.36      1706
   macro avg       0.35      0.35      0.35      1706
weighted avg       0.36      0.36      0.36      1706

[[ 65  86  38  26  12]
 [ 64 161 116  63  28]
 [ 24  94  74  92  29]
 [ 23  76  75 189  98]
 [  5  24  23 103 118]]


#### 4.3.2. LSVM: Unigram + Bigram 

In [None]:
bi_lsvm = sklearn.svm.LinearSVC()
bi_lsvm.fit(bi_features_train, train_labels)
 
predictions = bi_lsvm.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))

              precision    recall  f1-score   support

           0       0.39      0.27      0.32       227
           1       0.39      0.43      0.41       432
           2       0.27      0.29      0.28       313
           3       0.41      0.44      0.43       461
           4       0.44      0.38      0.41       273

    accuracy                           0.38      1706
   macro avg       0.38      0.36      0.37      1706
weighted avg       0.38      0.38      0.38      1706

[[ 62  94  29  31  11]
 [ 52 187 101  70  22]
 [ 21  96  91  83  22]
 [ 18  77  83 204  79]
 [  6  23  30 109 105]]


#### 4.3.3. LSVM: Unigram + Bigram + Trigram

In [None]:
tri_lsvm = sklearn.svm.LinearSVC()
tri_lsvm.fit(tri_features_train, train_labels)
 
predictions = tri_lsvm.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))

              precision    recall  f1-score   support

           0       0.44      0.27      0.34       227
           1       0.41      0.45      0.43       432
           2       0.26      0.30      0.28       313
           3       0.40      0.44      0.42       461
           4       0.46      0.37      0.41       273

    accuracy                           0.38      1706
   macro avg       0.39      0.37      0.37      1706
weighted avg       0.39      0.38      0.38      1706

[[ 62  86  34  35  10]
 [ 38 195 106  74  19]
 [ 23  96  93  86  15]
 [ 13  78  93 203  74]
 [  6  26  32 107 102]]
