# Sentiment Analysis Bag-of-Words

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## 1. Data Loading

In [2]:
import random
data = pd.read_csv ("train.tsv", sep = '\t')

# Teilsätze rausschmeißen
data = data.groupby('SentenceId').first().reset_index()
data.head()


# so Daten laden, wenn das test_set Sentiment Labels hätte:
#train_set = pd.read_csv ("train.tsv", sep = '\t')
#test_set= pd.read_csv ("test.tsv", sep = '\t')
#pd.set_option('display.max_colwidth', None)
#test_set.head(10)

Unnamed: 0,SentenceId,PhraseId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,64,"This quiet , introspective and entertaining in...",4
2,3,82,"Even fans of Ismail Merchant 's work , I suspe...",1
3,4,117,A positively thrilling combination of ethnogra...,3
4,5,157,Aggressive self-glorification and a manipulati...,1


In [3]:
data.Sentiment.value_counts()

# Daten ausbalancieren im Trainings-Set? 
# Jeder Sentiment Value sollte gleiche Anzahl an Samples haben 

3    2321
1    2200
2    1655
4    1281
0    1072
Name: Sentiment, dtype: int64

## 2. Data cleaning

In [4]:
import nltk
# nltk.download('stopwords')
# pip install nltk
import re


# Für Sentimentanalyse zählen nur Wörter  
def keep_only_letters(text):
    text=re.sub(r'[^a-zA-Z\s]','',text)
    return text
 
# Groß- und Kleinschreibung egal 
def convert_to_lowercase(text):
    return text.lower()
 
def clean_reviews(text):
    text = keep_only_letters(text)
    text = convert_to_lowercase(text)
    return text

data['Phrase'] = data['Phrase'].apply(clean_reviews)
# train_set['Phrase'] = train_set['Phrase'].apply(clean_reviews)
# test_set['Phrase'] = test_set['Phrase'].apply(clean_reviews)

# Stop Words definition
english_stop_words = nltk.corpus.stopwords.words('english')
print(len(english_stop_words))
print (english_stop_words[:20])

# Stop Words removal
def remove_stop_words(text):
    for stopword in english_stop_words:
        stopword = ' ' + stopword + ' '
        text = text.replace(stopword, ' ')
    return text
 

data['Phrase'] = data['Phrase'].apply(remove_stop_words) 
#train_set['Phrase'] = train_set['Phrase'].apply(remove_stop_words)
#test_set['Phrase'] = test_set['Phrase'].apply(remove_stop_words)


# Stemming
def text_stemming(text):
    stemmer = nltk.porter.PorterStemmer()
    stemmed = ' '.join([stemmer.stem(token) for token in text.split()])
    return stemmed

data['Phrase'] = data['Phrase'].apply(text_stemming) 
#train_set['Phrase'] = train_set['Phrase'].apply(text_stemming)
#test_set['Phrase'] = test_set['Phrase'].apply(text_stemming)

#train_set.head(10)
#train_set.shape
#test_set.shape

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [5]:
# Train Test Split
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data, test_size=0.2)
train_set.Sentiment.value_counts()

3    1849
1    1764
2    1330
4    1024
0     856
Name: Sentiment, dtype: int64

In [6]:
# Daten balancieren

Sentiment_groups = train_set.Sentiment.value_counts()
for i in range(5):
  while Sentiment_groups[i] < Sentiment_groups[3]:
    
    sent_i_rows = train_set.loc[train_set['Sentiment'] == i]
    adding_row = sent_i_rows.sample()
    train_set = train_set.append(adding_row, ignore_index=True)

    Sentiment_groups = train_set.Sentiment.value_counts()

train_set.Sentiment.value_counts()

3    1849
4    1849
0    1849
1    1849
2    1849
Name: Sentiment, dtype: int64

## 3. Text Vectorization
CountVectorizer aus sklearn um bag-of-words Darstellung von unserem Trainings- und Testset zu erhalten
naive bag-of-words text vectorization

Nur Trainingsdatensatz zur Definition des Vokabulars heranziehen und 
das gleiche Vokabular zur Darstellung des Test-Datensatzes verwenden
-> Vektorizer an Trainingsdaten anpassen und zur Transformation der Testdaten verwenden

weighted version of BOW ausprobieren?

### N-Grams

Unigramme: Alle eindeutigen Wörter in einem Dokument

BiGramme: Alle Permutationen von zwei aufeinanderfolgenden Wörtern in einem Dokument

TriGrams: Alle Permutationen von drei aufeinanderfolgenden Wörtern in einem Dokument

In [7]:
import sklearn
# convert text data to numeric

# Argument binary=False: Vocabel-Vector mit term-frequency füllen
# binary = True: Vocabel-Vector mit Vorhandensein der Token füllen (1 vorhanden, 0 nicht vorhanden) 
# ngram _range = Unigram: (1,1); Bigram: (1,2); Trigram: (1,3)

vectorizer_uni = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,1))
vectorizer_bi = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,2))
vectorizer_tri = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,3))

In [8]:
uni_features_train = vectorizer_uni.fit_transform(train_set['Phrase'])
uni_features_test = vectorizer_uni.transform(test_set['Phrase'])
#print (uni_features_train.shape, uni_features_test.shape)

bi_features_train = vectorizer_bi.fit_transform(train_set['Phrase'])
bi_features_test = vectorizer_bi.transform(test_set['Phrase'])
#print (bi_features_train.shape, bi_features_test.shape)

tri_features_train = vectorizer_tri.fit_transform(train_set['Phrase'])
tri_features_test = vectorizer_tri.transform(test_set['Phrase'])
#print (tri_features_train.shape, tri_features_test.shape)

Unigram:
(156060, 10998) (66292, 10998): 10998 einzigartige englische token in unserem Vokabular (abgeleitet aus Trainingsdatensatz)
Jeder Token wird durch eine Spalte im Datensatz repräsentiert
Für jedes Review im Datensatz wird die Frequency der Token (term-frequency) durch Vokabel-Vector der Größe 10998 dargestellt.
= Daher haben wir 156060 solcher Vektoren in unserem Trainings-Datensatz und 66292 in unserem Test-Datensatz = Anzahl der Reviews 

In [9]:
train_labels = train_set['Sentiment']
test_labels = test_set['Sentiment']

## 4. Klassifizierungsmodelle trainieren
### 4.1.  Naive Bayes
#### 4.1.1. Unigram (Logistic Regression classifier on unigram features)

In [10]:
from sklearn.naive_bayes import MultinomialNB
 
vectorizer_uni = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,1))
uni_features_train = vectorizer_uni.fit_transform(train_set['Phrase'])
uni_features_test = vectorizer_uni.transform(test_set['Phrase'])

uni_nb = MultinomialNB()
uni_nb.fit(uni_features_train, train_labels)
 
predictions = uni_nb.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))


              precision    recall  f1-score   support

           0       0.30      0.38      0.34       216
           1       0.41      0.42      0.41       436
           2       0.27      0.19      0.23       325
           3       0.43      0.39      0.41       472
           4       0.39      0.49      0.43       257

    accuracy                           0.37      1706
   macro avg       0.36      0.37      0.36      1706
weighted avg       0.37      0.37      0.37      1706

[[ 82  81  23  19  11]
 [ 98 182  63  56  37]
 [ 59  95  63  79  29]
 [ 29  70  70 186 117]
 [  5  21  16  90 125]]


#### 4.1.2 Naive Bayes: Unigram + Bigram

In [11]:
#vectorizer_bi = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,2))
#bi_features_train = vectorizer_bi.fit_transform(train_set['Phrase'])
#bi_features_test = vectorizer_bi.transform(test_set['Phrase'])

bi_nb = MultinomialNB()
bi_nb.fit(bi_features_train, train_labels)
 
predictions = bi_nb.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))


              precision    recall  f1-score   support

           0       0.28      0.35      0.31       216
           1       0.40      0.41      0.41       436
           2       0.30      0.20      0.24       325
           3       0.43      0.42      0.42       472
           4       0.38      0.46      0.42       257

    accuracy                           0.37      1706
   macro avg       0.36      0.37      0.36      1706
weighted avg       0.37      0.37      0.37      1706

[[ 75  85  25  22   9]
 [103 180  54  61  38]
 [ 52  95  65  85  28]
 [ 28  69  61 196 118]
 [  8  21  14  96 118]]


#### 4.1.3 Naive Bayes: Unigram + Bigram + Trigram

In [12]:
tri_nb = MultinomialNB()
tri_nb.fit(tri_features_train, train_labels)
 
predictions = tri_nb.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))


              precision    recall  f1-score   support

           0       0.28      0.35      0.31       216
           1       0.40      0.42      0.41       436
           2       0.31      0.20      0.25       325
           3       0.43      0.42      0.42       472
           4       0.37      0.46      0.41       257

    accuracy                           0.37      1706
   macro avg       0.36      0.37      0.36      1706
weighted avg       0.37      0.37      0.37      1706

[[ 75  87  23  21  10]
 [101 181  53  64  37]
 [ 53  93  66  83  30]
 [ 27  70  56 198 121]
 [  8  21  13  98 117]]


### 4.2 Logistische Regression
#### 4.2.1. Unigram

In [13]:
uni_logreg = sklearn.linear_model.LogisticRegression(max_iter=10000)
uni_logreg.fit(uni_features_train, train_labels)
#print (uni_logreg)

""" default state of the classifier: 
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""

# Predictions für unseren Test-Datensatz, Accuracy, Confusion-Matrix berechnen: 

predictions = uni_logreg.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))


              precision    recall  f1-score   support

           0       0.32      0.33      0.33       216
           1       0.40      0.41      0.40       436
           2       0.30      0.30      0.30       325
           3       0.41      0.39      0.40       472
           4       0.37      0.37      0.37       257

    accuracy                           0.37      1706
   macro avg       0.36      0.36      0.36      1706
weighted avg       0.37      0.37      0.37      1706

[[ 71  85  27  24   9]
 [ 80 180  82  64  30]
 [ 37  92  99  69  28]
 [ 25  75  95 182  95]
 [  7  21  31 103  95]]


#### 4.2.2. Log Regression: Unigrams + Bigrams

In [14]:
bi_logreg = sklearn.linear_model.LogisticRegression(max_iter=10000)
bi_logreg.fit(bi_features_train, train_labels)
#print (bi_logreg)

predictions = bi_logreg.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))
# Feature-Set Größe erhöht sich weil auch Bi-Gramme berücksichtigt 

              precision    recall  f1-score   support

           0       0.39      0.29      0.33       216
           1       0.40      0.46      0.43       436
           2       0.32      0.31      0.32       325
           3       0.41      0.44      0.43       472
           4       0.39      0.34      0.36       257

    accuracy                           0.39      1706
   macro avg       0.38      0.37      0.37      1706
weighted avg       0.39      0.39      0.38      1706

[[ 62  97  30  20   7]
 [ 56 201  78  77  24]
 [ 24  99 100  83  19]
 [ 12  85  76 209  90]
 [  4  24  24 117  88]]


#### 4.2.3. Log Regression: Unigrams + Bigrams + Trigrams

In [15]:
tri_logreg = sklearn.linear_model.LogisticRegression(max_iter=10000)
tri_logreg.fit(tri_features_train, train_labels)
#print (tri_logreg)

predictions = tri_logreg.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))

              precision    recall  f1-score   support

           0       0.42      0.27      0.33       216
           1       0.39      0.47      0.43       436
           2       0.31      0.30      0.31       325
           3       0.42      0.46      0.44       472
           4       0.39      0.33      0.36       257

    accuracy                           0.39      1706
   macro avg       0.39      0.36      0.37      1706
weighted avg       0.39      0.39      0.38      1706

[[ 58  98  32  21   7]
 [ 47 203  85  79  22]
 [ 21 106  97  85  16]
 [  9  86  72 217  88]
 [  3  26  24 119  85]]


### 4.3 Linear Support Vector Machine (LSVM) 
#### 4.3.1. Unigram 

In [16]:
uni_lsvm = sklearn.svm.LinearSVC()
uni_lsvm.fit(uni_features_train, train_labels)
 
predictions = uni_lsvm.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))

              precision    recall  f1-score   support

           0       0.29      0.31      0.30       216
           1       0.36      0.36      0.36       436
           2       0.28      0.27      0.27       325
           3       0.37      0.36      0.36       472
           4       0.35      0.37      0.36       257

    accuracy                           0.34      1706
   macro avg       0.33      0.34      0.33      1706
weighted avg       0.34      0.34      0.34      1706

[[ 68  86  25  27  10]
 [ 82 159  90  80  25]
 [ 39  95  88  72  31]
 [ 34  69  90 168 111]
 [  9  27  24 102  95]]


#### 4.3.2. LSVM: Unigram + Bigram 

In [17]:
bi_lsvm = sklearn.svm.LinearSVC()
bi_lsvm.fit(bi_features_train, train_labels)
 
predictions = bi_lsvm.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))

              precision    recall  f1-score   support

           0       0.36      0.28      0.31       216
           1       0.38      0.43      0.40       436
           2       0.29      0.29      0.29       325
           3       0.42      0.42      0.42       472
           4       0.35      0.33      0.34       257

    accuracy                           0.37      1706
   macro avg       0.36      0.35      0.35      1706
weighted avg       0.37      0.37      0.36      1706

[[ 60  99  34  14   9]
 [ 58 187  91  71  29]
 [ 35  99  95  77  19]
 [ 12  79  84 198  99]
 [  3  31  29 110  84]]


#### 4.3.3. LSVM: Unigram + Bigram + Trigram

In [18]:
tri_lsvm = sklearn.svm.LinearSVC()
tri_lsvm.fit(tri_features_train, train_labels)
 
predictions = tri_lsvm.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))

              precision    recall  f1-score   support

           0       0.39      0.25      0.31       216
           1       0.38      0.44      0.41       436
           2       0.30      0.32      0.31       325
           3       0.43      0.44      0.43       472
           4       0.37      0.34      0.35       257

    accuracy                           0.38      1706
   macro avg       0.37      0.36      0.36      1706
weighted avg       0.38      0.38      0.37      1706

[[ 55 101  35  18   7]
 [ 49 192  97  67  31]
 [ 20 101 103  79  22]
 [ 13  78  84 206  91]
 [  3  29  26 112  87]]
