# Sentiment Analysis Bag-of-Words

In [66]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## 1. Data Loading

In [67]:
import random
data = pd.read_csv ("train.tsv", sep = '\t')

# Teilsätze rausschmeißen
data = data.groupby('SentenceId').first().reset_index()
data.head()


# so Daten laden, wenn das test_set Sentiment Labels hätte:
#train_set = pd.read_csv ("train.tsv", sep = '\t')
#test_set= pd.read_csv ("test.tsv", sep = '\t')
#pd.set_option('display.max_colwidth', None)
#test_set.head(10)

Unnamed: 0,SentenceId,PhraseId,Phrase,Sentiment
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,2,64,"This quiet , introspective and entertaining independent is worth seeking .",4
2,3,82,"Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one .",1
3,4,117,"A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera .",3
4,5,157,Aggressive self-glorification and a manipulative whitewash .,1


In [68]:
data.Sentiment.value_counts()

# Daten ausbalancieren im Trainings-Set? 
# Jeder Sentiment Value sollte gleiche Anzahl an Samples haben 

3    2321
1    2200
2    1655
4    1281
0    1072
Name: Sentiment, dtype: int64

## 2. Data cleaning

In [69]:
import nltk
# nltk.download('stopwords')
# pip install nltk
import re


# Für Sentimentanalyse zählen nur Wörter  
def keep_only_letters(text):
    text=re.sub(r'[^a-zA-Z\s]','',text)
    return text
 
# Groß- und Kleinschreibung egal 
def convert_to_lowercase(text):
    return text.lower()
 
def clean_reviews(text):
    text = keep_only_letters(text)
    text = convert_to_lowercase(text)
    return text

data['Phrase'] = data['Phrase'].apply(clean_reviews)
# train_set['Phrase'] = train_set['Phrase'].apply(clean_reviews)
# test_set['Phrase'] = test_set['Phrase'].apply(clean_reviews)

# Stop Words definition
english_stop_words = nltk.corpus.stopwords.words('english')
print(len(english_stop_words))
print (english_stop_words[:20])

# Stop Words removal
def remove_stop_words(text):
    for stopword in english_stop_words:
        stopword = ' ' + stopword + ' '
        text = text.replace(stopword, ' ')
    return text
 

data['Phrase'] = data['Phrase'].apply(remove_stop_words) 
#train_set['Phrase'] = train_set['Phrase'].apply(remove_stop_words)
#test_set['Phrase'] = test_set['Phrase'].apply(remove_stop_words)


# Stemming
def text_stemming(text):
    stemmer = nltk.porter.PorterStemmer()
    stemmed = ' '.join([stemmer.stem(token) for token in text.split()])
    return stemmed

data['Phrase'] = data['Phrase'].apply(text_stemming) 
#train_set['Phrase'] = train_set['Phrase'].apply(text_stemming)
#test_set['Phrase'] = test_set['Phrase'].apply(text_stemming)

#train_set.head(10)
#train_set.shape
#test_set.shape

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [70]:
# Train Test Split
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data, test_size=0.2)

## 3. Text Vectorization
CountVectorizer aus sklearn um bag-of-words Darstellung von unserem Trainings- und Testset zu erhalten
naive bag-of-words text vectorization

Nur Trainingsdatensatz zur Definition des Vokabulars heranziehen und 
das gleiche Vokabular zur Darstellung des Test-Datensatzes verwenden
-> Vektorizer an Trainingsdaten anpassen und zur Transformation der Testdaten verwenden

weighted version of BOW ausprobieren?

Unigramme: Alle eindeutigen Wörter in einem Dokument

BiGramme: Alle Permutationen von zwei aufeinanderfolgenden Wörtern in einem Dokument

TriGrams: Alle Permutationen von drei aufeinanderfolgenden Wörtern in einem Dokument

In [71]:
import sklearn

# Argument binary=False: Vocabel-Vector mit term-frequency füllen
# binary = True: Vocabel-Vector mit Vorhandensein der Token füllen (1 vorhanden, 0 nicht vorhanden) 
# ngram _range = Unigram: (1,1); Bigram: (1,2); Trigram: (1,3)

vectorizer_uni = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,1))
vectorizer_bi = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,2))
vectorizer_tri = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,3))

uni_features_train = vectorizer_uni.fit_transform(train_set['Phrase'])
uni_features_test = vectorizer_uni.transform(test_set['Phrase'])
#print (uni_features_train.shape, uni_features_test.shape)

bi_features_train = vectorizer_bi.fit_transform(train_set['Phrase'])
bi_features_test = vectorizer_bi.transform(test_set['Phrase'])
#print (bi_features_train.shape, bi_features_test.shape)

tri_features_train = vectorizer_tri.fit_transform(train_set['Phrase'])
tri_features_test = vectorizer_tri.transform(test_set['Phrase'])
#print (tri_features_train.shape, tri_features_test.shape)


Unigram:
(156060, 10998) (66292, 10998): 10998 einzigartige englische token in unserem Vokabular (abgeleitet aus Trainingsdatensatz)
Jeder Token wird durch eine Spalte im Datensatz repräsentiert
Für jedes Review im Datensatz wird die Frequency der Token (term-frequency) durch Vokabel-Vector der Größe 10998 dargestellt.
= Daher haben wir 156060 solcher Vektoren in unserem Trainings-Datensatz und 66292 in unserem Test-Datensatz = Anzahl der Reviews 

In [72]:
train_labels = train_set['Sentiment']
test_labels = test_set['Sentiment']

## 4. Klassifizierungsmodelle trainieren
### 4.1.  Naive Bayes
#### 4.1.1. Unigram (Logistic Regression classifier on unigram features)

In [74]:
from sklearn.naive_bayes import MultinomialNB
 
vectorizer_uni = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,1))
uni_features_train = vectorizer_uni.fit_transform(train_set['Phrase'])
uni_features_test = vectorizer_uni.transform(test_set['Phrase'])

uni_nb = MultinomialNB()
uni_nb.fit(uni_features_train, train_labels)
 
predictions = uni_nb.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))


              precision    recall  f1-score   support

           0       0.54      0.18      0.26       217
           1       0.41      0.56      0.47       457
           2       0.29      0.16      0.21       305
           3       0.42      0.63      0.51       475
           4       0.44      0.24      0.31       252

    accuracy                           0.41      1706
   macro avg       0.42      0.35      0.35      1706
weighted avg       0.41      0.41      0.38      1706

[[ 38 117  17  40   5]
 [ 30 255  52 108  12]
 [  1 130  50 105  19]
 [  1  91  43 297  43]
 [  0  30  10 151  61]]


#### 4.1.2 Naive Bayes: Unigram + Bigram

In [75]:
#vectorizer_bi = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,2))
#bi_features_train = vectorizer_bi.fit_transform(train_set['Phrase'])
#bi_features_test = vectorizer_bi.transform(test_set['Phrase'])

bi_nb = MultinomialNB()
bi_nb.fit(bi_features_train, train_labels)
 
predictions = bi_nb.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))


              precision    recall  f1-score   support

           0       0.55      0.07      0.13       217
           1       0.41      0.63      0.50       457
           2       0.29      0.10      0.15       305
           3       0.41      0.70      0.52       475
           4       0.47      0.12      0.19       252

    accuracy                           0.41      1706
   macro avg       0.43      0.32      0.30      1706
weighted avg       0.42      0.41      0.35      1706

[[ 16 145  10  46   0]
 [ 11 288  30 121   7]
 [  2 138  30 130   5]
 [  0  95  25 333  22]
 [  0  34   8 180  30]]


#### 4.1.3 Naive Bayes: Unigram + Bigram + Trigram

In [76]:
tri_nb = MultinomialNB()
tri_nb.fit(tri_features_train, train_labels)
 
predictions = tri_nb.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))


              precision    recall  f1-score   support

           0       0.53      0.07      0.13       217
           1       0.41      0.62      0.50       457
           2       0.31      0.10      0.15       305
           3       0.42      0.73      0.53       475
           4       0.51      0.12      0.20       252

    accuracy                           0.41      1706
   macro avg       0.44      0.33      0.30      1706
weighted avg       0.42      0.41      0.35      1706

[[ 16 143   9  49   0]
 [ 11 285  31 123   7]
 [  3 136  29 133   4]
 [  0  90  20 346  19]
 [  0  34   5 182  31]]


### 4.2 Logistische Regression
#### 4.2.1. Unigram

In [73]:
uni_logreg = sklearn.linear_model.LogisticRegression()
uni_logreg.fit(uni_features_train, train_labels)
#print (uni_logreg)

""" default state of the classifier: 
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""

# Predictions für unseren Test-Datensatz, Accuracy, Confusion-Matrix berechnen: 

predictions = uni_logreg.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))


              precision    recall  f1-score   support

           0       0.39      0.25      0.31       217
           1       0.41      0.46      0.43       457
           2       0.25      0.28      0.27       305
           3       0.45      0.50      0.47       475
           4       0.43      0.33      0.38       252

    accuracy                           0.39      1706
   macro avg       0.39      0.36      0.37      1706
weighted avg       0.40      0.39      0.39      1706

[[ 54  83  48  22  10]
 [ 56 209  98  78  16]
 [ 12 110  86  80  17]
 [ 12  78  78 239  68]
 [  3  24  28 113  84]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### 4.2.2. Log Regression: Unigrams + Bigrams

In [44]:
bi_logreg = sklearn.linear_model.LogisticRegression()
bi_logreg.fit(uni_features_train, train_labels)
#print (bi_logreg)

predictions = bi_logreg.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=[0, 1, 2, 3, 4]))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))
# Feature-Set Größe erhöht sich weil auch Bi-Gramme berücksichtigt 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: X has 64304 features, but LogisticRegression is expecting 9936 features as input.

#### 4.2.3. Log Regression: Unigrams + Bigrams + Trigrams

In [None]:
tri_logreg = sklearn.linear_model.LogisticRegression()
tri_logreg.fit(tri_features_train, train_labels)
#print (tri_logreg)

predictions = tri_logreg.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))

### 4.3 Linear Support Vector Machine (LSVM) 
#### 4.3.1. Unigram 

In [60]:
uni_lsvm = sklearn.svm.LinearSVC()
uni_lsvm.fit(uni_features_train, train_labels)
 
predictions = uni_lsvm.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))

              precision    recall  f1-score   support

           0       0.27      0.24      0.25       225
           1       0.31      0.34      0.32       427
           2       0.27      0.24      0.25       339
           3       0.38      0.44      0.40       427
           4       0.47      0.41      0.44       288

    accuracy                           0.34      1706
   macro avg       0.34      0.33      0.33      1706
weighted avg       0.34      0.34      0.34      1706

[[ 53 105  33  28   6]
 [ 79 145  84  95  24]
 [ 29 116  80  90  24]
 [ 29  64  72 186  76]
 [ 10  37  28  96 117]]


#### 4.3.2. LSVM: Bigram 

In [61]:
bi_lsvm = sklearn.svm.LinearSVC()
bi_lsvm.fit(bi_features_train, train_labels)
 
predictions = bi_lsvm.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','1','2','3','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1, 2, 3, 4]))

ValueError: X has 10564 features, but LinearSVC is expecting 63400 features as input.