In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/MyDrive/LSC_Sentiment_Analysis'

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1iBSu5iMtGl5Ys7feuwPxbrIUCugIipJf/LSC_Sentiment_Analysis


# Sentiment Analysis Bag-of-Words
## Movie Reviews (smaller Data-Set)

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import nltk
import re
import csv
from evaluation import test_statistics


## 1. Data Loading

In [None]:
train_set = pd.read_csv ("../data/datasets_mr/Trainset_complete.csv", sep = ';')
test_set = pd.read_csv ("../data/datasets_mr/Testset.csv", sep = ';')

# so Daten laden, wenn das test_set Sentiment Labels hätte:
#train_set = pd.read_csv ("train.tsv", sep = '\t')
#test_set= pd.read_csv ("test.tsv", sep = '\t')
#pd.set_option('display.max_colwidth', None)
#test_set.head(10)

In [None]:
train_set.Sentiment.value_counts()

# Daten ausbalancieren im Trainings-Set?
# Jeder Sentiment Value sollte gleiche Anzahl an Samples haben

## 2. Data cleaning

In [None]:
import nltk
# nltk.download('stopwords')
# pip install nltk
import re


# Für Sentimentanalyse zählen nur Wörter
def keep_only_letters(text):
    text=re.sub(r'[^a-zA-Z\s]','',text)
    return text

# Groß- und Kleinschreibung egal
def convert_to_lowercase(text):
    return text.lower()

def clean_reviews(text):
    text = keep_only_letters(text)
    text = convert_to_lowercase(text)
    return text


# Stop Words definition
english_stop_words = nltk.corpus.stopwords.words('english')
print(len(english_stop_words))
print (english_stop_words[:20])

# Stop Words removal
def remove_stop_words(text):
    for stopword in english_stop_words:
        stopword = ' ' + stopword + ' '
        text = text.replace(stopword, ' ')
    return text


# Stemming
def text_stemming(text):
    stemmer = nltk.porter.PorterStemmer()
    stemmed = ' '.join([stemmer.stem(token) for token in text.split()])
    return stemmed




train_set['Phrase'] = train_set['Phrase'].apply(clean_reviews)
# test_set['Phrase'] = test_set['Phrase'].apply(clean_reviews)

train_set['Phrase'] = train_set['Phrase'].apply(remove_stop_words)
# test_set['Phrase'] = test_set['Phrase'].apply(remove_stop_words)

train_set['Phrase'] = train_set['Phrase'].apply(text_stemming)
#test_set['Phrase'] = test_set['Phrase'].apply(text_stemming)

#train_set.head(10)
#train_set.shape
#test_set.shape

## 3. Text Vectorization
CountVectorizer aus sklearn um bag-of-words Darstellung von unserem Trainings- und Testset zu erhalten
naive bag-of-words text vectorization

Nur Trainingsdatensatz zur Definition des Vokabulars heranziehen und
das gleiche Vokabular zur Darstellung des Test-Datensatzes verwenden
-> Vektorizer an Trainingsdaten anpassen und zur Transformation der Testdaten verwenden

weighted version of BOW ausprobieren?

### N-Grams

Unigramme: Alle eindeutigen Wörter in einem Dokument

BiGramme: Alle Permutationen von zwei aufeinanderfolgenden Wörtern in einem Dokument

TriGrams: Alle Permutationen von drei aufeinanderfolgenden Wörtern in einem Dokument

In [None]:
import sklearn
# convert text data to numeric

# Argument binary=False: Vocabel-Vector mit term-frequency füllen
# binary = True: Vocabel-Vector mit Vorhandensein der Token füllen (1 vorhanden, 0 nicht vorhanden)
# ngram _range = Unigram: (1,1); Bigram: (1,2); Trigram: (1,3)

vectorizer_uni = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,1))
vectorizer_bi = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,2))
vectorizer_tri = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,3))

In [None]:
uni_features_train = vectorizer_uni.fit_transform(train_set['Phrase'])
uni_features_test = vectorizer_uni.transform(test_set['Phrase'])
#print (uni_features_train.shape, uni_features_test.shape)

bi_features_train = vectorizer_bi.fit_transform(train_set['Phrase'])
bi_features_test = vectorizer_bi.transform(test_set['Phrase'])
#print (bi_features_train.shape, bi_features_test.shape)

tri_features_train = vectorizer_tri.fit_transform(train_set['Phrase'])
tri_features_test = vectorizer_tri.transform(test_set['Phrase'])
#print (tri_features_train.shape, tri_features_test.shape)

Unigram:
(156060, 10998) (66292, 10998): 10998 einzigartige englische token in unserem Vokabular (abgeleitet aus Trainingsdatensatz)
Jeder Token wird durch eine Spalte im Datensatz repräsentiert
Für jedes Review im Datensatz wird die Frequency der Token (term-frequency) durch Vokabel-Vector der Größe 10998 dargestellt.
= Daher haben wir 156060 solcher Vektoren in unserem Trainings-Datensatz und 66292 in unserem Test-Datensatz = Anzahl der Reviews

In [None]:
train_labels = train_set['Sentiment']
test_labels = test_set['Sentiment']

## 4. Klassifizierungsmodelle trainieren
### 4.1.  Naive Bayes
#### 4.1.1. Unigram (Logistic Regression classifier on unigram features)

In [None]:
from sklearn.naive_bayes import MultinomialNB

vectorizer_uni = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,1))
uni_features_train = vectorizer_uni.fit_transform(train_set['Phrase'])
uni_features_test = vectorizer_uni.transform(test_set['Phrase'])

uni_nb = MultinomialNB()
uni_nb.fit(uni_features_train, train_labels)

predictions = uni_nb.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['-1','0','1']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[-1, 0, 1]))


#### 4.1.2 Naive Bayes: Unigram + Bigram

In [None]:
#vectorizer_bi = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,2))
#bi_features_train = vectorizer_bi.fit_transform(train_set['Phrase'])
#bi_features_test = vectorizer_bi.transform(test_set['Phrase'])

bi_nb = MultinomialNB()
bi_nb.fit(bi_features_train, train_labels)

predictions = bi_nb.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['-1','0','1']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[-1, 0, 1]))


#### 4.1.3 Naive Bayes: Unigram + Bigram + Trigram

In [None]:
tri_nb = MultinomialNB()
tri_nb.fit(tri_features_train, train_labels)

predictions = tri_nb.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['-1','0','1']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[-1, 0, 1]))


### 4.2 Logistische Regression
#### 4.2.1. Unigram

In [None]:
uni_logreg = sklearn.linear_model.LogisticRegression(max_iter=10000)
uni_logreg.fit(uni_features_train, train_labels)
#print (uni_logreg)

""" default state of the classifier:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""

# Predictions für unseren Test-Datensatz, Accuracy, Confusion-Matrix berechnen:

predictions = uni_logreg.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['-1','0','1']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[-1, 0, 1]))


#### 4.2.2. Log Regression: Unigrams + Bigrams

In [None]:
bi_logreg = sklearn.linear_model.LogisticRegression(max_iter=10000)
bi_logreg.fit(bi_features_train, train_labels)
#print (bi_logreg)

predictions = bi_logreg.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['-1','0','1']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[-1, 0, 1]))
# Feature-Set Größe erhöht sich weil auch Bi-Gramme berücksichtigt

#### 4.2.3. Log Regression: Unigrams + Bigrams + Trigrams

In [None]:
tri_logreg = sklearn.linear_model.LogisticRegression(max_iter=10000)
tri_logreg.fit(tri_features_train, train_labels)
#print (tri_logreg)

predictions = tri_logreg.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['-1','0','1']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[-1, 0, 1]))

### 4.3 Linear Support Vector Machine (LSVM)
#### 4.3.1. Unigram

In [None]:
uni_lsvm = sklearn.svm.LinearSVC()
uni_lsvm.fit(uni_features_train, train_labels)

predictions = uni_lsvm.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['-1','0','1']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[-1, 0, 1]))

#### 4.3.2. LSVM: Unigram + Bigram

In [None]:
bi_lsvm = sklearn.svm.LinearSVC()
bi_lsvm.fit(bi_features_train, train_labels)

predictions = bi_lsvm.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['-1','0','1']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[-1, 0, 1]))

#### 4.3.3. LSVM: Unigram + Bigram + Trigram

In [None]:
tri_lsvm = sklearn.svm.LinearSVC()
tri_lsvm.fit(tri_features_train, train_labels)

predictions = tri_lsvm.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['-1','0','1']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[-1, 0, 1]))

# Sentiment Analysis Bag-of-Words
## Tweets (Larger Data-Set)

In [None]:
import csv
with open("../data/tweets.csv") as csvdatei:
    data = pd.read_csv(csvdatei, delimiter=',')

data.columns = ['Sentiment','ID','date','flag','user','Phrase']
data.drop(['date','flag','user'], axis=1, inplace=True)
data = sklearn.utils.shuffle(data)

data.Sentiment.value_counts()

In [None]:
import nltk
# nltk.download('stopwords')
# pip install nltk
import re

# Zeitbedarf: 7m 14s


# Für Sentimentanalyse zählen nur Wörter
def keep_only_letters(text):
    text=re.sub(r'[^a-zA-Z\s]','',text)
    return text

# Groß- und Kleinschreibung egal
def convert_to_lowercase(text):
    return text.lower()

def clean_reviews(text):
    text = keep_only_letters(text)
    text = convert_to_lowercase(text)
    return text


# Stop Words definition
english_stop_words = nltk.corpus.stopwords.words('english')
#print(len(english_stop_words))
#print (english_stop_words[:20])

# Stop Words removal
def remove_stop_words(text):
    for stopword in english_stop_words:
        stopword = ' ' + stopword + ' '
        text = text.replace(stopword, ' ')
    return text


# Stemming
def text_stemming(text):
    stemmer = nltk.porter.PorterStemmer()
    stemmed = ' '.join([stemmer.stem(token) for token in text.split()])
    return stemmed




#data['Phrase'] = data['Phrase'].apply(clean_reviews)
# test_set['Phrase'] = test_set['Phrase'].apply(clean_reviews)

#data['Phrase'] = data['Phrase'].apply(remove_stop_words)
# test_set['Phrase'] = test_set['Phrase'].apply(remove_stop_words)

#data['Phrase'] = data['Phrase'].apply(text_stemming)
#test_set['Phrase'] = test_set['Phrase'].apply(text_stemming)

In [None]:
# Train Test Split
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data, test_size=0.2)

In [None]:
train_set.Sentiment.value_counts()

## 3. Text Vectorization

In [None]:
import sklearn
# convert text data to numeric

# Argument binary=False: Vocabel-Vector mit term-frequency füllen
# binary = True: Vocabel-Vector mit Vorhandensein der Token füllen (1 vorhanden, 0 nicht vorhanden)
# ngram _range = Unigram: (1,1); Bigram: (1,2); Trigram: (1,3)

vectorizer_uni = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,1))
vectorizer_bi = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,2))
vectorizer_tri = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,3))


In [None]:
# Dauer mit Cleaning: 3m 24s
# Dauer ohne Cleaning: 4m 46s

uni_features_train = vectorizer_uni.fit_transform(train_set['Phrase'])
uni_features_test = vectorizer_uni.transform(test_set['Phrase'])
#print (uni_features_train.shape, uni_features_test.shape)

bi_features_train = vectorizer_bi.fit_transform(train_set['Phrase'])
bi_features_test = vectorizer_bi.transform(test_set['Phrase'])
#print (bi_features_train.shape, bi_features_test.shape)

tri_features_train = vectorizer_tri.fit_transform(train_set['Phrase'])
tri_features_test = vectorizer_tri.transform(test_set['Phrase'])
#print (tri_features_train.shape, tri_features_test.shape)

In [None]:
train_labels = train_set['Sentiment']
test_labels = test_set['Sentiment']

## Klassifizierungsmodelle trainieren

In [None]:
# Naive Bayes Unigram
# mit Cleaning: Zeit: 2s Accuracy: 0.77
# ohne Cleaning: Zeit: 2s Accuracy: 0.78

from sklearn.naive_bayes import MultinomialNB

uni_nb = MultinomialNB()
uni_nb.fit(uni_features_train, train_labels)

predictions = uni_nb.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 4]))

In [None]:
# Naive Bayes Unigram + Bigram
# mit Cleaning: Zeit: 6.5s Accuracy: 0.78
# ohne Cleaning: Zeit: 6.5s Accuracy: 0.80

bi_nb = MultinomialNB()
bi_nb.fit(bi_features_train, train_labels)

predictions = bi_nb.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 4]))


In [None]:
# Naive Bayes Unigram + Bigram + Trigram
# mit Cleaning: Zeit: 4.2s Accuracy: 0.79
# mit Cleaning: Zeit: 4.2s Accuracy: 0.80

tri_nb = MultinomialNB()
tri_nb.fit(tri_features_train, train_labels)

predictions = tri_nb.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 4]))


In [None]:
# Logistische Regression Unigram
# mit Cleaning: Zeit: 5m 50s Accuracy: 0.78
# ohne Cleaning: Zeit: 7m 26s Accuracy: 0.80


uni_logreg = sklearn.linear_model.LogisticRegression(max_iter=10000)
uni_logreg.fit(uni_features_train, train_labels)

predictions = uni_logreg.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 4]))


In [None]:
# Logistische Regression Unigram + Bigram
# mit Cleaning: Zeit: 20m 7s Accuracy: 0.80


bi_logreg = sklearn.linear_model.LogisticRegression(max_iter=10000)
bi_logreg.fit(bi_features_train, train_labels)
#print (bi_logreg)

predictions = bi_logreg.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 4]))
# Feature-Set Größe erhöht sich weil auch Bi-Gramme berücksichtigt

In [None]:
# Logistische Regression Unigram + Bigram + Trigram
# mit Cleaning: Zeit: 36m 21s Accuracy: 0.80

tri_logreg = sklearn.linear_model.LogisticRegression(max_iter=10000)
tri_logreg.fit(tri_features_train, train_labels)
#print (tri_logreg)

predictions = tri_logreg.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 4]))

In [None]:
# Linear Support Vector Machine (LSVM) Unigram
# mit Cleaning: Zeit: 10m 12s Accuracy: 0.77

uni_lsvm = sklearn.svm.LinearSVC()
uni_lsvm.fit(uni_features_train, train_labels)

predictions = uni_lsvm.predict(uni_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 4]))

In [None]:
# Linear Support Vector Machine (LSVM) Unigram + Bigram
# mit Cleaning: Zeit: 11m 30s Accuracy: 0.79

bi_lsvm = sklearn.svm.LinearSVC()
bi_lsvm.fit(bi_features_train, train_labels)

predictions = bi_lsvm.predict(bi_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 4]))

In [None]:
# Linear Support Vector Machine (LSVM) Unigram + Bigram + Trigram
# mit Cleaning: Zeit: 13m 10s Accuracy: 0.79

tri_lsvm = sklearn.svm.LinearSVC()
tri_lsvm.fit(tri_features_train, train_labels)

predictions = tri_lsvm.predict(tri_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['0','4']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 4]))