## Data Import


In [11]:
import os
os.chdir("C:/Users/Steffen/Arbeit/99_opencampus/13_ASCOT+/NLP-Paper/Special-Issue-NLP/azubi_evaluation_ml_with_tensorflow/data")

import numpy
import pandas as pd
data = pd.read_csv("data.csv", encoding="UTF-8")

## Data Preparation

In [12]:
# split the data into two pieces, one for training and one for testing (a validation data set is not used in the given regression) 
from sklearn.model_selection import train_test_split
train_text_series, test_text_series, train_label_series, test_label_series = train_test_split(data["text"], data["label"], test_size = 0.30, random_state = 42)

# Converting the data from series objects into regular lists to allow for easy iteration through the data elements
train_text = train_text_series.to_list()
test_text = test_text_series.to_list()
train_label = train_label_series.to_list()
test_label = test_label_series.to_list()

# Sample sizes:
print("Size of the training dataset: ", len(train_text))
print("Size of the test dataset: ", len(test_text))

Size of the training dataset:  1461
Size of the test dataset:  627


## Text Preprocessing


In [13]:
import numpy as np
from utils import process_mail, build_freqs, extract_features
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steffen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

The given function process_mail() tokenizes the mail into individual words, removes stop words and applies stemming.


In [14]:
# test the process_mail function
print('This is an example of a mail: \n', train_text[10])
print('\nThis is an example of the processed version of the above mail: \n', process_mail(train_text[10]))

This is an example of a mail: 
 Hallo Frau Meier,

im Anhang finden Sie den  Auszug aus der Deckungsbeitragsrechnung. Ich habe die relevanten Abweichungen berechnet und drei besonders Hohe Abweichungen markiert. Diese übersteigen unsere toleranz von 20 %. 

Wie sie sehen handelt es sich hierbei besonders um die Sattelstütze. Jedoch ist es ja eine positive Abweichung für uns, wir haben einen höheren Preis veranschlagt als unsere Kosten am Ende waren. Hier haben entweder Gespräche mit dem Lieferanten stattgefunden oder wir haben die Plankosten auf einer falschen Grundlage berechnet.
Deswegen sollte man sich den Lieferanten Fli(I)nk noch einmal anschauen, er hat einen deutlich höheren Stückpreis verlangt an geplant war. Eventuell sollte der Einkauf hier Gespräche führen um den Kaufpreis ein wenig zu senken oder das nächste Mal müsste sofort ein realistischer Preis in die Planung eingehen.

Ausgehend von den absoluten Werten und nicht den relativen muss ich allerdings (obwohl 20 % Abweichu

Create a frequency dictionary using the build_freqs() function
* The `freqs` dictionary is the frequency dictionary that's being built. 
* The key is the tuple (word, label), such as ("happy",1) or ("happy",0).  The value stored for each key is the count of how many times the word "happy" was associated with a positive label, or how many times "happy" was associated with a negative label.

In [15]:
# create frequency dictionary
freqs = build_freqs(train_text, train_label)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 8334


In [16]:
# Testing the extract_features function
# Two features are generated: the first is a score for the polite words in a mail and the second a score for the impolite words.
# The scores are calculated by looking up the respective counts of the words in the frequency dictionary created above and summing them up.
tmp1 = extract_features(train_text[2], freqs)
print(tmp1)


[[4783.  206.]]


## Model Estimation
- Extracting the features for the test data
- Calibrating the logistic regression Model
- Checking the training data results

In [17]:
# Extracting the features for the training data

train_features = np.zeros((len(train_text), 2))
for i in range(len(train_text)):
    train_features[i, :]= extract_features(train_text[i], freqs)


In [18]:
# Calibrating the logistic regression Model

# Since the classes are very unbalanced the class_weight parameter is set to balanced.
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(random_state=42, class_weight='balanced').fit(train_features, train_label)


In [19]:
# Checking the training data results

from sklearn import metrics

# Mean accuracy
print("Mean Accuracy:\n", log_model.score(train_features, train_label))

# Confusion matrix
print("Confusion Matrix:\n", metrics.confusion_matrix(train_label, log_model.predict(train_features)))

# F1 Score
print("F1 Score:\n", metrics.f1_score(train_label, log_model.predict(train_features)))

Mean Accuracy:
 0.86652977412731
Confusion Matrix:
 [[  90   10]
 [ 185 1176]]
F1 Score:
 0.9234393404004712


## Model Evaluation
- Extracting the features for the test data set
- Checking the test data results

In [20]:
# Extracting the features for the test data

test_features = np.zeros((len(test_text), 2))
for i in range(len(test_text)):
    test_features[i, :]= extract_features(test_text[i], freqs)

In [21]:
# Checking the test data results

from sklearn import metrics

# Mean accuracy
print("Mean Accuracy:\n", log_model.score(test_features, test_label))

# Confusion matrix
print("Confusion Matrix:\n", metrics.confusion_matrix(test_label, log_model.predict(test_features)))

# F1 Score
print("F1 Score:\n", metrics.f1_score(test_label, log_model.predict(test_features)))

# Precision
print("Precision:\n", metrics.precision_score(test_label, log_model.predict(test_features)))

# Recall
print("Recall:\n", metrics.recall_score(test_label, log_model.predict(test_features)))

# ROC AUC Score
print("ROC AUC:\n", metrics.roc_auc_score(test_label, log_model.predict(test_features)))

# Cohen's Kappa Score
print("Cohen's Kappa:\n", metrics.cohen_kappa_score(test_label, log_model.predict(test_features)))

Mean Accuracy:
 0.8373205741626795
Confusion Matrix:
 [[ 31  15]
 [ 87 494]]
F1 Score:
 0.9064220183486238
Precision:
 0.9705304518664047
Recall:
 0.8502581755593803
ROC AUC:
 0.7620856095188207
Cohen's Kappa:
 0.3046361936241464
