# Telegram Analyse (Master-Thesis Teil Autoren-Klassifizierung)

Informatik Master

Maximilian Bundscherer

Beschreibung tbd.

## Arbeitsumgebung starten und Konfigurationen anwenden

### Jupyter Notebook Parameter

``C_USE_CACHE_FILE``: Vergleiche Notebook ``Telegram.ipynb`` (Daten aufbereiten)

In [None]:
# File name of the pickled DataFrame cache; it is read from dir_var_pandas_cache further below
C_USE_CACHE_FILE = "long-run-server-28-01.pkl"

In [None]:
## Laden von Abhängigkeiten

### Bibliotheken und Abhängigkeiten laden

Vergleiche Notebook ``Telegram.ipynb`` (Umgebung starten und Konfigurationen anwenden)

#### Abhängigkeiten vom Docker-Image, IO-Libs und weitere

In [None]:
# Import default libs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import re
import os
import sys
import demjson
import requests
import networkx as nx
import warnings
from pprint import pprint
from urllib.parse import urlparse
from collections import Counter
from pathlib import Path
from lxml.html import fromstring

DeprecationWarnings ausblenden

In [None]:
# Install a global filter that ignores every warning of category DeprecationWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)

#### Weitere Abhängigkeiten installieren

In [None]:
!{sys.executable} -m pip install demoji

#### Weitere Abhängigkeiten importieren

In [None]:
import nltk
import demoji

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

#### Stoppuhr bereitstellen

In [None]:
# Start readings of all stopwatches, keyed by the caller-chosen name
dictGloStopwatches = dict()

# Start timer (for reporting)
def gloStartStopwatch(key):
    """Start (or restart) the stopwatch registered under ``key``.

    Prints a start message and stores the current reading of the
    performance counter in ``dictGloStopwatches``.

    Args:
        key: Hashable name of the stopwatch; it is shown in the message.
    """
    print("[Stopwatch started >>" + str(key) + "<<]")
    # perf_counter() is monotonic, so the measured duration cannot be
    # distorted by system clock adjustments (unlike time.time()).
    dictGloStopwatches[key] = time.perf_counter()

# Stop timer (for reporting)
def gloStopStopwatch(key):
    """Print the time elapsed since ``gloStartStopwatch(key)`` was called.

    The stored start reading is kept, so a later call with the same key
    reports the time since the same start.

    Args:
        key: Name that was passed to ``gloStartStopwatch``.

    Raises:
        KeyError: If no stopwatch was started under ``key``.
    """
    endTime     = time.perf_counter()
    startTime   = dictGloStopwatches[key]
    print("[Stopwatch stopped >>" + str(key) + "<< (" + '{:5.3f}s'.format(endTime-startTime) + ")]")

### Download von Daten

#### NLTK

In [None]:
# Download the NLTK "stopwords" corpus (read later through nltk.corpus.stopwords)
nltk.download("stopwords")

#### Demoji

In [None]:
# Fetch the emoji code data for demoji.
# NOTE(review): newer demoji releases reportedly bundle this data and deprecate
# download_codes() - confirm against the installed version.
demoji.download_codes()

## Umgebung konfigurieren und vorbereiten

### Konfigurationen Umgebung anwenden

#### IO Einstellungen

In [None]:
# Never truncate the column list when a DataFrame is displayed
# (pandas hides columns by default)
pd.options.display.max_columns = None

# Use the "ggplot" look for all matplotlib figures
# TODO: Test different style
plt.style.use("ggplot")

#### Arbeitsverzeichnis definieren

In [None]:
# Base working directory; every other path is derived from it
dir_var = "./"
# Target directory for generated plots
dir_var_output = f"{dir_var}output/"
# Directory that holds the pickled pandas cache files
dir_var_pandas_cache = f"{dir_var}cache/pandas/"

#### Globale Text-Normalisierungsfunktionen definieren

##### Deutsche Umlaute ersetzen

In [None]:
def gloReplaceGermanChars(inputText):
    """Return ``inputText`` with German umlauts and sharp s spelled in ASCII.

    ö/ü/ä become oe/ue/ae, Ö/Ü/Ä become Oe/Ue/Ae and ß becomes ss.
    All other characters are left untouched.
    """
    replacements = (
        ("ö", "oe"),
        ("ü", "ue"),
        ("ä", "ae"),
        ("Ö", "Oe"),
        ("Ü", "Ue"),
        ("Ä", "Ae"),
        ("ß", "ss"),
    )

    for germanChar, asciiSpelling in replacements:
        inputText = inputText.replace(germanChar, asciiSpelling)

    return inputText

##### Sonderzeichen entfernen und Chat Titel normalisieren

Gilt für Nachrichtentexte und Chat-Titel.

In [None]:
# Remove unsafe chars
def gloConvertToSafeString(text):
    """Reduce ``text`` to ASCII letters, digits and whitespace.

    Steps, in order: emojis are replaced by "" via demoji, German special
    characters are transliterated with gloReplaceGermanChars, and every
    remaining character that is not a-z, A-Z, 0-9 or matched by ``\\s`` is
    dropped.
    """
    withoutEmojis = demoji.replace(text, "")
    transliterated = gloReplaceGermanChars(withoutEmojis)
    return re.sub(r'[^a-zA-Z0-9\s]', '', transliterated)

# Generate a safe, shortened chat name
def gloConvertToSafeChatName(chatName):
    """Return the cleaned chat name, cut to at most 30 characters.

    The name is cleaned with gloConvertToSafeString first. Note that the
    result is not guaranteed to be unique: names that agree in their first
    30 cleaned characters map to the same value.
    """
    return gloConvertToSafeString(chatName)[:30]

##### Bereitstellen von Stop Words Datenbanken

In [None]:
def gloGetStopWordsList(filterList):
    """Build the combined stop word list.

    The result concatenates, in this order:

    1. the caller-supplied words from ``filterList``,
    2. the NLTK German stop words,
    3. the NLTK English stop words,
    4. the non-empty lines of ``additionalStopwords.txt`` in ``dir_var``.

    German special characters are transliterated with
    gloReplaceGermanChars in every source except the English list.
    Duplicates are not removed.

    Args:
        filterList: Iterable of additional stop words (strings).

    Returns:
        A list of stop word strings.

    Raises:
        OSError: If ``additionalStopwords.txt`` cannot be opened.
    """
    germanWords = nltk.corpus.stopwords.words('german')
    englishWords = nltk.corpus.stopwords.words('english')

    # Explicit encoding: the file may contain umlauts, and the platform
    # default encoding is not UTF-8 everywhere.
    with open(dir_var + "additionalStopwords.txt", encoding="utf-8") as file:
        strippedLines = (line.strip() for line in file)
        additionalWords = [line for line in strippedLines if line != ""]

    stopWordsList = [gloReplaceGermanChars(word) for word in filterList]
    stopWordsList.extend(gloReplaceGermanChars(word) for word in germanWords)
    # English words are taken over unchanged
    stopWordsList.extend(englishWords)
    stopWordsList.extend(gloReplaceGermanChars(word) for word in additionalWords)

    return stopWordsList

## Klassifizierung nach Autoren

### Daten laden und aufbereiten

#### Daten laden von Cache

In [None]:
# Read the message DataFrame from the pickle cache and report how long the read took.
# NOTE(review): unpickling can execute arbitrary code - only load cache files from a trusted source.
gloStartStopwatch("Cache einlesen")
dfAllDataMessages = pd.read_pickle(dir_var_pandas_cache + C_USE_CACHE_FILE)
gloStopStopwatch("Cache einlesen")

##### Filtern und anzeigen

In [None]:
# Keep only rows whose sender name contains neither "QUER" nor "Quer"
dfAllDataMessages = dfAllDataMessages.loc[dfAllDataMessages["from"].str.contains("QUER") == False]
dfAllDataMessages = dfAllDataMessages.loc[dfAllDataMessages["from"].str.contains("Quer") == False]

# Keep valid, non-empty texts longer than five characters from public channels
dfAllDataMessages = dfAllDataMessages.loc[dfAllDataMessages["procEvalIsValidText"] == True]
dfAllDataMessages = dfAllDataMessages.loc[dfAllDataMessages["procTDCleanText"] != ""]
dfAllDataMessages = dfAllDataMessages.loc[dfAllDataMessages["procTDTextLength"] > 5]
dfAllDataMessages = dfAllDataMessages.loc[dfAllDataMessages["procChatType"] == "public_channel"]

# Replace each sender name by its safe, shortened form
dfAllDataMessages["from"] = dfAllDataMessages["from"].apply(gloConvertToSafeChatName)

In [None]:
# Preview the first five rows of the filtered message table
dfAllDataMessages.head(5)

In [None]:
# Number of remaining messages per normalised sender name
dfAllDataMessages["from"].value_counts()

In [None]:
# List all columns of the message table
dfAllDataMessages.columns

#### Features

##### Welche Features sind wichtig?

In [None]:
# Work on an independent copy that contains only the columns listed below
targetDf = dfAllDataMessages.loc[:, [
    "date",
    "from",
    "procEvalIsValidText",
    "procIsJsonFormatted",
    "procTDCleanText",
    "procTDSafeText",
    "procTDSafeLowercaseText",
    "procTDTextLength",
    "procEvalContainsUrl",
    "procTDURLs",
    "procEvalContainsHashtag",
    "procTDHashtags",
    "procEvalContainsBoldItem",
    "procTDBolds",
    "procEvalContainsItalicItem",
    "procTDItalics",
    "procEvalContainsUnderlineItem",
    "procTDUnderlines",
    "procEvalContainsEmailItem",
    "procTDEmails",
    "procEvalContainsEmojiItem",
    "procTDEmojis",
    "procTDEmojisDesc",
    "procEvalContainsPhoto",
    "procEvalContainsFile",
    "procEvalIsEdited",
    "procEvalIsForwarded",
    "procPipeline-ner-xlm-roberta",
    "procPipeline-ner-bert",
    "procPipeline-sen-bert",
    "procPipeline-sentiment",
]].copy()

##### Classifier Attribute

In [None]:
# Class label: the sender name
targetDf["clFrom"] = targetDf["from"]
# Integer code per distinct sender name
targetDf["clFromId"] = pd.factorize(targetDf["from"])[0]
# Classifier input: the cleaned message text
targetDf["clText"] = targetDf["procTDCleanText"]

In [None]:
# Bar chart of the message count per sender
targetDf['clFrom'].value_counts().plot.bar()

In [None]:
# Message count per integer sender id
targetDf['clFromId'].value_counts()

In [None]:
# First five texts of the column that is later passed to train_test_split as input
targetDf['clText'][:5]

##### Dict From Id

In [None]:
# One row per sender (name and integer id), ordered by id
dfFromId = (
    targetDf[["clFrom", "clFromId"]]
    .drop_duplicates()
    .sort_values("clFromId")
)

# Lookup tables in both directions: name -> id and id -> name
dictFrom_to_id = {fromName: fromId for fromName, fromId in dfFromId.values}
dictId_to_from = {fromId: fromName for fromName, fromId in dfFromId.values}

In [None]:
# Display the id -> sender name mapping
dictId_to_from

##### CountVectorizer und Tf-idf

Die Umsetzung folgt weiter unten im gleichnamigen Abschnitt nach dem Test-Train-Split.

### Trainieren

#### Test Train Split

In [None]:
# Hold out 20 % of the messages for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    targetDf["clText"],
    targetDf["clFrom"],
    test_size=0.20,
    random_state=42,
)

print(f"Train size:\t{len(X_train.index)}")
print(f"Test size:\t{len(X_test.index)}")

#### CountVectorizer und Tf-idf

In [None]:
gloStartStopwatch("Transform messages")

# Token counts: the vocabulary is learned from the training texts only
# and then reused unchanged for the test texts
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

# Tf-idf weighting: fitted on the training counts, applied to both sets
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

gloStopStopwatch("Transform messages")

### Evaluieren

In [None]:
def trainAndEvalModel(model, outputFilename,
                      searchStrings=("Folge Attila Hildmann", "Liebe Eva", "Premium Kanal")):
    """Fit ``model`` on the tf-idf training data and report its quality.

    The function reads the module-level objects ``X_train_tfidf``,
    ``y_train``, ``X_test_tfidf``, ``y_test``, ``count_vect`` and
    ``tfidf_transformer``. It prints the predicted sender for each sample
    text, prints train and test accuracy, and shows the confusion matrix
    of the test set as a heatmap.

    Args:
        model: Unfitted classifier object offering ``fit`` and ``predict``.
        outputFilename: File name inside ``dir_var_output`` for the
            heatmap; an empty value skips saving.
        searchStrings: Sample texts whose predicted sender is printed.
    """
    # Build the key once so start and stop are guaranteed to use the same one
    stopwatchKey = "- Train now model " + str(model)
    gloStartStopwatch(stopwatchKey)
    model.fit(X_train_tfidf, y_train)
    gloStopStopwatch(stopwatchKey)

    for sS in searchStrings:

        sS = str(sS)
        print()
        print("Who has written '" + sS + "'?")
        # Sample texts must pass through the same fitted transformers as the training data
        t = tfidf_transformer.transform(count_vect.transform([sS]))
        r = model.predict(t)
        print(str(r))

    y_pred_train        = model.predict(X_train_tfidf)
    y_pred_test         = model.predict(X_test_tfidf)

    print()
    print("Train Score:\t"  + str(accuracy_score(y_true=y_train, y_pred=y_pred_train)))
    print("Test Score:\t"   + str(accuracy_score(y_true=y_test, y_pred=y_pred_test)))

    print()
    print("Confusion Matrix on test:")

    # confusion_matrix orders its rows and columns by sorted label value.
    # Pass that order explicitly and reuse it for the tick labels, so every
    # axis label really belongs to its row/column.
    labels   = sorted(set(y_test) | set(y_pred_test))
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred_test, labels=labels)
    fig, ax  = plt.subplots(figsize=(10, 10))

    sns.heatmap(conf_mat, annot=True, fmt='d',
                xticklabels=labels, yticklabels=labels, ax=ax)

    ax.set_ylabel('Is')
    ax.set_xlabel('Predicted')

    if outputFilename:
        # Create the output directory on first use instead of failing in savefig
        os.makedirs(dir_var_output, exist_ok=True)
        fig.savefig(dir_var_output + outputFilename)

    plt.show()

#### Evaluation LinearSVC

In [None]:
# Train and evaluate a LinearSVC with default parameters; the plot is saved as class-linearsvc.svg
trainAndEvalModel(LinearSVC(), "class-linearsvc.svg")

#### Evaluation MultinomialNB

In [None]:
# Train and evaluate a MultinomialNB with default parameters; the plot is saved as class-multinomialnb.svg
trainAndEvalModel(MultinomialNB(), "class-multinomialnb.svg")

#### Evaluation LogisticRegression

In [None]:
# Train and evaluate a LogisticRegression with default parameters; the plot is saved as class-logisticregression.svg
trainAndEvalModel(LogisticRegression(), "class-logisticregression.svg")

#### Evaluation MLPClassifier

In [None]:
# Train and evaluate an MLPClassifier with default parameters; the plot is saved as class-mlp.svg
trainAndEvalModel(MLPClassifier(), "class-mlp.svg")

#### Evaluation DecisionTreeClassifier

In [None]:
# Train and evaluate a DecisionTreeClassifier with default parameters; the plot is saved as class-decisiontree.svg
trainAndEvalModel(DecisionTreeClassifier(), "class-decisiontree.svg")

#### Evaluation RandomForestClassifier

In [None]:
# Train and evaluate a RandomForestClassifier with default parameters; the plot is saved as class-randomforest.svg
trainAndEvalModel(RandomForestClassifier(), "class-randomforest.svg")

#### Evaluation DummyClassifier

In [None]:
# Train and evaluate a DummyClassifier with default parameters as a reference point
# for the other models; the plot is saved as class-dummy.svg
trainAndEvalModel(DummyClassifier(), "class-dummy.svg")

### Ausblick...

- Thank you https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f