# Telegram Analysis

In [1]:
# Tools
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time

# Urls
from urllib.parse import urlparse
from collections import Counter

#Use notebook for interactive plots
#%matplotlib notebook
%matplotlib inline

In [2]:
# JSON Lib
! pip install demjson
import demjson

^C


In [3]:
# Natural Language Toolkit
! pip install nltk
import nltk
nltk.download("stopwords")



[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# WordCloud
! pip install wordcloud
from wordcloud import WordCloud



In [5]:
# Show all columns
pd.set_option('display.max_columns', None)

In [6]:
# Set vars
dir_var = "./"
! ls -al ./

total 56
drwxr-xr-x 6 jovyan users   192 Dec 20 18:21 .
drwxr-xr-x 8 jovyan users   256 Dec 20 10:43 ..
drwxr-xr-x 5 jovyan users   160 Dec 20 10:22 data
-rw-r--r-- 1 jovyan users  5342 Dec 20 10:42 inputFiles.csv
drwxr-xr-x 3 jovyan users    96 Dec 20 10:30 .ipynb_checkpoints
-rwxr-xr-x 1 jovyan users 48199 Dec 20 18:21 Telegram.ipynb


## Process jobs

In [7]:
dfInputFiles = pd.read_csv(dir_var + "inputFiles.csv", sep=";")

### !!!!!!!!!!!!!!!! Quick Filter !!!!!!!!!!!!!!!!!!!!

In [None]:
# Take only samples
#dfFilter = pd.DataFrame()

#dfFilter = dfFilter.append(dfInputFiles[dfInputFiles.inputName.str.contains("Xavier")])
#dfFilter = dfFilter.append(dfInputFiles[dfInputFiles.inputName.str.contains("Janich")])
#dfFilter = dfFilter.append(dfInputFiles[dfInputFiles.inputName.str.contains("Eva")])
#dfFilter = dfFilter.append(dfInputFiles[dfInputFiles.inputName.str.contains("HILDMANN")])

#dfInputFiles = dfFilter

### Overview jobs

In [9]:
dfInputFiles

Unnamed: 0,inputName,inputPath,inputType,inputId,inputDesc,inputDownloadType
12,Xavier Naidoo (inoffiziell),DS-08-10-2020/ChatExport_2020-09-25-xavier,public_channel,9874390332,,all
37,Regellese und Diskussionsgruppe zum Xavier's M...,DS-22-10-2020/ChatExport_2020-10-13-xavierChat,private_supergroup,9907103286,,all
1,Oliver Janich oeffentlich,DS-08-10-2020/ChatExport_2020-09-25-janich,public_channel,9808932799,,all
11,Eva Herman Offiziell,DS-08-10-2020/ChatExport_2020-09-27-evaherman,public_channel,9915108907,,all
2,ATTILA HILDMANN OFFICIAL,DS-08-10-2020/ChatExport_2020-09-25-hildmann,public_channel,10034163583,,all


### Transform jobs

In [10]:
def convertToDataFrameMeta(filePath):
    dF = pd.read_json(dir_var + "data/" + filePath + "/result.json", encoding='utf-8')
    return dF

In [11]:
def convertToDataFrameMessages(filePath, dictMeta):
    dF = pd.json_normalize(dictMeta[filePath].messages)
    return dF

In [12]:
def checkIsFormattedText(text):
    textString = str(text)
    if(textString.startswith("[") == True and textString.endswith("]") == True):
        return True
    else:
        return False

In [13]:
# See below
def getExtractedParam(param, processedTextData):
    a,b,c,d,e,f,g = processedTextData
    switcher = {
        0: a,
        1: b,
        2: c,
        3: d,
        4: e,
        5: f,
        6: g
    }
    return switcher.get(param)

In [14]:
# TODO: What href in normal text?

# Return types (see above)

# a = processedText
# b = Items 'processedURLs'
# c = Items 'processedHashtags'
# d = Items 'processedBolds'
# e = Items 'processedItalics'
# f = Items 'processedUnderlines'
# g = Items 'processedEmails'

def extractTextData(processedIsFormattedText, text):
    
    # 3 returns!
    
    processedURLs       = list()
    processedHashtags   = list()
    processedBolds      = list()
    processedItalics    = list()
    processedUnderlines = list()
    processedEmails     = list()
    
    if(processedIsFormattedText != True):
        #Is no JSON
        return (text, processedURLs, processedHashtags, processedBolds, processedItalics, processedUnderlines, processedEmails)
    else:
        #Is json try to parse
        try:
            jsonList = demjson.decode(str(text), encoding='utf8')

            returnList = []

            # Do for each item in list
            for lItem in jsonList:

                messageString = str(lItem)

                isFormatted = messageString.startswith("{") and messageString.endswith("}")

                if(isFormatted):
                    # Is Json Sub String
                    subJsonString = demjson.decode(str(messageString), encoding='utf8')
                    subJsonType = subJsonString["type"]

                    if(subJsonType == "bold"):
                        #text included
                        processedBolds.append(subJsonString["text"])
                        returnList.append(subJsonString["text"])
                        
                    elif(subJsonType == "italic"):
                        #text included
                        processedItalics.append(subJsonString["text"])
                        returnList.append(subJsonString["text"])
                        
                    elif(subJsonType == "underline"):
                        #text included
                        processedUnderlines.append(subJsonString["text"])
                        returnList.append(subJsonString["text"])
                    
                    elif(subJsonType == "email"):
                        #text included
                        processedEmails.append(subJsonString["text"])
                        
                    elif(subJsonType == "text_link"):
                        #text and href included
                        processedURLs.append(subJsonString["href"])
                        returnList.append(subJsonString["text"])
                        
                    elif(subJsonType == "link"):
                        #text included
                        processedURLs.append(subJsonString["text"])
                        
                    elif(subJsonType == "hashtag"):
                        #text included
                        processedHashtags.append(subJsonString["text"])
                        returnList.append(subJsonString["text"])
                        
                    elif(subJsonType == "mention"):
                        #text included
                        returnList.append(subJsonString["text"])
                        
                    elif(subJsonType == "mention_name"):
                        #text and user_id included
                        returnList.append(subJsonString["text"])
                        
                    elif(subJsonType == "bot_command"):
                        #text included
                        returnList = returnList 
                        
                    elif(subJsonType == "code"):
                        #text included
                        returnList = returnList
                        
                    elif(subJsonType == "phone"):
                        #text included
                        returnList = returnList
                        
                    elif(subJsonType == "strikethrough"):
                        #text included
                        returnList.append(subJsonString["text"])
                        
                    elif(subJsonType == "pre"):
                        #text and language included
                        returnList.append(subJsonString["text"])
                        
                    elif(subJsonType == "bank_card"):
                        #text included
                        returnList = returnList
                        
                    else:
                        print("- Error: Unknown type " + subJsonType)
                        returnList = returnList

                else:
                    # Is Normal Sub String
                    returnList.append(messageString)

            return (''.join(returnList), processedURLs, processedHashtags, processedBolds, processedItalics, processedUnderlines, processedEmails)
        
        except:
            #Parser error
            print("- Warn: Json parser error (set return text to inputText) >>" + text + "<<")
            return (text, processedURLs, processedHashtags, processedBolds, processedItalics, processedUnderlines, processedEmails)

In [15]:
# Timer Start
timeStartGlobal = time.time()

# Add Key = filePath / Value = DataFrame (Metadata)
dictMeta = {}
for fP in dfInputFiles.inputPath:
    
    dictMeta[fP] = convertToDataFrameMeta(fP)

# Add Key = filePath / Value = DataFrame (Messages)
dictMessages = {}
for fP in dfInputFiles.inputPath:

    timeStartSingle = time.time()
    print("[Process now " + fP + "]")
    dfMessages = convertToDataFrameMessages(fP, dictMeta)
    
    dfMessages["processedChannelFilePath"]  = fP
    dfMessages["processedChannelType"]      = dictMeta[fP].type.iloc[0]
    dfMessages["processedRawTextSize"]      = dfMessages["text"].str.len()
    dfMessages["processedIsFormattedText"]  = dfMessages["text"].apply(checkIsFormattedText)
    
    dfMessages["extractedTextData"]      = dfMessages.apply(lambda x: extractTextData(x.processedIsFormattedText, x.text), axis=1)
    dfMessages["processedValidText"]     = dfMessages.apply(lambda x: getExtractedParam(0, x.extractedTextData), axis=1)
    dfMessages["processedValidTextSize"] = dfMessages["processedValidText"].str.len()
    
    dfMessages["processedTextDataURLs"]       = dfMessages.apply(lambda x: getExtractedParam(1, x.extractedTextData), axis=1)
    dfMessages["processedTextDataHashtags"]   = dfMessages.apply(lambda x: getExtractedParam(2, x.extractedTextData), axis=1)
    dfMessages["processedTextDataBolds"]      = dfMessages.apply(lambda x: getExtractedParam(3, x.extractedTextData), axis=1)
    dfMessages["processedTextDataItalics"]    = dfMessages.apply(lambda x: getExtractedParam(4, x.extractedTextData), axis=1)
    dfMessages["processedTextDataUnderlines"] = dfMessages.apply(lambda x: getExtractedParam(5, x.extractedTextData), axis=1)
    dfMessages["processedTextDataEmails"]     = dfMessages.apply(lambda x: getExtractedParam(6, x.extractedTextData), axis=1)
    
    dictMessages[fP] = dfMessages
    timeEndSingle = time.time()
    print('{:5.3f}s'.format(timeEndSingle-timeStartSingle))

# All Messages to DataFrame
dfAllDataMessages = pd.DataFrame()
for fP in dfInputFiles.inputPath:
    
    #print("[Append now " + fP + "]")
    dfMessages        = dictMessages[fP].copy()
    dfAllDataMessages = dfAllDataMessages.append(dfMessages)

# Print Time
timeEndGlobal = time.time()
print()
print("[Finished global]")
print('{:5.3f}s'.format(timeEndGlobal-timeStartGlobal))

[Process now DS-08-10-2020/ChatExport_2020-09-25-xavier]
16.378s
[Process now DS-22-10-2020/ChatExport_2020-10-13-xavierChat]
22.595s
[Process now DS-08-10-2020/ChatExport_2020-09-25-janich]


TypeError: can only concatenate str (not "list") to str

### Debug

In [None]:
#dfMessages = dfAllDataMessages.copy()
#dfMessages = dfMessages[dfMessages.astype(str)["processedTextDataEmails"] != "[]"]
#t = dfMessages.sort_values(by="processedValidTextSize", ascending=False).iloc[3]
#print(">>" + str(t.text) + "<<")
#print()
#print(">>" + str(t.processedTextDataBolds) + "<<")
#print(">>" + str(t.processedTextDataURLs) + "<<")
#print(">>" + str(t.processedTextDataHashtags) + "<<")
#print(">>" + str(t.processedTextDataItalics) + "<<")
#print(">>" + str(t.processedTextDataUnderlines) + "<<")
#print(">>" + str(t.processedTextDataEmails) + "<<")

### Type of channels

In [None]:
dfInputFiles.inputType.value_counts()

### Only in different types of channels

In [None]:
dfAllDataMessages.columns.difference(
    dfAllDataMessages[dfAllDataMessages.processedChannelType == "public_channel"].dropna(how='all', axis=1).columns
)

In [None]:
dfAllDataMessages.columns.difference(
    dfAllDataMessages[dfAllDataMessages.processedChannelType == "public_supergroup"].dropna(how='all', axis=1).columns
)

In [None]:
# Vorsicht: Wenige Daten
dfAllDataMessages.columns.difference(
    dfAllDataMessages[dfAllDataMessages.processedChannelType == "private_supergroup"].dropna(how='all', axis=1).columns
)

## Queries

In [None]:
def queryChannelId(filePath):
    dfMeta = dictMeta[filePath].copy()
    return str(dfMeta["id"].iloc[0])

In [None]:
def queryChannelName(filePath):
    dfMeta = dictMeta[filePath].copy()
    inputStr  = str(dfMeta["name"].iloc[0])
    outputStr = inputStr.encode('ascii', 'ignore')
    outputStr = outputStr.decode('ascii')
    return outputStr[:25]

In [None]:
def queryChannelType(filePath):
    dfMeta = dictMeta[filePath].copy()
    return str(dfMeta["type"].iloc[0])

In [None]:
def queryChannelCountEntries(filePath):
    dfMessages = dictMessages[filePath].copy()
    return len(dfMessages.index)

In [None]:
def queryChannelCountRawText(filePath):
    dfMessages = dictMessages[filePath].copy()
    dfMessages = dfMessages[dfMessages.processedRawTextSize > 0]
    return len(dfMessages.index)

In [None]:
def queryChannelCountIsFormattedText(filePath):
    dfMessages = dictMessages[filePath].copy()
    dfMessages = dfMessages[dfMessages.processedIsFormattedText == True]
    return len(dfMessages.index)

In [None]:
def queryChannelCountValidText(filePath):
    dfMessages = dictMessages[filePath].copy()
    dfMessages = dfMessages[dfMessages.processedValidTextSize > 0]
    return len(dfMessages.index)

In [None]:
# TODO mit process
def queryChannelCountPhoto(filePath):
    dfMessages = dictMessages[filePath].copy()
    if "photo" not in dfMessages.columns:
        return 0
    else:
        dfMessages = dfMessages.photo.dropna()
        return len(dfMessages.index)

In [None]:
# TODO mit process
def queryChannelCountFile(filePath):
    dfMessages = dictMessages[filePath].copy()
    if "file" not in dfMessages.columns:
        return 0
    else:
        dfMessages = dfMessages.file.dropna()
        return len(dfMessages.index)

In [None]:
# TODO mit process
def queryChannelCountEdited(filePath):
    dfMessages = dictMessages[filePath].copy()
    if "edited" not in dfMessages.columns:
        return 0
    else:
        dfMessages = dfMessages.edited.dropna()
        return len(dfMessages.index)

In [None]:
def queryCalcPercent(countFiltered, countTotal):
    return (countFiltered / countTotal) * 100

In [None]:
# Refactor

dfQueryMeta = pd.DataFrame(dfInputFiles.inputPath)

dfQueryMeta["queryChannelId"]                     = dfQueryMeta.inputPath.apply(queryChannelId)
dfQueryMeta["queryChannelName"]                   = dfQueryMeta.inputPath.apply(queryChannelName)
dfQueryMeta["queryChannelType"]                   = dfQueryMeta.inputPath.apply(queryChannelType)
dfQueryMeta["queryChannelCountEntries"]           = dfQueryMeta.inputPath.apply(queryChannelCountEntries)

dfQueryMeta["queryChannelCountRawText"]           = dfQueryMeta.inputPath.apply(queryChannelCountRawText)
dfQueryMeta["queryChannelCountIsFormattedText"]   = dfQueryMeta.inputPath.apply(queryChannelCountIsFormattedText)
dfQueryMeta["queryChannelCountValidText"]         = dfQueryMeta.inputPath.apply(queryChannelCountValidText)
dfQueryMeta["queryChannelCountPhoto"]             = dfQueryMeta.inputPath.apply(queryChannelCountPhoto)
dfQueryMeta["queryChannelCountFile"]              = dfQueryMeta.inputPath.apply(queryChannelCountFile)
dfQueryMeta["queryChannelCountEdited"]            = dfQueryMeta.inputPath.apply(queryChannelCountEdited)

dfQueryMeta["queryChannelPercentRawText"]         = queryCalcPercent(dfQueryMeta["queryChannelCountRawText"], dfQueryMeta["queryChannelCountEntries"])
dfQueryMeta["queryChannelPercentIsFormattedText"] = queryCalcPercent(dfQueryMeta["queryChannelCountIsFormattedText"], dfQueryMeta["queryChannelCountEntries"])
dfQueryMeta["queryChannelPercentValidText"]       = queryCalcPercent(dfQueryMeta["queryChannelCountValidText"], dfQueryMeta["queryChannelCountEntries"])
dfQueryMeta["queryChannelPercentPhoto"]           = queryCalcPercent(dfQueryMeta["queryChannelCountPhoto"], dfQueryMeta["queryChannelCountEntries"])
dfQueryMeta["queryChannelPercentFile"]            = queryCalcPercent(dfQueryMeta["queryChannelCountFile"], dfQueryMeta["queryChannelCountEntries"])
dfQueryMeta["queryChannelPercentEdited"]          = queryCalcPercent(dfQueryMeta["queryChannelCountEdited"], dfQueryMeta["queryChannelCountEntries"])

dfQueryMeta.sort_values(by="queryChannelCountEntries", ascending=False)

### Plotter

In [None]:
def queryPlotter(attributeName):
    dfFilter = dfQueryMeta.copy()
    sns.catplot(
        y="queryChannelName",
        x=attributeName,
        data=dfFilter,
        hue="queryChannelType",
        #aspect=1.2,
        #palette="rocket",
        kind="bar",
        height=7,
        order=dfFilter.sort_values(attributeName, ascending=False).queryChannelName
    )

In [None]:
queryPlotter("queryChannelCountEntries")

In [None]:
#queryPlotter("queryChannelPercentRawText")

In [None]:
queryPlotter("queryChannelPercentIsFormattedText")

In [None]:
#queryPlotter("queryChannelPercentValidText") 

In [None]:
#queryPlotter("queryChannelPercentPhoto")

In [None]:
#queryPlotter("queryChannelPercentFile")

In [None]:
queryPlotter("queryChannelPercentEdited")

## Get valid text

In [None]:
def normalizedValidTextSize(df):
    df = df.copy()
    # https://stackoverflow.com/questions/23199796/detect-and-exclude-outliers-in-pandas-data-frame
    # keep only the ones that are within +3 to -3 standard deviations in the column 'Data'.
    return df[np.abs(df.processedValidTextSize-df.processedValidTextSize.mean()) <= (3*df.processedValidTextSize.std())]

In [None]:
dfMessages = dfAllDataMessages.copy()
print("Before all filters\t\t" + str(len(dfMessages.index)))

dfMessages = dfMessages[dfMessages.processedValidTextSize > 10]
print("Before normalizedValidTextSize\t" + str(len(dfMessages.index)))

dfMessages = normalizedValidTextSize(dfMessages)
print("After normalizedValidTextSize\t" + str(len(dfMessages.index)))

_ = dfMessages.processedValidTextSize.hist()

## Extract Hashtags

In [None]:
# TODO: Extract hashtags in non formatted text

def extractImportantHashtags(filePath):
    dfMessages = dictMessages[filePath].copy()
    dfMessages = dfMessages[dfMessages.astype(str)["processedTextDataHashtags"] != "[]"]

    hashTagList = list()
    for index, row in dfMessages.iterrows():
        for hashtagItem in row["processedTextDataHashtags"]:
            hashTagList.append(hashtagItem)

    return Counter(hashTagList).most_common(20)

In [None]:
extractImportantHashtags("DS-08-10-2020/ChatExport_2020-09-25-janich")

In [None]:
extractImportantHashtags("DS-08-10-2020/ChatExport_2020-09-27-evaherman")

In [None]:
extractImportantHashtags("DS-08-10-2020/ChatExport_2020-09-25-hildmann")

In [None]:
extractImportantHashtags("DS-08-10-2020/ChatExport_2020-09-25-xavier")

## Extract Urls

In [None]:
# TODO: No Hostname if string startsWith ! "http"
# TODO: Url in non formatted text
# TODO: Add mention and other attributes
# TODO: mention in non formatted text? and other attributes

# TODO: Check if both set (from and from_id, actor, ...)
# TODO: Add cache attributes ^^ from_id -> from (map with validator)

# TODO: Duplicates in refs from text

# Text ref important for finding groups
# forwared_from important for graphs

# Return  Counter forwarded_from
def extractImportantUrls(filePath):
    dfMessages = dictMessages[filePath].copy()

    hostList = list()
    urList   = list()
    refList = list()
    for index, row in dfMessages.iterrows():
        if(str(row["processedTextDataURLs"]) != "[]"):
            for urlItem in row["processedTextDataURLs"]:
                urlData = urlparse(str(urlItem))

                completeHostname = urlData.hostname
                completeUrl      = urlData.geturl()

                hostList.append(str(completeHostname))
                urList.append(str(completeUrl))

                if "t.me" in str(completeHostname):
                    refList.append(str(completeUrl))
            
    forwardedFromList = list()
    if("forwarded_from" in dfMessages.columns):
        for index, row in dfMessages.iterrows():        
            forwardedFromList.append(str(row["forwarded_from"]))
            
    actorList = list()
    if("actor" in dfMessages.columns):
        for index, row in dfMessages.iterrows():
            actorList.append(str(row["actor"]))
    
    memberList = list()
    if("members" in dfMessages.columns):
        for index, row in dfMessages.iterrows():
            if(str(row["members"]) != "nan"):
                for memberItem in row["members"]:
                    memberList.append(str(memberItem))
                    
    fromList = list()
    if("from" in dfMessages.columns):
        for index, row in dfMessages.iterrows():
            fromList.append(str(row["from"]))
            
    savedFromList = list()
    if("saved_from" in dfMessages.columns):
        for index, row in dfMessages.iterrows():
            savedFromList.append(str(row["saved_from"]))
        
    print("########################################")
    print("###### Top 20 Hosts ####################")
    print("########################################")
    print ("\n".join(map(str, Counter(hostList).most_common(20))))
    print()

    print("########################################")
    print("###### Top 20 URLs #####################")
    print("########################################")
    print ("\n".join(map(str, Counter(urList).most_common(20))))
    print()

    print("########################################")
    print("###### Top 20 Refs from text ###########")
    print("########################################")
    print ("\n".join(map(str, Counter(refList).most_common(20))))
    print()

    print("########################################")
    print("###### Top 20 Refs (forwarded_from) ####")
    print("########################################")
    print ("\n".join(map(str, Counter(forwardedFromList).most_common(20))))
    print()
    
    print("########################################")
    print("###### Top 20 Refs (actor) #############")
    print("########################################")
    print ("\n".join(map(str, Counter(actorList).most_common(20))))
    print()
    
    print("########################################")
    print("###### Top 20 Refs (members) ###########")
    print("########################################")
    print ("\n".join(map(str, Counter(memberList).most_common(20))))
    print()
    
    print("########################################")
    print("###### Top 20 Refs (from) ##############")
    print("########################################")
    print ("\n".join(map(str, Counter(fromList).most_common(20))))
    print()
    
    print("########################################")
    print("###### Top 20 Refs (saved_from) ########")
    print("########################################")
    print ("\n".join(map(str, Counter(savedFromList).most_common(20))))
    print()
    
    return Counter(forwardedFromList)

In [None]:
_ = extractImportantUrls("DS-08-10-2020/ChatExport_2020-09-25-janich")

In [None]:
_ = extractImportantUrls("DS-08-10-2020/ChatExport_2020-09-27-evaherman")

In [None]:
counterSampleChat = extractImportantUrls("DS-08-10-2020/ChatExport_2020-09-25-hildmann")

In [None]:
_ = extractImportantUrls("DS-08-10-2020/ChatExport_2020-09-25-xavier")

In [None]:
# Just for test purposes
_ = extractImportantUrls("DS-22-10-2020/ChatExport_2020-10-13-xavierChat")

## Word Clouds

In [None]:
#TODO Difference capital letters?
#TODO Context?

# Use "global" for all DataFrames
def plotChannelWordCloud(filePath, label, filterList):
    timeStart = time.time()
    
    if(filePath != "global"):
        dfMessages = dictMessages[filePath].copy()
    else:
        dfMessages = dfAllDataMessages.copy()
    
    print("[Start transform text to global text string]")
    textList = []
    for index, row in dfMessages.iterrows():
        textList.append(" " + row["processedValidText"])
        
    textString = ''.join(textList)
    
    germanStopWordsList = nltk.corpus.stopwords.words('german')
    germanStopWordsList.append("http")
    germanStopWordsList.append("https")
    germanStopWordsList.append("ja") #TODO: split to local - maybe?
    germanStopWordsList.append("nein")
    germanStopWordsList.append("mehr")
    germanStopWordsList.append("mal")
    germanStopWordsList.append("schon")
    germanStopWordsList.append("immer")
    germanStopWordsList.append("wurde")
    germanStopWordsList.append("wurden")
    germanStopWordsList.append("sei")
    germanStopWordsList.append("sein")
    germanStopWordsList.append("viel")
    germanStopWordsList.append("viele")
    germanStopWordsList.append("wegen")
    germanStopWordsList.append("müssen")
    germanStopWordsList.append("geht")
    germanStopWordsList.append("gibt")
    germanStopWordsList.append("wer")
    germanStopWordsList.append("wie")
    germanStopWordsList.append("was")
    germanStopWordsList.append("macht")
    germanStopWordsList.append("machen")
    germanStopWordsList.append("machte")
    germanStopWordsList.append("kommen")
    germanStopWordsList.append("kommt")
    germanStopWordsList.append("glaube")
    germanStopWordsList.append("glaubst")
    germanStopWordsList.append("tun")
    germanStopWordsList.append("wäre")
    germanStopWordsList.append("sagte")
    germanStopWordsList.append("sagten")
    germanStopWordsList.append("hat")
    germanStopWordsList.append("hast")
    germanStopWordsList.append("haben")
    germanStopWordsList.append("habt")
    germanStopWordsList.append("statt")
    germanStopWordsList.append("genau")
    germanStopWordsList.append("sagen")
    germanStopWordsList.append("sagte")
    germanStopWordsList.append("sagten")
    germanStopWordsList.append("bitte")
    germanStopWordsList.append("bitten")
    germanStopWordsList.append("danke")
    germanStopWordsList.append("dank")
    germanStopWordsList.append("sollen")
    germanStopWordsList.append("soll")
    germanStopWordsList.append("sollte")
    germanStopWordsList.append("sehen")
    germanStopWordsList.append("seht")
    germanStopWordsList.append("zeigen")
    germanStopWordsList.append("zeigt")
    germanStopWordsList.append("sei")
    germanStopWordsList.append("sein")
    germanStopWordsList.append("seid")
    germanStopWordsList.append("seit")
    germanStopWordsList.append("laut")
    germanStopWordsList.append("lauten")
    germanStopWordsList.append("sehen")
    germanStopWordsList.append("seht")
    germanStopWordsList.append("haben")
    germanStopWordsList.append("hat")
    germanStopWordsList.append("hätten")
    germanStopWordsList.append("sagte")
    germanStopWordsList.append("sag")
    germanStopWordsList.append("sagt")
    germanStopWordsList.append("ab")
    germanStopWordsList.append("bei")
    germanStopWordsList.append("beim")
    germanStopWordsList.append("denen")
    germanStopWordsList.append("gab")
    germanStopWordsList.append("ab")
    
    for fItem in filterList:
        germanStopWordsList.append(fItem)
    
    print("[Start generate wordCloud]")
    wordcloud = WordCloud(
                background_color="black",
                width=1920,
                height=1080,
                stopwords=germanStopWordsList
            ).generate(textString)
    wordcloud.to_file("wordcloud-" + label + ".png")
    
    print("Top 20 occ:\n" + str(pd.Series(wordcloud.words_).head(20)))
    
    print("[Start generate figure]")
    plt.figure(figsize=(14, 14))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.show()
    
    timeEnd = time.time()
    print("[Finished]")
    print('{:5.3f}s'.format(timeEnd-timeStart))

### Public channels

In [None]:
# Oliver Janich öffentlich (public_channel)
plotChannelWordCloud(
    "DS-08-10-2020/ChatExport_2020-09-25-janich",
    "pc-janich",
    []
)

In [None]:
# Eva Herman Offiziell (public_channel)
plotChannelWordCloud(
    "DS-08-10-2020/ChatExport_2020-09-27-evaherman",
    "pc-evaHerman",
    []
)

In [None]:
# ATTILA HILDMANN OFFICIAL (public_channel)
plotChannelWordCloud(
    "DS-08-10-2020/ChatExport_2020-09-25-hildmann",
    "pc-hildmann",
    ["ATTILAHILDMANN CHAT"]
)

In [None]:
# Xavier Naidoo (public_channel)
plotChannelWordCloud(
    "DS-08-10-2020/ChatExport_2020-09-25-xavier",
    "pc-xavier",
    ["xavier_naidoo", "Xavier_Naidoo", "politische_bildersprueche", "einmal_hin_alles_drin"]
)

In [None]:
# Gerechtigkeit für das Vaterland (public_channel)
#plotChannelWordCloud(
#    "DS-08-10-2020/ChatExport_2020-09-26-gerechtigkeitVaterland",
#    "pc-GerechtigkeitfuerdasVaterland",
#    ["gerechtigkeitfuersvaterland"]
#)

In [None]:
# Corona Virus Informationen (public_channel)
#plotChannelWordCloud(
#    "DS-08-10-2020/ChatExport_2020-09-26-cvirusinfo",
#    "pc-cVirusInfo",
#    [])

In [None]:
# Liberté (public_channel)
#plotChannelWordCloud(
#    "DS-08-10-2020/ChatExport_2020-09-26-liberte",
#    "pc-liberte",
#    []
#)

In [None]:
# Just for test purposes
plotChannelWordCloud(
    "global",
    "global",
    []
)

## Concordance

In [None]:
#tbd

## Graphs

In [None]:
import networkx as nx

In [None]:
#G_weighted = nx.DiGraph()

#G_weighted.add_node('A', weight=1500)
#G_weighted.add_node('B', weight=800)
#G_weighted.add_node('C', weight=200)
#G_weighted.add_node('D', weight=500)

#G_weighted.add_edge('A', 'B', weight=8)
#G_weighted.add_edge('A', 'C', weight=2)
#G_weighted.add_edge('A', 'D', weight=5)

In [None]:
G_weighted = nx.DiGraph()

globalSize = 0

for aKey in counterSampleChat:
    
    groupCount = counterSampleChat[aKey]
    
    inputStr  = str(aKey)
    outputStr = inputStr.encode('ascii', 'ignore')
    groupName = outputStr.decode('ascii')
    groupName = groupName[:25]
    
    if(groupCount > 3 and groupCount < 99999999 and groupName != "nan"):
        globalSize = globalSize + groupCount

G_weighted.add_node("target", weight=globalSize)
        
# TODO: Refactor
    
for aKey in counterSampleChat:
    
    groupCount = counterSampleChat[aKey]
    
    inputStr  = str(aKey)
    outputStr = inputStr.encode('ascii', 'ignore')
    groupName = outputStr.decode('ascii')
    groupName = groupName[:25]
    
    if(groupCount > 3 and groupCount < 99999999 and groupName != "nan"):
    
        print("Add " + str(groupCount) + "\t" + str(groupName))
    
        G_weighted.add_node(groupName, weight=groupCount * 5)
        G_weighted.add_edge("target", groupName, weight=groupCount / 100)       

In [None]:
plt.figure(figsize=(12,12))

# TODO https://networkx.org/documentation/stable/reference/drawing.html#module-networkx.drawing.layout
# (try different layouts e.g. circular_layout)

pos = nx.spring_layout(G_weighted)

nx.draw(G_weighted,
        pos,
        #node_color='lightgreen',
        with_labels=True,
        #edge_color=colors, 
        width=list(nx.get_edge_attributes(G_weighted, "weight").values()),
        #edge_labels=edge_labels,
        node_size=list(nx.get_node_attributes(G_weighted,'weight').values()),
        arrowsize=1,
       )

edge_labels = nx.get_edge_attributes(G_weighted, "weight")

_ = nx.draw_networkx_edge_labels(G_weighted, pos, edge_labels=edge_labels)

plt.show()

In [None]:
#from ipywidgets import interact
#import ipywidgets as widgets

In [None]:
#interact(plot_random_graph, n=(2,30), m=(1,10), k=(1,10), p=(0.0, 1.0, 0.001),
#         generator={
#             'lobster': random_lobster,
#             'power law': powerlaw_cluster,
#             'Newman-Watts-Strogatz': newman_watts_strogatz,
#             u'Erdős-Rényi': erdos_renyi,
#         });