# Install necessary packages

In [None]:
import nltk

In [None]:
%%capture
!pip install emoji
nltk.download('stopwords')
nltk.download('punkt')
!python3 -m spacy download de_core_news_md
!pip install textblob-de

**Note:** When first using the notebook, please restart your runtime after installation and run again!

# Import

In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import emoji

# Load Data

In [None]:
# Path of the exported chat text file to analyse; set this before running the cells below
filepath = ''

In [None]:
# Read the complete export into a single string (decoded as UTF-8)
with open(filepath, encoding="utf8") as chat_file:
  raw_chat = chat_file.read()

In [None]:
# You could also read the file line by line. But multiline messages will be a problem
#with open(filepath, encoding="utf8") as f:
    #lines = f.read().splitlines()

# Extract Features

In [None]:
# Messages can span several lines, so turn every line break into a space first;
# the text is split afterwards on the timestamp pattern (dd.mm.yy, hh:mm)
raw_chat = raw_chat.translate(str.maketrans({'\n': ' ', '\r': ' '}))

## Dates

In [None]:
# Regex to extract the datetime stamp from a message (dd.mm.yy, hh:mm)
regexDate = '[0-9]{2}[.][0-9]{2}[.][0-9]{2}[,] [0-9]{2}[:][0-9]{2}'

In [None]:
dates = re.findall(regexDate, raw_chat)

In [None]:
len(dates)

## Content of message

In [None]:
content_of_message = re.split(regexDate, raw_chat)

In [None]:
# First element of text will always be empty because of regex
len(content_of_message)

In [None]:
# re.split returns one more piece than there are timestamp matches: the text in
# front of the first timestamp. Remove that leading piece only while it is still
# present, so that re-running this cell does not throw away a real message.
if len(content_of_message) > len(dates):
  content_of_message.pop(0)

In [None]:
# Strip the first three characters (the special characters ahead of every message)
content_of_message = list(map(lambda piece: piece[3:], content_of_message))

# Create df

In [None]:
df = {'datetime':dates,
      'message_unformatted': content_of_message}

In [None]:
df = pd.DataFrame(df)

In [None]:
df.head()

In [None]:
df.shape

# Split Date and Time

In [None]:
# Date-only pattern (dd.mm.yy).
# NOTE(review): this rebinds regexDate, which previously held the date+time pattern;
# re-running the earlier re.findall / re.split cells after this one would use this pattern instead.
regexDate = '[0-9]{2}[.][0-9]{2}[.][0-9]{2}'

In [None]:
def extractDate(textObject, regex):
  """Return the text matched by ``regex`` at the beginning of ``textObject``.

  Raises AttributeError when the pattern does not match at the start,
  because ``re.match`` then returns None.
  """
  return re.match(regex, textObject).group()

In [None]:
df["date"] = df["datetime"].map(lambda x: extractDate(x, regexDate))

In [None]:
def extractTime(textObject, regex):
  """Split ``textObject`` on ``regex`` and return the piece after the first match.

  Raises IndexError when the pattern does not occur in ``textObject``.
  """
  pieces = re.compile(regex).split(textObject)
  return pieces[1]

In [None]:
df["time"] = df["datetime"].map(lambda x: extractTime(x, f'{regexDate}[,] '))

In [None]:
# Hour is the part of "hh:mm" in front of the colon (kept as a string)
df['hour'] = [re.split('[:]', clock)[0] for clock in df["time"]]

In [None]:
# The strings are dd.mm.yy. Without an explicit format pandas reads ambiguous
# values month-first (03.04.21 would become 4 March instead of 3 April).
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%y')

In [None]:
df.head()

# Extract Name and Message

In [None]:
def splitName_and_text(string_to_split, split_type=None):
  """Split a raw chat entry of the form "<user>:<text>" at its first colon.

  Args:
    string_to_split: Raw entry text.
    split_type: Pass "user" to get the part in front of the first colon;
      any other value (default None) returns the part after it.

  Returns:
    The requested part, or None when the entry contains no ':'. System
    messages like "XY left the chat" contain no colon, so callers can drop
    them by removing the None results.
  """
  # partition reports whether a colon was found, so no exception handling is
  # needed to detect entries without one.
  user, separator, content = string_to_split.partition(':')
  if not separator:
    return None
  if split_type == "user":
    return user
  return content

In [None]:
# Text part of every entry (None for entries without a colon)
df["message"] = df["message_unformatted"].map(splitName_and_text)

In [None]:
df["user"] = df.message_unformatted.map(lambda x: splitName_and_text(x, split_type="user"))

In [None]:
df.tail()

In [None]:
# Removing system messages and useless columns.
# errors="ignore" keeps this cell re-runnable after the columns are already gone.
df = df.dropna().drop(columns=["datetime", "message_unformatted"], errors="ignore")
# The placeholder is matched literally (regex=False). .copy() yields an independent
# frame, so the column assignments in later cells do not write to a filtered view.
df = df[~df.message.str.contains("<Medien ausgeschlossen>", regex=False)].copy()

# Message Length

In [None]:
# Number of whitespace-separated words per message
df["length_message"] = [len(text.split()) for text in df["message"]]

In [None]:
df.head()

In [None]:
#len_by_user = df.groupby("user")["length_message"].mean()

In [None]:
plt.rcParams["figure.figsize"] = (15,5)
# Name the column via x= and pass the frame via data=: newer seaborn releases
# interpret a first positional argument as `data`, not as the x variable.
sns.countplot(x="user", data=df).set_title('# of Messages by User')

In [None]:
sns.barplot(x="length_message", y="user", data=df).set_title('Average words in message by User')

# Emojis

In [None]:
def extract_emojis(s):
  """Return the characters of ``s`` that are listed as emoji, in original order.

  The check is done per character, so the length of the result is the
  number of emoji characters found in ``s``.
  """
  # emoji >= 2.0 removed UNICODE_EMOJI in favour of EMOJI_DATA; prefer the new
  # table and fall back to the old one so both package generations work.
  emoji_table = getattr(emoji, "EMOJI_DATA", None)
  if emoji_table is None:
    emoji_table = emoji.UNICODE_EMOJI['en']
  return ''.join(c for c in s if c in emoji_table)

In [None]:
# Emoji characters contained in each message
df["emoji"] = df["message"].map(extract_emojis)

In [None]:
# Count of extracted emoji characters per message
df["emojis_used"] = df["emoji"].map(len)

In [None]:
df.groupby("user")["emojis_used"].mean()

In [None]:
sns.barplot(x="emojis_used", y="user", data=df).set_title('Emojis / message')

# Time Distribution of chats

In [None]:
no_messages_by_date = df.groupby("date").size()

In [None]:
plt.rcParams["figure.figsize"] = (30,10)
no_messages_by_date.plot()
plt.title("Number of Messages by Day")

In [None]:
no_messages_by_hour = df.groupby("hour").size()

In [None]:
plt.rcParams["figure.figsize"] = (20,5)
no_messages_by_hour.plot()
plt.title("Number of Messages by Hour")

In [None]:
 df.groupby("date").size().sort_values()

# Sentiment Analysis

In [None]:
from nltk.corpus import stopwords
import spacy
import de_core_news_md

In [None]:
nlp = de_core_news_md.load()

## Preprocess Text

In [None]:
# Please change for chat language!
# Kept as a set because it is only used for membership tests, which a set
# answers in constant time instead of scanning a list.
german_stop_words = set(stopwords.words('german'))

In [None]:
def cleanse_text(text: str) -> list:
  """Lemmatise ``text`` and return its cleaned, lower-cased tokens.

  Steps: run the spaCy pipeline ``nlp``, lower-case the lemmas, drop those
  found in ``german_stop_words``, then remove every character that is not a
  German letter.

  Returns:
    A list of non-empty token strings. Tokens that consist only of removed
    characters (punctuation, digits, emoji) are left out, so they do not
    appear as '' in later word counts.
  """
  doc = nlp(text)
  # Join and re-split so lemmas containing whitespace are broken up as well.
  lemmas = ' '.join(token.lemma_ for token in doc).lower().split()
  cleaned = []
  for lemma in lemmas:
    if lemma in german_stop_words:
      continue
    word = re.sub('[^a-zA-ZäöüÄÖÜß]+', '', lemma)
    if word:
      cleaned.append(word)
  return cleaned

In [None]:
#Note: This will take some minutes (depending on chat size)
df["cleansed"] = df.message.map(lambda x: cleanse_text(x))

In [None]:
df.tail()

## Word frequency

In [None]:
from collections import Counter

In [None]:
# Count every cleaned token across all messages
common_words = Counter(word for cleansed_list in df['cleansed'] for word in cleansed_list)
# Passing the column names to the constructor also works when there are no
# words at all; assigning .columns afterwards raises on an empty result.
df_most_common = pd.DataFrame(common_words.most_common(200), columns=['Word', 'Frequency'])

In [None]:
df_most_common.head(10)

### Sentiment calculation

In [None]:
#from textblob import TextBlob (For english text)
from textblob_de import TextBlobDE

In [None]:
def sentiment_analysis(message):
    """Return the polarity value TextBlobDE reports for ``message``."""
    return TextBlobDE(message).sentiment.polarity

In [None]:
# Join the tokens with spaces before scoring: str() on the list would hand the
# analyser the Python list repr, including brackets, quotes and commas.
df["sentiment_score"] = df["cleansed"].apply(lambda tokens: sentiment_analysis(' '.join(tokens)))

In [None]:
# Label each message by the sign of its sentiment score
sentiment_masks = (
    ("positive", df.sentiment_score > 0),
    ("neutral", df.sentiment_score == 0),
    ("negative", df.sentiment_score < 0),
)
for sentiment_label, score_mask in sentiment_masks:
    df.loc[score_mask, 'sentiment'] = sentiment_label

In [None]:
df.head()

In [None]:
df.groupby(['user', 'sentiment']).size()

In [None]:
user_to_extract = '' # ENTER USERNAME
sentiment_type = '' # CHOOSE SENTIMENT

In [None]:
user_negative = df[(df["user"] == user_to_extract) & (df["sentiment"] == sentiment_type)]

In [None]:
# Show the first ten selected messages, each followed by a blank line and a divider
for entry in user_negative.head(10).itertuples():
  print(entry.message, "", "-----------", sep="\n")

In [None]:
# Columns are named via x=/hue= with data=df: newer seaborn releases interpret
# a first positional argument as `data`, not as the x variable.
sns.countplot(x="sentiment", hue="user", data=df).set_title('Sentiment by User')

In [None]:
only_negative = df[df["sentiment"] == 'negative']

In [None]:
# Columns are named via x=/hue= with data=only_negative: newer seaborn releases
# interpret a first positional argument as `data`, not as the x variable.
sns.countplot(x="sentiment", hue="user", data=only_negative).set_title('Negative Sentiment by User')

In [None]:
df.sort_values(by=['sentiment_score']).head(10)