## Imports

In [None]:
import itertools
import re

import spacy
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from datasets import load_dataset
from tqdm.notebook import tqdm

In [None]:
# Render all figure text in a monospace face, preferring DejaVu Sans Mono.
plt.rcParams.update(
    {
        "font.monospace": ["DejaVu Sans Mono"],
        "font.family": "monospace",
    }
)

#### NLP analysis

In [None]:
# Load the "english" configuration of the ML-Projects-Kiel/tweetyface dataset;
# the cells below read its "train" and "validation" splits.
dataset = load_dataset("ML-Projects-Kiel/tweetyface", "english")

In [None]:
# spaCy pipeline used to tokenise the tweets; the helper functions below read
# pos_, lemma_, is_stop, is_punct and is_space from its tokens.
nlp = spacy.load("en_core_web_sm")

Combine the train and validation splits into a single DataFrame.

In [None]:
# Build one two-column frame (label, text) per split, then stack them with
# the training rows first and the validation rows after.
_df_dict = {}
for _data_set in ("train", "validation"):
    _columns = {
        "label": dataset[_data_set]["label"],
        "text": dataset[_data_set]["text"],
    }
    _df_dict[_data_set] = pd.DataFrame(_columns)
df = pd.concat([_df_dict[_name] for _name in ("train", "validation")])

Create a DataFrame with the unigrams and part-of-speech (POS) tags of each tweet.

In [None]:
# Preview the first rows of the combined train + validation frame.
df.head()

In [None]:
def return_pos(doc) -> list:
    """Return the ``pos_`` attribute of every token in ``doc``, in token order."""
    pos_tags = [tok.pos_ for tok in doc]
    return pos_tags


def return_words(doc) -> list:
    """Return the ``lemma_`` attribute of every token in ``doc``, in token order."""
    lemmas = [tok.lemma_ for tok in doc]
    return lemmas


def remove_stops(doc) -> list:
    """Return the lemmas of ``doc`` without stop words, punctuation and whitespace tokens.

    A token is kept only when none of its ``is_stop``, ``is_punct`` and
    ``is_space`` flags is set; the kept lemmas stay in token order.
    """
    kept = []
    for tok in doc:
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        kept.append(tok.lemma_)
    return kept

In [None]:
# Clean-up patterns are compiled once here instead of being looked up for every tweet.
_NEWLINE_RE = re.compile("\n")
_URL_RE = re.compile(r"http\S+")


def _clean_tweet(txt):
    """Return ``txt`` with line breaks, links, HTML ampersands and extra whitespace normalised."""
    txt = _NEWLINE_RE.sub(" ", txt)  # line breaks -> single space
    txt = _URL_RE.sub("URL", txt)  # every link becomes the placeholder token "URL"
    txt = txt.replace("&amp;", "&")  # undo the HTML-escaped ampersand
    return " ".join(txt.split())  # collapse whitespace runs and strip both ends


# Build one frame per user holding, for every tweet, its POS tags, its lemmas
# and its lemmas without stop words / punctuation / whitespace.
df_list = list()
# groupby hands over each user's rows directly instead of re-filtering the
# whole frame once per user.
for _user, user_df in tqdm(df.groupby("label")):
    user_df = user_df.copy()
    # Preprocess steps (single pass over the texts)
    user_df["text"] = [_clean_tweet(txt) for txt in user_df["text"]]

    # nlp.pipe processes the texts as a batched stream, which is considerably
    # faster than calling nlp() separately on each tweet.
    docs = list(nlp.pipe(user_df["text"]))
    _df = pd.DataFrame({"label": [_user] * len(docs)})
    _df["pos"] = [return_pos(doc) for doc in docs]
    _df["words"] = [return_words(doc) for doc in docs]
    _df["words_nostops"] = [remove_stops(doc) for doc in docs]
    df_list.append(_df)

In [None]:
# Stack the per-user frames. Each of them was created with its own default
# 0..n-1 index and ignore_index is not set, so index values repeat across
# users; the explode helpers below therefore identify a tweet by the pair
# (index, label) rather than by the index alone.
df = pd.concat(df_list)

#### Dict to translate labels

In [None]:
# Names attached to the "label" feature of the training split (one per user).
full_features = dataset["train"].features["label"].names
# Position in that list -> name, used to translate the numeric labels.
label_translation = dict(enumerate(full_features))

In [None]:
# Display the id -> name mapping for inspection.
label_translation

In [None]:
# Replace every "label" value that is a key of label_translation with the
# corresponding name; values without a matching key are left untouched.
df["label"] = df["label"].replace(label_translation)

In [None]:
# Preview the token-level frame after the labels have been translated.
df.head()

In [None]:
# Number of rows (tweets) per user, as a two-column frame: label, tweets.
total_tweets = df.groupby("label").size().reset_index(name="tweets")
total_tweets

#### Inspect POS

In [None]:
def explode_df(df, _key):
    """Count the values of a list column per row and return them in wide format.

    Args:
        df: Frame with a "label" column and a list-valued column ``_key``.
            Its index must be unnamed, because the code relies on
            ``reset_index`` producing a column called "index".
        _key: Name of the list-valued column to count.

    Returns:
        A frame with the columns "index" and "label" followed by one column
        per distinct value found in ``_key``. Each cell holds how often that
        value occurs in the row identified by (index, label); values that do
        not occur in a row are NaN.
    """
    group_cols = ["index", "label", _key]
    # One row per list element, keeping the original index as a column.
    long_form = df.reset_index().explode(_key)[group_cols]
    counts = long_form.groupby(group_cols).size().reset_index(name="counts")
    wide = counts.pivot(index=["index", "label"], columns=_key, values="counts")
    return wide.reset_index()

In [None]:
# Wide table of POS-tag counts: one row per (index, label), one column per tag.
df_pos = explode_df(df, "pos")

Create grouped POS DF.

In [None]:
# Sum the per-tweet tag counts for every user (the helper "index" column is
# not a count, so it is dropped before aggregating).
df_pos_group = df_pos.drop(columns="index").groupby("label").sum().reset_index()
# Total number of tokens per user = sum over all tag columns.
df_pos_group["total"] = df_pos_group.drop(columns="label").sum(axis=1)
# Attach the number of tweets per user.
df_pos_group = df_pos_group.merge(total_tweets, on="label")

In [None]:
# Horizontal bar chart: total token count for each user.
ax = sns.barplot(df_pos_group, y="label", x="total")
ax.set_ylabel("Twitter User")
ax.set_xlabel("Number of total tokens")
ax.set_title("Total tokens per Twitter User")
plt.show()

In [None]:
# Average tweet length in tokens: total tokens divided by number of tweets.
df_pos_group["num_per_tweet"] = df_pos_group["total"].div(df_pos_group["tweets"])
ax = sns.barplot(df_pos_group, y="label", x="num_per_tweet")
ax.set_ylabel("Twitter User")
ax.set_xlabel("Tokens per Tweet")
ax.set_title("Tokens per Tweet per Twitter User")
plt.show()

In [None]:
# Reshape the per-user POS totals into long format: one row per (user, POS tag).
#
# The POS columns are selected by name instead of by position (`columns[1:-3]`),
# so the result no longer depends on whether the "num_per_tweet" helper column
# has already been added to df_pos_group. Only the needed columns are passed to
# melt: a frame that already contains a "num_per_tweet" column clashes with
# value_name="num_per_tweet" (ValueError on recent pandas, FutureWarning on
# older versions).
_non_pos_columns = {"label", "total", "tweets", "num_per_tweet"}
_pos_columns = [col for col in df_pos_group.columns if col not in _non_pos_columns]
df_pos_t = pd.melt(
    df_pos_group[["label", "total", *_pos_columns]],
    id_vars=["label", "total"],
    value_vars=_pos_columns,
    var_name="POS",
    # NOTE(review): the melted values are the summed tag counts per user, not
    # a per-tweet average, despite the column name. The name is kept because
    # the plotting cell reads it.
    value_name="num_per_tweet",
    ignore_index=True,
).sort_values(by=["POS"])
df_pos_t.head()

In [None]:
# One panel per user showing the values of df_pos_t for each POS tag.
# Sorted labels give a reproducible panel order; iterating a set of strings
# does not, so the layout used to change between sessions.
_labels = sorted(set(df_pos_t["label"]))
ncols = 4
# Size the grid from the data (ceil division) so that more than 12 users no
# longer runs past the last row with an IndexError.
nrows = max(1, -(-len(_labels) // ncols))
fig, axes = plt.subplots(
    figsize=(20, 10 * nrows / 3),  # keeps the original 20x10 size for three rows
    nrows=nrows,
    ncols=ncols,
    squeeze=False,  # always a 2-D array, even for a single row
)
_panels = axes.ravel()
for ax, label in zip(_panels, _labels):
    sns.barplot(df_pos_t[df_pos_t["label"] == label], y="POS", x="num_per_tweet", ax=ax)
    ax.set_title(label)
# Hide the panels that have no user assigned instead of showing empty axes.
for ax in _panels[len(_labels):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()

#### Inspect Unigrams

In [None]:
def explode_df_unigram(df, _key):
    """Count how often each value of a list column occurs per label.

    Args:
        df: Frame with a "label" column and a list-valued column ``_key``.
        _key: Name of the list-valued column to count.

    Returns:
        A long-format frame with the columns "label", ``_key`` and "counts",
        sorted by label and value, where "counts" is the number of
        occurrences of the value across all rows of that label.
    """
    # The index is discarded up front: it is not needed for per-label totals,
    # and a fresh unique index keeps explode independent of whatever index
    # (duplicated, named, ...) the caller's frame carries.
    exploded = df[["label", _key]].reset_index(drop=True).explode(_key)
    # Counting directly per (label, value) gives the same totals as counting
    # per tweet first and summing afterwards, in a single aggregation.
    return exploded.groupby(["label", _key]).size().reset_index(name="counts")

In [None]:
# Per-user counts of every lemma (stop words and punctuation included).
df_uni = explode_df_unigram(df, "words")

In [None]:
def plot_uni_bar(df, key, ncols=4, top_n=20):
    """Draw one bar chart per user with that user's most frequent unigrams.

    Args:
        df: Long-format frame with a "label" column, a "counts" column and
            the unigram column named by ``key``.
        key: Name of the column holding the unigrams.
        ncols: Number of panels per row of the subplot grid.
        top_n: How many of the highest-count unigrams to show per user.
    """
    # Sorted labels give a reproducible panel order; iterating a set of
    # strings does not.
    labels = sorted(set(df["label"]))
    # Size the grid from the data (ceil division) so that more users than
    # panels no longer raises an IndexError.
    nrows = max(1, -(-len(labels) // ncols))
    fig, axes = plt.subplots(
        figsize=(5 * ncols, 10 * nrows / 3),  # 20x10 for the default 3x4 grid
        nrows=nrows,
        ncols=ncols,
        squeeze=False,  # always a 2-D array, even for a single row or column
    )
    panels = axes.ravel()
    for ax, label in zip(panels, labels):
        top = df[df["label"] == label].nlargest(top_n, "counts")
        sns.barplot(top, x=key, y="counts", ax=ax)
        ax.set_title(label)
        # Rotate the unigram labels so that they do not overlap.
        ax.set_xticklabels(top[key].to_list(), rotation=45, ha="right")
        ax.set_xlabel("Unigrams")
        ax.set_ylabel("Amount")
    # Hide the panels that have no user assigned instead of showing empty axes.
    for ax in panels[len(labels):]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()

In [None]:
# Most frequent lemmas per user, stop words and punctuation included.
plot_uni_bar(df_uni, "words")

In [None]:
# Same plot without stop words, punctuation and whitespace tokens. The link
# placeholder is filtered out in both spellings ("url" and "URL").
df_uni_nostop = explode_df_unigram(df, "words_nostops")
_is_url_placeholder = df_uni_nostop["words_nostops"].isin(["url", "URL"])
plot_uni_bar(df_uni_nostop[~_is_url_placeholder], "words_nostops")