# Imports/setting up directories
Note: I'm using the white grid style (`whitegrid`) for seaborn (my preferred plotting library)

In [None]:
%load_ext jupyter_black
# Utilities
import os
from pathlib import Path

# Data handling / generic ML
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

# Custom
from src.eda_utils import pref_pairplot, pref_violinplots

In [None]:
# Every project directory is resolved relative to the current working directory.
CWD = Path.cwd()

# Input side: the pickled document corpus is read from data/corpus_files.
DATA = CWD.joinpath("data")
CORPUS_FILES = DATA.joinpath("corpus_files")

# Output side: locations for the artifacts produced by this analysis.
EDA = CWD.joinpath("eda")
LANG_METRICS = EDA.joinpath("doc_language_metrics")

In [None]:
# Make sure both output directories exist before anything is written to them;
# exist_ok keeps re-running this cell harmless.
for out_dir in (EDA, LANG_METRICS):
    out_dir.mkdir(parents=True, exist_ok=True)

# Exploratory data analysis

In [None]:
# Load the pickled document corpus, keep only rows that have a submission
# flair, and order them by UID.
doc_corpus_path = CORPUS_FILES / "document_corpus.pkl"
doc_df: pd.DataFrame = (
    pd.read_pickle(doc_corpus_path)
    .loc[lambda frame: frame["submission_flair"].notna()]
    .copy()
    .sort_values(by="UID")
)
# Print column dtypes and non-null counts as a quick sanity check.
doc_df.info()

In [None]:
# Call the project's pref_pairplot helper (src.eda_utils) for the "raw_md" prefix.
# NOTE(review): presumably draws a pair plot of the raw_md_* metric columns; only
# the prefix is passed here, so confirm where the helper gets its data.
pref_pairplot("raw_md")

In [None]:
# Call the project's violin-plot helper (src.eda_utils) for the "raw_md" prefix.
# The helper is imported under the name `pref_violinplots`; the bare name
# `violinplots` is not defined in this notebook and raised a NameError.
# NOTE(review): presumably plots the raw_md_* metric columns — confirm in the helper.
pref_violinplots("raw_md")

In [None]:
# Call the project's pref_pairplot helper (src.eda_utils) for the "clean_md" prefix.
# NOTE(review): presumably covers the clean_md_* metric columns — confirm in the helper.
pref_pairplot("clean_md")

In [None]:
# Call the project's pref_violinplots helper (src.eda_utils) for the "clean_md" prefix.
# NOTE(review): presumably covers the clean_md_* metric columns — confirm in the helper.
pref_violinplots("clean_md")

In [None]:
# Call the project's pref_pairplot helper (src.eda_utils) for the "doc_main" prefix.
# NOTE(review): presumably covers the doc_main_* metric columns — confirm in the helper.
pref_pairplot("doc_main")

In [None]:
# Call the project's pref_violinplots helper (src.eda_utils) for the "doc_main" prefix.
# NOTE(review): presumably covers the doc_main_* metric columns — confirm in the helper.
pref_violinplots("doc_main")

In [None]:
# Call the project's pref_pairplot helper (src.eda_utils) for the "doc_credit" prefix.
# NOTE(review): presumably covers the doc_credit_* metric columns — confirm in the helper.
pref_pairplot("doc_credit")

In [None]:
# Call the project's pref_violinplots helper (src.eda_utils) for the "doc_credit" prefix.
# NOTE(review): presumably covers the doc_credit_* metric columns — confirm in the helper.
pref_violinplots("doc_credit")

In [None]:
# Scatter of cleaned vs raw markdown character counts, coloured by flair.
ax = sns.scatterplot(
    data=doc_df,
    x="raw_md_char_count",
    y="clean_md_char_count",
    hue="submission_flair",
)
ax.set_title("Cleaned vs Raw ", fontsize=20)
ax.set_xlabel("# Characters (Raw MD)", fontsize=16)
ax.set_ylabel("# Characters (Clean MD)", fontsize=16)
# Persist the figure alongside the other language-metric plots.
plt.savefig(LANG_METRICS / "clean_v_raw_chars.pdf");

In [None]:
# Number of characters the cleaning step removed from each document.
doc_df["cleaned_chars"] = doc_df["raw_md_char_count"].sub(
    doc_df["clean_md_char_count"]
)

# Scatter of characters removed against the raw character count, coloured by flair.
ax = sns.scatterplot(
    data=doc_df,
    x="raw_md_char_count",
    y="cleaned_chars",
    hue="submission_flair",
)
ax.set_title("Characters Removed vs Raw Characters", fontsize=20)
ax.set_xlabel("# Characters (Raw MD)", fontsize=16)
ax.set_ylabel("# Characters Removed", fontsize=16)
plt.savefig(LANG_METRICS / "char_removed_v_raw_chars.pdf");

In [None]:
# Percentage of the raw markdown characters that cleaning removed.
removed_fraction = doc_df["cleaned_chars"] / doc_df["raw_md_char_count"]
doc_df["%Cleaned"] = removed_fraction * 100

# Distribution of that percentage across all documents.
ax = sns.histplot(data=doc_df, x="%Cleaned")
ax.set_title("% Characters Cleaned", fontsize=20)
plt.savefig(LANG_METRICS / "pct_chars_removed_dist.pdf");

In [None]:
# Credit text as a percentage of the combined credit + main character count.
credit_chars = doc_df["doc_credit_char_count"]
credit_plus_main = credit_chars + doc_df["doc_main_char_count"]
doc_df["%Credit"] = credit_chars / credit_plus_main * 100

# Distribution of the credit percentage across all documents.
ax = sns.histplot(data=doc_df, x="%Credit")
ax.set_title("% Credit Text", fontsize=20)
plt.savefig(LANG_METRICS / "pct_credit_dist.pdf");

In [None]:
# Credit text as a percentage of the combined credit + main character count
# (same formula as the histogram cell; recomputed in this cell as well).
credit_chars = doc_df["doc_credit_char_count"]
credit_plus_main = credit_chars + doc_df["doc_main_char_count"]
doc_df["%Credit"] = credit_chars / credit_plus_main * 100

# Compare the two percentages per document, coloured by flair.
ax = sns.scatterplot(
    data=doc_df,
    x="%Cleaned",
    y="%Credit",
    hue="submission_flair",
)
ax.set_title("% Characters Cleaned vs % Credit", fontsize=20)
plt.savefig(LANG_METRICS / "pct_chars_removed_v_pct_credit.pdf");

In [None]:
# Scatter of main-text length against the credit percentage, coloured by flair.
sns.scatterplot(
    data=doc_df,
    x="doc_main_char_count",
    y="%Credit",
    hue="submission_flair",
)
# The title describes the axes actually plotted (main-text characters, not
# "% Characters Cleaned").
plt.title("# Main Text Characters vs % Credit", fontsize=20)
# Use a file name of its own so this figure does not overwrite
# pct_chars_removed_v_pct_credit.pdf, which holds the %Cleaned vs %Credit plot.
plt.savefig(LANG_METRICS / "main_chars_v_pct_credit.pdf");

In [None]:
# Columns kept when exporting documents with unusual credit/cleaning ratios.
credit_clean_cols = (
    ["UID"]  # document identifier
    + ["%Cleaned", "%Credit"]  # derived percentage columns
    + ["submission_flair"]  # category used for colouring/filtering
    + ["credit_text", "clean_text"]  # text columns for manual inspection
)

In [None]:
# Documents with more than 10% credit text whose flair is not "Monster",
# ordered from the highest credit percentage down.
high_credit = doc_df["%Credit"] > 10
not_monster = doc_df["submission_flair"] != "Monster"
unusual_credit = (
    doc_df[high_credit & not_monster]
    .sort_values("%Credit", ascending=False)
    .loc[:, credit_clean_cols]
)
# Save for manual review, then display the table in the notebook.
unusual_credit.to_csv(EDA / "high_credit_ratio_texts.csv", index=False)
unusual_credit

In [None]:
# Documents where cleaning removed more than 20% of the characters while the
# credit percentage stayed at or below 10, ordered by the most-cleaned first.
heavily_cleaned = doc_df["%Cleaned"] > 20
low_credit = doc_df["%Credit"] <= 10
unusual_cleaning = (
    doc_df[heavily_cleaned & low_credit]
    .sort_values("%Cleaned", ascending=False)
    .loc[:, credit_clean_cols]
)
# Save for manual review, then display the table in the notebook.
unusual_cleaning.to_csv(EDA / "high_cleaning_ratio_texts.csv", index=False)
unusual_cleaning

### Investigating word/sentence length outliers

In [None]:
# Build boolean masks flagging documents whose average word length or average
# sentence length falls outside an IQR-based fence on either side.
#
# The upper and lower tests are combined with `|`: a value is an outlier when
# it is above the upper fence OR below the lower fence. Combining them with
# `&` requires a value to be on both sides at once, which selects essentially
# nothing.

# Average word length: fences at Q3 + IQR and Q1 - IQR.
word_len = doc_df["doc_main_avg_word_len"]
word_q75, word_q25 = np.percentile(word_len, [75, 25])
word_iqr = word_q75 - word_q25
word_inds = (word_len >= word_q75 + word_iqr) | (word_len <= word_q25 - word_iqr)

# Average sentence length: fences at median + 1.5 * IQR and median - 1.5 * IQR.
# NOTE(review): this fence (median-centred, 1.5x IQR) differs from the word
# fence above (quartile-based, 1x IQR); both definitions are kept as written —
# confirm whether they should match.
sent_len = doc_df["doc_main_avg_sent_len"]
sent_q75, sent_q25 = np.percentile(sent_len, [75, 25])
sent_iqr = sent_q75 - sent_q25
median = np.median(sent_len)
sent_inds = (sent_len >= median + (sent_iqr * 1.5)) | (
    sent_len <= median - (sent_iqr * 1.5)
)

# Columns shown/exported when inspecting the flagged documents.
sent_words_cols = [
    "UID",
    "doc_main_avg_word_len",
    "doc_main_avg_sent_len",
    "submission_flair",
    "clean_text",
]

In [None]:
# Select every document flagged by either outlier mask, restricted to the
# inspection columns; the selection is computed once and reused.
len_outliers = doc_df.loc[sent_inds | word_inds, sent_words_cols]
# NOTE(review): this CSV is written under DATA (with the index column), while
# the other CSV extracts in this notebook go under EDA without the index —
# confirm that is intended.
len_outliers.to_csv(DATA / "word_and_sent_len_outliers.csv")
len_outliers

In [None]:
# Distribution of the per-document section count, split by flair.
ax = sns.histplot(data=doc_df, x="num_sections", hue="submission_flair")
ax.set_title("Total Section Count", fontsize=20)
plt.savefig(LANG_METRICS / "doc_section_count.pdf");

In [None]:
# Same distribution on a log2 scale.
# NOTE(review): np.log2 maps a section count of 0 to -inf — confirm that
# num_sections is always at least 1.
section_counts = doc_df["num_sections"]
doc_df["log2_num_sections"] = np.log2(section_counts)

ax = sns.histplot(data=doc_df, x="log2_num_sections", hue="submission_flair")
ax.set_title("Total Section Count", fontsize=20)
plt.savefig(LANG_METRICS / "doc_log2section_count.pdf");