In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import umap
from sklearn.manifold import TSNE
from scipy.stats import pearsonr

# Load the raw table and lower-case every column name in a single chained step.
df = pd.read_csv("./dataset.csv").rename(columns=str.lower)
df.columns


In [None]:
# Pie chart of the class balance; every wedge is labelled "<class> (<share>%)".
sns.set(font_scale=1.5)
plt.figure(figsize=(10, 10))

# Relative frequency of each class, expressed in percent.
class_counts = df["class"].value_counts(normalize=True).mul(100)
class_labels = [f"{label} ({value:.1f}%)" for label, value in class_counts.items()]

patches, texts = plt.pie(
    x=class_counts,
    labels=class_labels,
    startangle=90,
    labeldistance=0.5,
)

# Style all wedge labels in one call instead of looping over them.
plt.setp(texts, fontweight="bold", horizontalalignment="center", fontsize=20)

plt.axis("equal")
plt.show()


In [None]:
# Display the loaded table for a first visual inspection.
df


In [None]:
# Print the column dtypes and non-null counts of the loaded table.
df.info()


In [None]:
# Number of missing values in each column.
df.isna().sum()


In [None]:
# Summary statistics for every column except the last one
# (presumably the class label -- confirm against the dataset layout).
dfn = df.iloc[:, :-1]
# The "count" row is dropped from the summary before rounding.
dfn.describe().drop(index="count").round(3)


In [None]:
# The theme does not depend on the column being plotted, so it is applied once
# up front rather than on every loop iteration.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)

# One histogram with a KDE overlay per feature, split by class. The features
# are selected by name (everything except "class") so the loop does not depend
# on "class" happening to be the last column.
for column in df.columns.drop("class"):
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(data=df, x=column, kde=True, hue="class", ax=ax)

    ax.set_xlabel(column.title(), labelpad=10)
    ax.set_ylabel("Count", labelpad=10)

    plt.show()
    # Release the figure explicitly so open figures do not pile up in the loop.
    plt.close(fig)


## Normalize


In [None]:
def normalize(df):
    """Return a min-max scaled copy of ``df``.

    Every numeric column is rescaled with ``(x - min) / (max - min)``.
    Non-numeric columns are copied unchanged and the input frame itself is
    not modified.

    A constant column has ``max == min``; dividing by that zero range would
    fill the column with NaN, so such columns are mapped to 0.0 instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    normed = df.copy()
    # "number" selects every numeric dtype (int32, float32, ...), not only the
    # 64-bit ones.
    numcols = df.select_dtypes(include="number").columns
    if len(numcols) == 0:
        return normed

    lowest = normed[numcols].min()
    spread = normed[numcols].max() - lowest
    # A zero range means the numerator is all zeros too; dividing by 1 instead
    # yields 0.0 rather than 0/0 = NaN.
    spread = spread.mask(spread == 0, 1)

    normed[numcols] = (normed[numcols] - lowest) / spread
    return normed


# Scaled copy of the full dataset; normalize() works on a copy, so `df` keeps
# its original values.
df_normed = normalize(df)


## Outliers


### Initial


In [None]:
def create_outlier_boxplot(df, columns=None):
    """Draw horizontal box plots of the feature columns, split by class.

    The frame is melted to long form (one row per observation and metric) so
    that all metrics share a single value axis, with one group of boxes per
    metric and one box per class inside each group. The legend is moved
    outside the plot area, to the right of the axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``class`` column plus every column being plotted.
        The callers in this notebook pass min-max normalised frames.
    columns : list of str, optional
        Feature columns to plot. When omitted, the seven shape features
        listed below are used.
    """
    custom_params = {"axes.spines.right": False, "axes.spines.top": False}
    sns.set_theme(style="ticks", rc=custom_params)

    if columns is None:
        columns = [
            "area",
            "majoraxislength",
            "minoraxislength",
            "eccentricity",
            "convexarea",
            "extent",
            "perimeter",
        ]
    columns_to_melt = list(columns)

    df_melted = pd.melt(
        df,
        id_vars="class",
        value_vars=columns_to_melt,
        var_name="metric",
        value_name="value",
    )

    plt.figure(figsize=(12.8, 9.6))

    ax = sns.boxplot(
        data=df_melted,
        x="value",
        y="metric",
        hue="class",
    )

    ax.set_xlabel("Value")
    ax.set_ylabel("Metric")

    # Capture the legend entries before the automatic legend is discarded.
    handles, labels = ax.get_legend_handles_labels()

    # `legend_` is None when no legend was drawn; guard so that case does not
    # fail with AttributeError.
    if ax.legend_ is not None:
        ax.legend_.remove()
    # Re-create the legend just outside the right edge of the axes.
    ax.legend(
        handles=handles,
        labels=labels,
        loc="center left",
        bbox_to_anchor=(0.99, 0.5),
        frameon=False,
    )

    plt.tight_layout()
    plt.show()


# Box plots of the normalised features for the full dataset (no rows removed).
create_outlier_boxplot(df_normed)


### Removal


In [None]:
# Seaborn theme for this cell: "ticks" style with the top and right spines hidden.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)


def get_outliers(df: pd.DataFrame, column: str):
    """Classify the rows of ``df`` as mild or extreme outliers in ``column``.

    Fences are derived from the interquartile range (IQR) of the column:

    * extreme outlier: below ``Q1 - 3 * IQR`` or above ``Q3 + 3 * IQR``
    * mild outlier: outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` but not extreme

    Missing values are never reported as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : str
        Name of the numeric column to inspect.

    Returns
    -------
    tuple
        ``(counts, extreme_index, mild_index)`` where ``counts`` is
        ``{"mild": int, "extreme": int}`` and the two indexes hold the row
        labels of the extreme and mild outliers respectively.
    """
    values = df[column]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    extreme_lower_bound = Q1 - 3 * IQR
    extreme_upper_bound = Q3 + 3 * IQR

    # Expressed as "outside the fence" rather than as the negation of "inside
    # the fence": every comparison with NaN is False, so a negated test would
    # classify missing values as extreme outliers.
    is_extreme = (values < extreme_lower_bound) | (values > extreme_upper_bound)
    is_mild = ((values < lower_bound) | (values > upper_bound)) & ~is_extreme

    return (
        {"mild": int(is_mild.sum()), "extreme": int(is_extreme.sum())},
        df.index[is_extreme.to_numpy()],
        df.index[is_mild.to_numpy()],
    )


def find_outlier_counts(df: pd.DataFrame) -> dict:
    """Map every numeric column of ``df`` to its mild/extreme outlier counts.

    Returns a dict keyed by column name whose values are the
    ``{"mild": ..., "extreme": ...}`` dicts produced by ``get_outliers``.
    """
    numeric_columns = df.select_dtypes(include=["number"]).columns
    # get_outliers returns (counts, extreme_index, mild_index); only the
    # counts are kept here.
    return {name: get_outliers(df, name)[0] for name in numeric_columns}


def find_outlier_indices(df: pd.DataFrame):
    """Collect the row labels of all outliers across the numeric columns.

    Returns ``(extreme_outlier_indices, outlier_indices)`` as two plain lists.
    Labels are appended column by column without de-duplication, so a row that
    is an outlier in several columns appears once per column.
    """
    numeric_columns = df.select_dtypes(include=["number"]).columns
    per_column = [get_outliers(df, name) for name in numeric_columns]

    # Each entry is (counts, extreme_index, mild_index); flatten the two index
    # collections while keeping the column order.
    extreme_labels = [label for _, extreme, _ in per_column for label in extreme]
    mild_labels = [label for _, _, mild in per_column for label in mild]

    return extreme_labels, mild_labels


# Per-column outlier counts computed over the whole dataset (classes pooled).
outlier_counts_df = find_outlier_counts(df)

print("Outlier counts:")
for col, counts in outlier_counts_df.items():
    print(f"{col} - Mild: {counts['mild']}, Extreme: {counts['extreme']}")

# For removal, the outlier fences are computed separately within each class.
df0 = df[df["class"] == "Kecimen"]
df1 = df[df["class"] == "Besni"]

df0eoi, df0oi = find_outlier_indices(df0)
df1eoi, df1oi = find_outlier_indices(df1)

# Row labels of both classes combined; a label may occur more than once.
extreme_outlier_indices = [*df0eoi, *df1eoi]
outlier_indices = [*df0oi, *df1oi]
print(f"\nTotal extreme outlier count: {len(extreme_outlier_indices)}")
print(f"Total mild outlier count: {len(outlier_indices)}\n")

# Only the extreme outliers are dropped; mild ones stay in the cleaned frame.
df_clean: pd.DataFrame = df.drop(index=extreme_outlier_indices)
df_clean_normed: pd.DataFrame = normalize(df_clean)

create_outlier_boxplot(df_clean_normed)


## Correlations


In [None]:
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)

# Correlations are computed on the cleaned data (extreme outliers removed).
df_clean_numeric = df_clean.select_dtypes(include=["float64", "int64"])

correlation_matrix = df_clean_numeric.corr()
# Exported once, rounded to three decimals.
correlation_matrix.round(3).to_latex("correlation_matrix.tex", float_format="%.3f")

# Hide the strict upper triangle (k=1 keeps the diagonal visible), since the
# matrix is symmetric.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool), 1)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", mask=mask)
plt.xticks(rotation=45)
plt.show()

# p-value of every pairwise Pearson correlation: element [1] of pearsonr's
# result. DataFrame.corr puts 1 on the diagonal for a callable method, so
# subtracting the identity matrix turns the diagonal into 0.
pvalues = df_clean_numeric.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(
    len(df_clean_numeric.columns)
)

pvalues.round(3).to_latex("pvalues.tex", float_format="%.3f")


# Algorithms


In [None]:
# Seaborn theme for this cell: "ticks" style with the top and right spines hidden.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)


def graph_algo(results):
    """Scatter-plot a 2-D embedding, coloured by class and marked by outlier type.

    Reads the module-level ``df`` (for the class labels and row index) and the
    module-level ``outlier_indices`` / ``extreme_outlier_indices`` lists.

    Parameters
    ----------
    results : array-like of shape (len(df), 2)
        One embedded point per row of ``df``, in the same row order.
    """
    # Build the frame on df's own index: the class assignment below aligns on
    # the index and the outlier lists hold df's row labels, so both would hit
    # the wrong rows with a fresh 0..n-1 index if df is indexed differently.
    local_df = pd.DataFrame(
        np.asarray(results), columns=["comp1", "comp2"], index=df.index
    )
    local_df["class"] = df["class"]
    local_df["type"] = "inlier"
    local_df.loc[list(outlier_indices), "type"] = "outlier"
    # Assigned last so that "extreme_outlier" wins for rows present in both lists.
    local_df.loc[list(extreme_outlier_indices), "type"] = "extreme_outlier"

    plt.figure(figsize=(18, 10))

    # Marker size per point type; outliers are drawn larger than inliers.
    d = {"inlier": 50, "outlier": 200, "extreme_outlier": 300}
    local_df["size_outliers"] = local_df["type"].map(d)

    ax = sns.scatterplot(
        x="comp1",
        y="comp2",
        hue="class",
        style="type",
        data=local_df,
        size="size_outliers",
        sizes=(50, 300),
    )
    ax.set(xlabel="X", ylabel="Y")
    plt.show()


## t-SNE


t-SNE embedding with outliers marked.


In [None]:
# Seaborn theme for this cell: "ticks" style with the top and right spines hidden.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)


def graph_tsne(perplexity) -> None:
    """Embed the normalised features with t-SNE and plot the result.

    ``perplexity`` is forwarded to ``TSNE``; the random state is fixed at 42.
    The embedding is computed on ``df_normed`` without its ``class`` column
    and handed to ``graph_algo`` for plotting.
    """
    features = df_normed.drop(columns="class")
    embedder = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    graph_algo(embedder.fit_transform(features))


# Draw the t-SNE embedding at three different perplexity values.
for perplexity in (10, 30, 50):
    graph_tsne(perplexity)


## UMAP


UMAP embedding with outliers marked.


In [None]:
# Seaborn theme for this cell: "ticks" style with the top and right spines hidden.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)


def graph_umap(par) -> None:
    """Embed the normalised features with UMAP and plot the result.

    ``par`` is forwarded as UMAP's ``n_neighbors``; the random state is fixed
    at 42 and ``n_jobs`` at 1. The embedding is computed on ``df_normed``
    without its ``class`` column and handed to ``graph_algo`` for plotting.
    """
    features = df_normed.drop(columns=["class"])
    embedder = umap.UMAP(n_jobs=1, random_state=42, n_neighbors=par)
    graph_algo(embedder.fit_transform(features))


# Draw the UMAP embedding at three different neighbourhood sizes.
for n_neighbors in (5, 15, 50):
    graph_umap(n_neighbors)
