In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import umap
from sklearn.manifold import TSNE
from scipy.stats import pearsonr

# Global plot style: seaborn "ticks" theme without the top/right axis spines.
custom_params = {
    "axes.spines.top": False,
    "axes.spines.right": False,
}
sns.set_theme(rc=custom_params, style="ticks")

# Load the dataset and lower-case every column label in one step.
df = pd.read_csv("./dataset.csv").rename(columns=str.lower)
df.columns


In [None]:
# Only the last expression of a cell is rendered, so a bare `df` placed before
# `df.info()` shows nothing. Display a short preview explicitly instead.
display(df.head())
# Column dtypes, non-null counts and memory usage.
df.info()


In [None]:
# Number of missing values in each column.
df.isna().sum()


In [None]:
# Every column except the last one (presumably the `class` label - verify).
dfn = df.iloc[:, :-1]
# Summary statistics without the `count` row, rounded to three decimals.
dfn.describe().drop(index="count").round(3)


In [None]:
# One histogram with a KDE overlay per column (last column excluded),
# coloured by `class`.
for feature in df.columns[:-1]:
    _, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(data=df, x=feature, hue="class", kde=True, ax=ax)
    plt.show()


## Normalize

In [None]:
def normalize(df):
    """Min-max scale every numeric column of ``df`` into the range [0, 1].

    The numeric columns are detected from the frame that is passed in, so the
    function does not depend on any module-level variable being defined
    (or matching this particular frame) at call time.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to scale. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` whose numeric columns hold ``(x - min) / (max - min)``.
        Non-numeric columns are copied unchanged. A constant column
        (``max == min``) becomes all zeros rather than NaN from a 0/0 division.
    """
    normed = df.copy()
    numeric_cols = normed.select_dtypes(include="number").columns
    if numeric_cols.empty:
        # Nothing to scale; still return a copy so callers never alias the input.
        return normed

    lows = normed[numeric_cols].min()
    spans = normed[numeric_cols].max() - lows
    # A zero span means the column is constant; dividing by 1 maps it to 0.0.
    spans = spans.replace(0, 1)
    normed[numeric_cols] = (normed[numeric_cols] - lows) / spans
    return normed

# Labels of the float64/int64 columns of the raw frame.
numcols = df.select_dtypes(include=['float64', 'int64']).columns
# Min-max scaled copy of the full dataset; used for the first box plot and
# for the t-SNE / UMAP embeddings further down.
df_normed = normalize(df)


## Outliers

### Initial

In [None]:
def create_outlier_boxplot(df):
    """Draw one horizontal box plot per metric, split by ``class``.

    ``df`` must contain a ``class`` column and the seven metric columns listed
    below. The frame is reshaped to long form (one row per sample/metric pair)
    so that all metrics share a single value axis.
    """
    metrics = [
        'area',
        'majoraxislength',
        'minoraxislength',
        'eccentricity',
        'convexarea',
        'extent',
        'perimeter',
    ]
    long_form = df.melt(
        id_vars='class',
        value_vars=metrics,
        var_name='metric',
        value_name='value',
    )

    plt.figure(figsize=(12.8, 9.6))
    sns.boxplot(data=long_form, x='value', y='metric', hue='class')
    plt.show()

# Box plots of the min-max scaled data before any outlier removal.
create_outlier_boxplot(df_normed)


### Removal

In [None]:
def get_outlier_indices(df: pd.DataFrame, column: str,
                        mild_factor: float = 1.5, extreme_factor: float = 3.0):
    """Return the index labels of IQR-fence outliers in one column.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``. Missing values are never reported: a NaN compares
    False against both fences, so testing "outside the fences" directly
    (instead of negating an "inside the fences" mask) leaves them out.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the column.
    column : str
        Name of a numeric column of ``df``.
    mild_factor : float, default 1.5
        IQR multiplier for the ordinary fences.
    extreme_factor : float, default 3.0
        IQR multiplier for the extreme fences.

    Returns
    -------
    tuple of pd.Index
        ``(extreme_outlier_index, outlier_index)``. With the default factors
        every extreme outlier is also contained in ``outlier_index``.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    def _outside(factor):
        """Boolean mask of values strictly outside the fences for ``factor``."""
        return (values < q1 - factor * iqr) | (values > q3 + factor * iqr)

    extreme_outliers = values[_outside(extreme_factor)]
    outliers = values[_outside(mild_factor)]

    return extreme_outliers.index, outliers.index


def find_outliers(df: pd.DataFrame):
    """Collect the outlier row labels across every numeric column of ``df``.

    Each numeric column is checked with ``get_outlier_indices``. A row that is
    an outlier in several columns is reported once, so the length of each
    returned list is a count of distinct rows.

    Returns
    -------
    tuple of list
        ``(extreme_outlier_indices, outlier_indices)``, each in first-seen
        order. Plain lists are returned so callers can concatenate the
        results of several frames with ``+``.
    """
    outlier_indices = list()
    extreme_outlier_indices = list()

    for column in df.select_dtypes(include=['number']).columns:
        eoi, oi = get_outlier_indices(df, column)
        outlier_indices.extend(oi)
        extreme_outlier_indices.extend(eoi)

    # dict.fromkeys drops repeated labels while preserving insertion order.
    unique_extreme = list(dict.fromkeys(extreme_outlier_indices))
    unique_outliers = list(dict.fromkeys(outlier_indices))
    return unique_extreme, unique_outliers

# Outliers are detected separately inside each class.
df0 = df[df["class"] == "Kecimen"]
df1 = df[df["class"] == "Besni"]

df0eoi, df0oi = find_outliers(df0)
df1eoi, df1oi = find_outliers(df1)

# Merge the per-class results into dataset-wide lists of row labels.
extreme_outlier_indices = [*df0eoi, *df1eoi]
outlier_indices = [*df0oi, *df1oi]
print(f"Total extreme outlier count: {len(extreme_outlier_indices)}")
print(f"Total mild outlier count: {len(outlier_indices)}")

# Only the extreme outliers are dropped; the cleaned frame is then re-scaled.
df_clean: pd.DataFrame = df.drop(index=extreme_outlier_indices)
df_clean_normed: pd.DataFrame = normalize(df_clean)

create_outlier_boxplot(df_clean_normed)


## Correlations

In [None]:
df_clean_numeric = df_clean.select_dtypes(include=["float64", "int64"])

correlation_matrix = df_clean_numeric.corr()
# Export the LaTeX table once, already rounded to three decimals. (A second,
# unrounded write to the same path would just be overwritten.)
correlation_matrix.round(3).to_latex("correlation_matrix.tex", float_format="%.3f")

# Mask the strict upper triangle (k=1 keeps the diagonal visible) so each
# pair of columns is annotated only once.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool), 1)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", mask=mask)
plt.xticks(rotation=45)
plt.show()

# Pairwise Pearson p-values. DataFrame.corr with a callable always puts 1.0 on
# the diagonal, so subtracting the identity matrix sets those entries to 0.
pvalues = (df_clean_numeric.corr(method=lambda x, y:
               pearsonr(x, y)[1]) - np.eye(len(df_clean_numeric.columns)))

pvalues.round(3).to_latex("pvalues.tex", float_format="%.3f")


In [None]:
def graph_algo(results):
    """Scatter-plot a 2-D embedding, marking class and outlier status.

    ``results`` is a two-column array-like with one row per sample. The class
    labels come from the module-level ``df``; the outlier status comes from the
    module-level ``outlier_indices`` and ``extreme_outlier_indices``. Class is
    shown by colour, outlier status by both marker style and marker size.
    """
    marker_sizes = {"inlier": 1, "outlier": 2, "extreme_outlier": 3}

    embedding = pd.DataFrame(results, columns=["comp1", "comp2"])
    embedding["class"] = df["class"]
    embedding["type"] = "inlier"
    # The extreme label is written last so it wins for rows in both lists.
    for label, rows in (
        ("outlier", outlier_indices),
        ("extreme_outlier", extreme_outlier_indices),
    ):
        embedding.loc[list(rows), "type"] = label
    embedding["size_outliers"] = embedding["type"].map(marker_sizes)

    plt.figure(figsize=(10, 8))
    axes = sns.scatterplot(
        data=embedding,
        x="comp1",
        y="comp2",
        hue="class",
        style="type",
        size="size_outliers",
    )
    axes.set(xlabel='X', ylabel='Y')
    plt.show()


## t-SNE

t-SNE embeddings at several perplexity values, with outliers marked.

In [None]:
def graph_tsne(perplexity) -> None:
    """Project the scaled features to 2-D with t-SNE and plot the embedding.

    ``perplexity`` is passed straight to ``TSNE``; the random state is fixed
    at 42 so repeated runs use the same seed.
    """
    features = df_normed.drop(columns="class")
    embedder = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    graph_algo(embedder.fit_transform(features))

# Compare the embedding at a low, a medium and a high perplexity.
for perplexity in (10, 30, 50):
    graph_tsne(perplexity)


## UMAP

UMAP embeddings at several `n_neighbors` values, with outliers marked.

In [None]:
def graph_umap(par) -> None:
    """Project the scaled features with UMAP and plot the embedding.

    ``par`` is used as UMAP's ``n_neighbors``. The random state is fixed at 42
    and ``n_jobs`` at 1.
    """
    features = df_normed.drop(columns=['class'])
    projector = umap.UMAP(n_neighbors=par, random_state=42, n_jobs=1)
    graph_algo(projector.fit_transform(features))

    
# Compare the embedding at a small, a medium and a large neighbourhood size.
for n_neighbors in (5, 15, 50):
    graph_umap(n_neighbors)
