In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import umap
from sklearn.manifold import TSNE

# Apply seaborn's default theme before any figure is drawn.
sns.set_theme()

# Read the dataset and lower-case its column labels in one chained step.
df = pd.read_csv("./dataset.csv").rename(columns=str.lower)

# Show the resulting column labels as the cell output.
df.columns


In [None]:
# df.info() prints its summary to stdout and returns None, so it runs first.
# A bare `df` placed before it would be evaluated and silently discarded,
# because a notebook only renders the final expression of a cell; ending the
# cell with df.head() makes a preview of the rows actually appear.
df.info()
df.head()


In [None]:
# Number of missing values in each column.
df.isna().sum()


In [None]:
# Every column except the last one (presumably the class label - confirm
# against the dataset's column order).
dfn = df.iloc[:, :-1]

# Summary statistics without the 'count' row, rounded to three decimals.
dfn.describe().drop(index='count').round(3)


In [None]:
# One histogram with a KDE overlay per column (the last column is skipped),
# with the bars split and coloured by 'class'.
for feature in df.columns[:-1]:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(data=df, x=feature, hue='class', kde=True, ax=ax)
    plt.show()


## Normalize

In [None]:
# Min-max scale every float64/int64 column of a copy of `df` into [0, 1];
# non-numeric columns (e.g. 'class') are carried over unchanged.
numcols = df.select_dtypes(include=['float64', 'int64']).columns
df_normed = df.copy()

col_min = df_normed[numcols].min()
col_range = df_normed[numcols].max() - col_min
# A constant column has max == min, i.e. a zero range, and 0 / 0 would fill
# it with NaN. Dividing by 1 instead maps such a column to 0.0.
col_range = col_range.where(col_range != 0, 1)

df_normed[numcols] = (df_normed[numcols] - col_min) / col_range


## Outliers

### Initial

In [None]:
def create_outlier_boxplot(df):
    """Draw horizontal boxplots of the shape metrics, split by class.

    Args:
        df: DataFrame containing a 'class' column plus the seven metric
            columns listed in ``metrics`` below. All metrics share one value
            axis in the resulting figure.
    """
    metrics = [
        'area', 'majoraxislength', 'minoraxislength', 'eccentricity',
        'convexarea', 'extent', 'perimeter',
    ]

    # Long format: one row per (sample, metric) pair, so that each metric
    # becomes one category on the y axis.
    long_df = df.melt(
        id_vars='class',
        value_vars=metrics,
        var_name='metric',
        value_name='value',
    )

    plt.figure(figsize=(12.8, 9.6))
    sns.boxplot(data=long_df, x='value', y='metric', hue='class')
    plt.show()


create_outlier_boxplot(df_normed)


### Removal

In [None]:
def get_outlier_indices(df: pd.DataFrame, column: str, whisker: float = 1.5) -> pd.Index:
    """Return the index labels of rows whose ``column`` value is an IQR outlier.

    A value counts as an outlier when it lies outside the inclusive interval
    ``[Q1 - whisker * IQR, Q3 + whisker * IQR]``, where Q1 and Q3 are the
    25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.

    Rows with a missing value in ``column`` are reported as outliers too,
    because a missing value never satisfies the "inside the interval" test.

    Args:
        df: Frame to inspect.
        column: Name of the numeric column to test.
        whisker: Multiplier applied to the IQR to build the fences.
            Defaults to 1.5.

    Returns:
        The index labels of the outlier rows, in their original order.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr

    # between() is inclusive on both ends and yields False for NaN, so the
    # negation also flags rows with missing values.
    inside = df[column].between(lower_bound, upper_bound)
    return df.loc[~inside].index


def find_outliers(df: pd.DataFrame) -> set:
    """Collect the index labels flagged as outliers in any numeric column.

    Args:
        df: Frame whose numeric columns are each passed to
            ``get_outlier_indices``.

    Returns:
        A set with the union of the per-column outlier labels; empty when
        the frame has no numeric columns or nothing is flagged.
    """
    numeric_columns = df.select_dtypes(include=['number']).columns
    per_column = (get_outlier_indices(df, name) for name in numeric_columns)
    return set().union(*per_column)

# Detect outliers separately within each class, then pool the flagged row
# labels into a single set.
df0 = df.query('`class` == "Kecimen"')
df1 = df.query('`class` == "Besni"')

outlier_indices = find_outliers(df0).union(find_outliers(df1))
print(f"Total outlier count: {len(outlier_indices)}")

# Drop the flagged rows from the raw data; a sorted list gives drop() a
# deterministic, ordered set of labels.
df_clean: pd.DataFrame = df.drop(index=sorted(outlier_indices))

# Plot the min-max normalised values of the surviving rows. The boxplot puts
# all metrics on one shared axis, so it is drawn from df_normed here exactly
# like the "Initial" plot, which keeps the before/after figures comparable.
create_outlier_boxplot(df_normed.loc[df_clean.index])


## Correlations

In [None]:
# Correlate the float64/int64 columns of the cleaned data, export the table
# as LaTeX and echo it as plain text.
correlation_matrix = df_clean.select_dtypes(include=['float64', 'int64']).corr()
correlation_matrix.to_latex('correlation_matrix.tex')
print(correlation_matrix)

# True strictly above the diagonal: those cells are hidden in the heatmap,
# leaving the diagonal and the lower triangle visible.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, mask=mask, annot=True, fmt=".2f", cmap='coolwarm')
plt.xticks(rotation=45)
plt.show()


## t-SNE

t-SNE projection of the normalized features, with the IQR outliers marked.

In [None]:
def graph_tsne(perplexity) -> None:
    """Embed the normalised features with t-SNE and plot the 2-D result.

    Points are coloured by class and their marker style shows whether the
    row's label is in the module-level ``outlier_indices``.

    Args:
        perplexity: Passed straight to ``TSNE(perplexity=...)``; it is also
            written into the figure title so the plots can be told apart.
    """
    features = df_normed.drop(columns="class")
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    tsne_results = tsne.fit_transform(features)

    # Carry over the source index: the class column and the outlier labels
    # below are matched by index label, which would silently misalign (or
    # raise) with a fresh 0..n-1 index if the data were ever re-indexed.
    df_tsne = pd.DataFrame(tsne_results, columns=["comp1", "comp2"], index=features.index)
    df_tsne["class"] = df_normed["class"]
    df_tsne["type"] = "inlier"
    df_tsne.loc[list(outlier_indices), "type"] = "outlier"

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(x="comp1", y="comp2", hue="class", style="type", data=df_tsne, ax=ax)
    ax.set_title(f"t-SNE (perplexity={perplexity})")
    plt.show()


for perplexity in (10, 30, 50):
    graph_tsne(perplexity)


## UMAP

UMAP projection of the normalized features, with the IQR outliers marked.

In [None]:
def graph_umap(par) -> None:
    """Embed the normalised features with UMAP and plot the 2-D result.

    Points are coloured by class and their marker style shows whether the
    row's label is in the module-level ``outlier_indices``.

    Args:
        par: Passed to ``umap.UMAP(n_neighbors=...)``; it is also written
            into the figure title so the plots can be told apart.
    """
    features = df_normed.drop(columns=['class'])
    reducer = umap.UMAP(n_jobs=1, random_state=42, n_neighbors=par)
    umap_results = reducer.fit_transform(features)

    # Carry over the source index: the class column and the outlier labels
    # below are matched by index label, which would silently misalign (or
    # raise) with a fresh 0..n-1 index if the data were ever re-indexed.
    df_umap = pd.DataFrame(umap_results, columns=['comp1', 'comp2'], index=features.index)
    df_umap['class'] = df_normed['class']
    df_umap["type"] = "inlier"
    df_umap.loc[list(outlier_indices), "type"] = "outlier"

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(x='comp1', y='comp2', hue='class', style='type', data=df_umap, ax=ax)
    ax.set_title(f"UMAP (n_neighbors={par})")
    plt.show()


for n_neighbors in (5, 15, 50):
    graph_umap(n_neighbors)
