In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import umap
from sklearn.manifold import TSNE

# Apply seaborn's default theme to the figures drawn in later cells.
sns.set_theme()

# Read the dataset and lower-case every column label so later cells can rely
# on a single spelling; the resulting labels are displayed as the cell output.
df = pd.read_csv("./dataset.csv").rename(columns=str.lower)
df.columns


In [None]:
# Larger seaborn font scale and a square canvas for the class-balance pie.
sns.set(font_scale=1.5)
plt.figure(figsize=(10, 10))

# Share of each class as a percentage of all rows, plus "<class> (<pct>%)" labels.
class_counts = df['class'].value_counts(normalize=True) * 100
class_labels = [f"{label} ({value:.1f}%)" for label, value in class_counts.items()]

# labeldistance=0.5 places each label halfway along the wedge radius.
patches, texts = plt.pie(
    class_counts,
    labels=class_labels,
    labeldistance=0.5,
    startangle=90,
    colors=sns.color_palette('pastel'),
)

# Make the wedge labels bold, centred and large.
for text in texts:
    text.set(fontweight='bold', horizontalalignment='center', fontsize=20)

# Equal axis scaling keeps the pie circular.
plt.axis("equal")
plt.show()


In [None]:
# Only the last expression of a cell is rendered automatically, so the frame is
# shown explicitly (IPython's built-in display) before the summary is printed.
display(df)
df.info()


In [None]:
# Number of missing values in each column.
df.isna().sum()


In [None]:
# Summary statistics for every column except the last one (presumably the
# class label - confirm against the dataset); the 'count' row is dropped and
# the remaining values are rounded to three decimals.
dfn = df.iloc[:, :-1]
dfn.describe().drop(index=['count']).round(3)


In [None]:
# Global matplotlib font size for the histograms below.
plt.rcParams['font.size'] = 20

# One histogram with a KDE overlay per column (all but the last), split by class.
for column in df.columns[:-1]:
    plt.figure(figsize=(8, 4))
    sns.histplot(x=column, hue='class', data=df, kde=True)

    # labelpad pushes the axis labels away from the tick labels.
    plt.xlabel(column, labelpad=10)
    plt.ylabel('Count', labelpad=10)

    plt.show()


## Normalize

In [None]:
# Min-max scale every float64/int64 column to the [0, 1] range; all other
# columns are copied over unchanged.
numcols = df.select_dtypes(include=['float64', 'int64']).columns
df_normed = df.copy()

# Column minima and ranges are computed once and reused.
col_min = df[numcols].min()
col_range = df[numcols].max() - col_min
# A constant column has a zero range; dividing by it would turn the whole
# column into NaN. Dividing by 1 instead maps such a column to 0.
col_range = col_range.where(col_range != 0, 1)

df_normed[numcols] = (df[numcols] - col_min) / col_range


In [None]:
# Same summary as for the raw data, now on the normalized frame: every column
# except the last, without the 'count' row, rounded to three decimals.
dfn = df_normed.iloc[:, :-1]
dfn.describe().drop(index=['count']).round(3)


## Outliers

In [None]:
import pandas as pd

def get_outlier_counts(df: pd.DataFrame, column: str) -> dict:
    """Count mild and extreme outliers of ``column`` using IQR fences.

    A value is *extreme* when it lies outside the outer fences
    ``[Q1 - 3.0 * IQR, Q3 + 3.0 * IQR]`` and *mild* when it lies outside the
    inner fences ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` but still within the
    outer ones. Missing values (NaN) are counted in neither category.

    Args:
        df: Frame holding the data.
        column: Name of the column to inspect.

    Returns:
        ``{"mild": <int>, "extreme": <int>}``; both counts are 0 when
        ``column`` is not present in ``df``.
    """
    if column not in df.columns:
        return {"mild": 0, "extreme": 0}

    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1  # interquartile range (labelled "H" in the original comment)

    # Inner fences
    inner_lower_bound = Q1 - 1.5 * IQR
    inner_upper_bound = Q3 + 1.5 * IQR
    # Outer fences
    outer_lower_bound = Q1 - 3.0 * IQR
    outer_upper_bound = Q3 + 3.0 * IQR

    # Explicit "<" / ">" tests are used instead of negating an "inside the
    # fences" test: every comparison with NaN is False, so the negated form
    # would report missing values as extreme outliers.
    is_extreme = (values < outer_lower_bound) | (values > outer_upper_bound)

    # Mild outliers: outside the inner fences but not beyond the outer ones.
    is_outside_inner = (values < inner_lower_bound) | (values > inner_upper_bound)
    is_mild = is_outside_inner & ~is_extreme

    return {"mild": int(is_mild.sum()), "extreme": int(is_extreme.sum())}

def find_outliers(df: pd.DataFrame) -> dict:
    """Map each numeric column of ``df`` to its outlier counts.

    Returns a dict keyed by column name whose values are the dicts produced by
    ``get_outlier_counts`` for that column.
    """
    numeric_columns = df.select_dtypes(include=['number']).columns
    return {name: get_outlier_counts(df, name) for name in numeric_columns}

# Outlier counts on the raw (un-normalized) data; despite its name the result
# is a plain dict keyed by column.
outlier_counts_df = find_outliers(df)

# One report line per numeric column.
print("Outlier counts:")
for col in outlier_counts_df:
    counts = outlier_counts_df[col]
    print(f"{col} - Mild: {counts['mild']}, Extreme: {counts['extreme']}")


### Initial

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def create_outlier_boxplot(df):
    """Draw horizontal box plots of seven feature columns, split by class.

    ``df`` must contain a 'class' column and the feature columns listed in the
    body. The frame is reshaped to long form so that all features share one
    axes. The figure is shown; nothing is returned. Note that the seaborn
    style is changed globally as a side effect.
    """
    feature_columns = [
        'area', 'majoraxislength', 'minoraxislength', 'eccentricity',
        'convexarea', 'extent', 'perimeter',
    ]

    # Long format: one row per (observation, metric) pair.
    long_df = df.melt(id_vars='class', value_vars=feature_columns,
                      var_name='metric', value_name='value')

    plt.figure(figsize=(17.8, 20.6))

    # "whitegrid" style with the grid lines switched off.
    sns.set_style("whitegrid", {'axes.grid': False})

    # fliersize sets the marker size of the points drawn beyond the whiskers.
    ax = sns.boxplot(x='value', y='metric', hue='class', data=long_df, fliersize=12)

    # One font size for tick labels, axis labels and the legend.
    label_size = 30
    ax.tick_params(labelsize=label_size)
    ax.set_xlabel('Value', fontsize=label_size, labelpad=15)
    ax.set_ylabel('Metric', fontsize=label_size, labelpad=15)

    # Swap seaborn's default legend for a frameless one anchored at the right
    # edge of the axes; the handles are captured before the old legend goes.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend_.remove()
    ax.legend(handles=handles, labels=labels, loc='center left',
              bbox_to_anchor=(0.99, 0.5),
              fontsize=label_size, frameon=False)

    # Fit all elements inside the figure boundaries before showing it.
    plt.tight_layout()
    plt.show()

# Box plots of the min-max normalized data, before any outlier removal.
create_outlier_boxplot(df_normed)


### Removal

In [None]:
def get_outlier_indices(df: pd.DataFrame, column: str) -> pd.Index:
    """Return the index labels of rows that are extreme outliers in ``column``.

    A value is an extreme outlier when it lies outside the outer fences
    ``[Q1 - 3.0 * IQR, Q3 + 3.0 * IQR]``. Missing values (NaN) are not
    reported as outliers.

    Args:
        df: Frame holding the data.
        column: Name of the column to inspect.

    Returns:
        The labels (from ``df.index``) of the outlying rows.
    """
    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 3.0 * IQR
    upper_bound = Q3 + 3.0 * IQR

    # Explicit "<" / ">" tests are used instead of negating an "inside the
    # fences" test: every comparison with NaN is False, so the negated form
    # would flag rows with a missing value as outliers.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return df.loc[is_outlier].index


def find_outliers(df: pd.DataFrame) -> set:
    """Collect the index labels flagged by ``get_outlier_indices`` in any numeric column.

    Note: an earlier cell defines a function of the same name that returns
    per-column counts; this definition replaces it from here on.
    """
    flagged = set()
    numeric_columns = df.select_dtypes(include=['number']).columns
    for name in numeric_columns:
        flagged |= set(get_outlier_indices(df, name))
    return flagged

# Outliers are detected separately within each class, so the quartiles of one
# class do not decide what counts as extreme in the other.
df0 = df_normed.query('`class` == "Kecimen"')
df1 = df_normed.query('`class` == "Besni"')

outlier_indices = find_outliers(df0).union(find_outliers(df1))
print(f"Total outlier count: {len(outlier_indices)}")
# The label set is converted to a list before being handed to drop(), the same
# way the later embedding cells convert it for label-based selection.
df_clean: pd.DataFrame = df_normed.drop(list(outlier_indices))

create_outlier_boxplot(df_clean)

# Only the last expression of a cell is rendered automatically, so the cleaned
# frame is shown explicitly (IPython's built-in display).
display(df_clean)
df_clean.info()

# Summary statistics of the cleaned data: every column except the last,
# without the 'count' row.
dfn = df_clean.iloc[:, :-1]
dfn.describe().drop(['count'], axis=0).round(3)


## Correlations

In [None]:
# Pairwise correlations between the float64/int64 columns of the cleaned data;
# the table is written to a LaTeX file and printed.
correlation_matrix = df_clean.select_dtypes(include=['float64', 'int64']).corr()
correlation_matrix.to_latex('correlation_matrix.tex')
print(correlation_matrix)

sns.set_style("white")

# Hide everything strictly above the diagonal so each pair appears only once.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, mask=mask, annot=True, fmt=".2f", cmap='coolwarm')
plt.xticks(rotation=45)
plt.show()


In [None]:
import numpy as np
import pandas as pd
from scipy import stats

# Show floats with three decimals in every pandas display from here on
# (this is a global option, not limited to this cell).
pd.set_option('display.float_format', '{:.3f}'.format)

# Numeric (float64/int64) columns of the cleaned data.
numeric_df = df_clean.select_dtypes(include=['float64', 'int64'])

# Square frame of zeros, labelled by column name on both axes, that receives
# the p-values.
n_features = numeric_df.shape[1]
p_matrix = pd.DataFrame(
    np.zeros(n_features * n_features).reshape(n_features, -1),
    index=numeric_df.columns,
    columns=numeric_df.columns,
)

# p-value of the Pearson correlation for every ordered pair of columns
# (including each column with itself).
for col1 in numeric_df.columns:
    first = numeric_df[col1]
    for col2 in numeric_df.columns:
        r, p = stats.pearsonr(first, numeric_df[col2])
        p_matrix.loc[col1, col2] = p

print(p_matrix)


## t-SNE

t-SNE embedding of the normalized data, with the detected outliers marked.

In [None]:
def graph_tsne(perplexity) -> None:
    """Plot a 2-D t-SNE embedding of the normalized features.

    Points are coloured by class and styled by whether their row was flagged
    as an outlier. Reads the notebook-level ``df_normed``, ``df`` and
    ``outlier_indices``.

    Args:
        perplexity: Value passed to ``TSNE(perplexity=...)``.
    """
    features = df_normed.drop(columns="class")
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    tsne_results = tsne.fit_transform(features)

    # The embedding keeps the index of the source frame. The class column and
    # the outlier labels are both matched by index label below, which would
    # silently misalign (or raise) against a fresh 0..n-1 index whenever the
    # data does not carry a default index.
    df_tsne = pd.DataFrame(tsne_results, columns=["comp1", "comp2"], index=features.index)
    df_tsne["class"] = df["class"]
    df_tsne["type"] = "inlier"
    df_tsne.loc[list(outlier_indices), "type"] = "outlier"

    plt.figure(figsize=(10, 8))
    sns.scatterplot(x="comp1", y="comp2", hue="class", style="type", data=df_tsne)
    plt.show()

# Draw the embedding at three perplexity settings for comparison.
for perplexity in (10, 30, 50):
    graph_tsne(perplexity)


## UMAP

UMAP embedding of the normalized data, with the detected outliers marked.

In [None]:
def graph_umap(par) -> None:
    """Plot a 2-D UMAP embedding of the normalized features.

    Points are coloured by class and styled by whether their row was flagged
    as an outlier. Reads the notebook-level ``df_normed``, ``df`` and
    ``outlier_indices``.

    Args:
        par: Value passed to ``umap.UMAP(n_neighbors=...)``.
    """
    features = df_normed.drop(columns=['class'])
    reducer = umap.UMAP(n_jobs=1, random_state=42, n_neighbors=par)
    umap_results = reducer.fit_transform(features)

    # The embedding keeps the index of the source frame. The class column and
    # the outlier labels are both matched by index label below, which would
    # silently misalign (or raise) against a fresh 0..n-1 index whenever the
    # data does not carry a default index.
    df_umap = pd.DataFrame(umap_results, columns=['comp1', 'comp2'], index=features.index)
    df_umap['class'] = df['class']
    df_umap["type"] = "inlier"
    df_umap.loc[list(outlier_indices), "type"] = "outlier"

    plt.figure(figsize=(10, 8))
    sns.scatterplot(x='comp1', y='comp2', hue='class', style='type', data=df_umap)
    plt.show()
    
# Draw the embedding at three neighbourhood sizes for comparison.
for n_neighbors in (5, 15, 50):
    graph_umap(n_neighbors)
