In [88]:
import pandas as pd
import numpy as np
import plotly.express as pe
from sklearn.ensemble import IsolationForest


# Loading dataset

In [89]:
# Load the training and test sets from the data directory.
# `data_dir_path` already ends with a path separator, so the file names are
# appended without a leading "/" (previously this built "../src/data//train.csv").
data_dir_path = '../src/data/'
data_trainset = pd.read_csv(data_dir_path + "train.csv")
data_testset = pd.read_csv(data_dir_path + "test.csv")

# Dataset Analysis

In [90]:
# Total number of missing (NaN) cells across the whole training set
missing_per_column = data_trainset.isna().sum()
missing_values = missing_per_column.sum()
print("Missing values in the dataset : ", missing_values)

Missing values in the dataset :  0


In [91]:
# Number of rows that exactly repeat an earlier row of the training set
duplicate_row_mask = data_trainset.duplicated()
duplicated_values = duplicate_row_mask.sum()
print("Duplicates values in the dataset : ", duplicated_values)

Duplicates values in the dataset :  0


In [92]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns; the bare name on the last line renders the table as the cell output.
statistics_data = data_trainset.describe()
statistics_data

Unnamed: 0,id,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
count,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,...,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0
mean,799.59596,0.017412,0.028539,0.031988,0.02328,0.014264,0.038579,0.019202,0.001083,0.007167,...,0.036501,0.005024,0.015944,0.011586,0.016108,0.014017,0.002688,0.020291,0.008989,0.01942
std,452.477568,0.019739,0.038855,0.025847,0.028411,0.01839,0.05203,0.017511,0.002743,0.008933,...,0.063403,0.019321,0.023214,0.02504,0.015335,0.060151,0.011415,0.03904,0.013791,0.022768
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,415.25,0.001953,0.001953,0.013672,0.005859,0.001953,0.0,0.005859,0.0,0.001953,...,0.0,0.0,0.000977,0.0,0.004883,0.0,0.0,0.0,0.0,0.000977
50%,802.5,0.009766,0.011719,0.025391,0.013672,0.007812,0.015625,0.015625,0.0,0.005859,...,0.004883,0.0,0.005859,0.000977,0.012695,0.0,0.0,0.003906,0.00293,0.011719
75%,1195.5,0.025391,0.041016,0.044922,0.029297,0.017578,0.056153,0.029297,0.0,0.007812,...,0.043701,0.0,0.022217,0.009766,0.021484,0.0,0.0,0.023438,0.012695,0.029297
max,1584.0,0.087891,0.20508,0.15625,0.16992,0.11133,0.31055,0.091797,0.03125,0.076172,...,0.42969,0.20215,0.17285,0.2002,0.10645,0.57813,0.15137,0.37598,0.086914,0.1416


The dataset appears to contain a number of outliers, with some maximum values significantly diverging from the mean. Let's proceed to investigate this using histograms and boxplots.

In [93]:
# Histograms of the first three margin features, drawn on a shared figure
histogram_columns = ['margin1', 'margin2', 'margin3']
fig = pe.histogram(data_trainset, x=histogram_columns)
fig.show()

In the histograms, we observe that certain values are indeed significantly distant from the cluster of observed values for the different features.

In [94]:
# Box plots of the first three margin features, drawn on a shared figure
boxplot_columns = ['margin1', 'margin2', 'margin3']
fig = pe.box(data_trainset, x=boxplot_columns)
fig.show()

The boxplots show the same pattern: certain values are indeed significantly distant from the cluster of observed values for the different features.

Our initial hypothesis is confirmed: the histograms and boxplots reveal the presence of outliers within our dataset. We'll now move forward with replacing these outliers with better values; as a first step, they are marked as missing values (NaN).


# Replacing outliers

In [95]:
# Drop the "id" column (presumably a row identifier rather than a feature).
# `errors="ignore"` makes the cell safe to re-run: because it reassigns
# `data_trainset`, a second execution would otherwise raise KeyError since
# the column is already gone. `columns=` alone selects the axis, so the
# redundant `axis=1` has been removed.
data_trainset = data_trainset.drop(columns="id", errors="ignore")

In [96]:
# IsolationForest settings, exposed as named constants so they can be tuned.
# "auto" is scikit-learn's default contamination and keeps the previous
# behaviour. NOTE(review): with "auto" the recorded output shows between
# roughly 1% and 60% of each column being flagged, which is a lot to call
# "outliers" -- consider an explicit fraction in (0, 0.5], e.g. 0.05.
ISOLATION_FOREST_CONTAMINATION = "auto"
ISOLATION_FOREST_SEED = 42

# Maps column name -> percentage of its rows flagged as outliers.
outliers_percentage = {}

# NOTE: this cell overwrites values of `data_trainset` in place (flagged
# entries become NaN). Run it once on freshly loaded data; re-running it
# would feed the NaN values back into IsolationForest.
#
# `select_dtypes(include="number")` covers every numeric dtype, not only the
# exact 'int64'/'float64' pair, and skips non-numeric columns.
numeric_columns = data_trainset.select_dtypes(include="number").columns

for column in numeric_columns:
    # Each feature is scored on its own; IsolationForest expects a 2-D
    # (n_samples, n_features) array, hence the reshape to a single column.
    column_values = data_trainset[column].to_numpy().reshape(-1, 1)

    clf = IsolationForest(
        contamination=ISOLATION_FOREST_CONTAMINATION,
        random_state=ISOLATION_FOREST_SEED,
    )
    # fit_predict labels outliers as -1 and inliers as 1.
    preds = clf.fit_predict(column_values)
    is_outlier = preds == -1

    # Replace the flagged entries with NaN so they can be handled later.
    data_trainset.loc[is_outlier, column] = np.nan

    # Share of rows flagged in this column, as a percentage.
    n_outliers = np.sum(is_outlier)
    outliers_percentage[column] = (n_outliers / len(column_values)) * 100

# Print the percentage of outliers for each numeric column
for column, percentage in outliers_percentage.items():
    print(f"{column}: {percentage:.2f}% outliers")


margin1: 36.26% outliers
margin2: 35.05% outliers
margin3: 33.13% outliers
margin4: 24.34% outliers
margin5: 22.63% outliers
margin6: 37.88% outliers
margin7: 40.61% outliers
margin8: 18.69% outliers
margin9: 24.65% outliers
margin10: 47.58% outliers
margin11: 40.51% outliers
margin12: 55.25% outliers
margin13: 24.24% outliers
margin14: 33.84% outliers
margin15: 58.79% outliers
margin16: 1.31% outliers
margin17: 30.40% outliers
margin18: 17.17% outliers
margin19: 15.35% outliers
margin20: 30.30% outliers
margin21: 46.57% outliers
margin22: 31.62% outliers
margin23: 8.18% outliers
margin24: 32.53% outliers
margin25: 40.10% outliers
margin26: 31.01% outliers
margin27: 33.13% outliers
margin28: 32.32% outliers
margin29: 40.91% outliers
margin30: 23.03% outliers
margin31: 32.22% outliers
margin32: 31.31% outliers
margin33: 52.93% outliers
margin34: 30.61% outliers
margin35: 36.36% outliers
margin36: 39.19% outliers
margin37: 44.34% outliers
margin38: 35.56% outliers
margin39: 59.90% outlie

In [98]:
# Minimum share of non-missing values a column must have in order to be kept.
MIN_VALID_FRACTION = 0.5

# `dropna(thresh=...)` keeps a column only when it has at least `thresh`
# non-NaN values, so this drops columns where more than 50% of the values are
# NaN. pandas documents `thresh` as an int, so the row count is rounded up
# (same cut-off as comparing against the fractional value).
threshold = int(np.ceil(len(data_trainset) * MIN_VALID_FRACTION))

data_trainset_cleaned = data_trainset.dropna(axis='columns', thresh=threshold)

# Save next to the input files, reusing the data directory defined in the
# loading cell instead of repeating the path (resolves to the same file).
data_trainset_cleaned.to_csv(data_dir_path + 'data_trainset_cleaned.csv', index=False)

In [82]:
# Summary statistics of the cleaned training set; `count` now differs per
# column because describe() only counts non-NaN values.
# Note: this reuses (overwrites) the `statistics_data` name from the earlier cell.
statistics_data = data_trainset_cleaned.describe()
statistics_data

Unnamed: 0,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,margin10,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
count,631.0,643.0,662.0,749.0,766.0,615.0,588.0,805.0,746.0,519.0,...,610.0,779.0,670.0,744.0,706.0,789.0,886.0,721.0,518.0,621.0
mean,0.005739,0.006525,0.02357,0.013661,0.00616,0.007562,0.010908,0.0,0.004513,0.01608,...,0.002318,0.0,0.00393,0.001592,0.012092,3.2e-05,0.0,0.003585,0.00119,0.009187
std,0.006283,0.007889,0.011235,0.009461,0.005769,0.010334,0.007615,0.0,0.003582,0.00656,...,0.004819,0.0,0.004988,0.002528,0.007573,0.000175,0.0,0.00545,0.002507,0.011594
min,0.0,0.0,0.007812,0.001953,0.0,0.0,0.0,0.0,0.0,0.005859,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.013672,0.005859,0.0,0.0,0.005859,0.0,0.0,0.011719,...,0.0,0.0,0.0,0.0,0.006836,0.0,0.0,0.0,0.0,0.0
50%,0.003906,0.003906,0.021484,0.011719,0.003906,0.003906,0.011719,0.0,0.005859,0.015625,...,0.0,0.0,0.001953,0.0,0.011719,0.0,0.0,0.000977,0.0,0.00293
75%,0.011719,0.009766,0.03125,0.019531,0.009766,0.011719,0.017578,0.0,0.005859,0.021484,...,0.00293,0.0,0.005859,0.002197,0.018555,0.0,0.0,0.004883,0.0,0.015625
max,0.019531,0.03125,0.050781,0.035156,0.019531,0.042969,0.023438,0.0,0.011719,0.027344,...,0.032227,0.0,0.023438,0.009766,0.026367,0.000977,0.0,0.022461,0.008789,0.041992
