In [37]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

#### Helper functions

In [38]:
def check_equal_ids(df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
    """Return whether two frames contain the same ``id`` values, ignoring row order.

    Both ``id`` columns are sorted and compared position by position, so
    repeated ids must occur the same number of times in each frame.

    Args:
        df1: First frame; must have an ``id`` column.
        df2: Second frame; must have an ``id`` column.

    Returns:
        ``True`` when both frames have the same number of rows and their sorted
        ``id`` columns match element-wise, ``False`` otherwise. NaN ids never
        compare equal to each other.

    Raises:
        KeyError: If either frame has no ``id`` column.
    """
    if len(df1) != len(df2):
        return False
    ids1 = df1["id"].sort_values().reset_index(drop=True)
    ids2 = df2["id"].sort_values().reset_index(drop=True)
    # Cast to a plain Python bool: the pandas reduction yields numpy.bool_,
    # which contradicts the annotation and displays as `np.True_` on NumPy >= 2.
    return bool((ids1 == ids2).all())

#### Load data from csv

In [39]:
# Read the three semicolon-separated source tables in one pass; the tuple
# unpacking binds them in the same order as the file names.
features_df, targets_df, text_features_df = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in ("features.csv", "targets.csv", "text_features.csv")
)

#### Check equal ids

Check if all three datasets represent the same observations

In [40]:
# Compare the ids of `features_df` against each of the other two tables;
# the cell displays one boolean per comparison.
tuple(check_equal_ids(features_df, other_df) for other_df in (text_features_df, targets_df))

(True, True)

#### Merge data

We can now merge the datasets on the `id` column

In [41]:
# Join the three tables on `id`. The inner join keeps only ids present in all
# of them; `validate="one_to_one"` makes pandas raise MergeError if an id is
# duplicated in any table, instead of silently multiplying rows.
data = (
    features_df
    .merge(targets_df, on="id", how="inner", validate="one_to_one")
    .merge(text_features_df, on="id", how="inner", validate="one_to_one")
)
data

Unnamed: 0,id,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,target,f_10
0,b03a97185bcd4e3297fffcbe7360c64d,F,0.495,0.400,1550.0,0.6445,0.2420,0.1325,0.205,SHORT,1.025113,1,Measurable process selection theorem and non-a...
1,84ab6aeda37c4943847d204c120410df,M,0.630,0.515,1550.0,1.2590,0.4105,0.1970,0.410,SHORT,2.006427,1,
2,dcbb537ee552433780fde6dfb1d941c6,F,0.595,0.465,1550.0,1.0260,0.4645,0.1120,0.305,SHORT,2.430176,0,On a topology property for moduli space of Kap...
3,e9cc5c8b303045a783e11df15389cd39,M,0.360,0.295,1000.0,0.2105,0.0660,0.0525,0.075,SHORT,1.451025,0,Machine Learning on Sequential Data Using a Re...
4,bd8ca64a21fd4edeb3db59ce5db871bc,M,0.600,0.510,1850.0,1.2850,0.6095,0.2745,0.315,SHORT,2.701401,0,Multirole Logic (Extended Abstract)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4173,9af44122585b423493ba05828fba8ac3,M,0.645,0.485,1550.0,1.4890,0.5915,0.3120,0.380,SHORT,2.969376,1,
4174,5ccd0a6c62c845838205fb184053294f,M,0.650,0.525,1850.0,1.6220,0.6645,0.3225,0.477,TALL,3.770031,0,Recurrent Deterministic Policy Gradient Method...
4175,f3290aaf37de4c74bf330e11cbe6bbd8,F,0.710,0.565,1950.0,1.6500,0.6380,0.3365,0.565,TALL,4.116161,1,Multilink Communities of Multiplex Networks
4176,b3facf6230fe4adf86b56f161be32725,M,0.595,0.455,1500.0,0.8860,0.4315,0.2010,0.223,SHORT,3.177005,0,A geometric second-order-rectifiable stratific...


#### Input features

In [42]:
# Column groups of the merged `data` frame, by feature type.
# f_8 is listed as categorical, so it is skipped in the numeric range f_1..f_9.
numeric_features = [f"f_{i}" for i in range(1, 10) if i != 8]
categorical_features = ["f_0", "f_8"]
# The text column is named "f_10" (with an underscore), like every other feature.
text_features = ["f_10"]

#### Outliers analysis

Before computing correlations and other outlier-sensitive analyses, we identify and remove the most critical outliers from the numerical features. From the boxplots below, we can see that the only feature with values more than `3*IQR` beyond the quartiles (twice the distance of the inner fences displayed in the boxplots) is `f_3`, so we only remove outliers for this variable.

In [43]:
# Boxplots: one subplot per numeric feature, laid out on a grid with 4 columns.
# The row count is derived from the list length so the cell keeps working if
# `numeric_features` grows or shrinks (a fixed 2x4 grid indexed past the end
# of the list once a feature had been removed).
cols = 4
rows = max(1, -(-len(numeric_features) // cols))  # ceiling division
fig = make_subplots(rows=rows, cols=cols)
for idx, feature in enumerate(numeric_features):
    # Fill the grid row by row; plotly subplot coordinates are 1-based.
    fig.add_trace(go.Box(y=data[feature], name=feature), row=idx // cols + 1, col=idx % cols + 1)
fig.show()

# Filter outliers for f_3. The 3000 cut-off is read off the boxplot above.
# NOTE: rows where f_3 is NaN fail the comparison and are dropped as well.
data = data[data["f_3"] < 3000]

#### Correlation analysis

We see that all numerical features are positively correlated, with some having nearly perfect correlation.

In [44]:
# Pairwise Pearson correlation between the numeric columns, shown as a
# colour-graded table (the Styler is the cell's displayed value).
corr = data.loc[:, numeric_features].corr(method="pearson")
corr.style.background_gradient(cmap="coolwarm")

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_9
f_1,1.0,0.987128,0.899834,0.907389,0.888886,0.903091,0.897648,0.646238
f_2,0.987128,1.0,0.907436,0.907612,0.887198,0.900211,0.905408,0.6463
f_3,0.899834,0.907436,1.0,0.870132,0.828963,0.86571,0.88898,0.594742
f_4,0.907389,0.907612,0.870132,1.0,0.937176,0.941768,0.933583,0.620254
f_5,0.888886,0.887198,0.828963,0.937176,1.0,0.925323,0.87592,0.610852
f_6,0.903091,0.900211,0.86571,0.941768,0.925323,1.0,0.907603,0.632607
f_7,0.897648,0.905408,0.88898,0.933583,0.87592,0.907603,1.0,0.614271
f_9,0.646238,0.6463,0.594742,0.620254,0.610852,0.632607,0.614271,1.0


#### Null values analysis

We see that `f_2` and `f_10` are the two features with the most null values (`~10%` of the dataset samples). Since `f_2` is nearly identical to `f_1` (correlation ≈ 0.99), we can exclude it, which also resolves its null-value problem.

In [45]:
# Per-column count of missing values, shown as a single wide row.
display(pd.DataFrame(data.isna().sum()).T)
# Exclude f_2 by rebuilding the list instead of calling `.remove("f_2")`:
# `.remove` raises ValueError when this cell is run a second time.
numeric_features = [name for name in numeric_features if name != "f_2"]
# Show the remaining numeric features as the cell's output.
numeric_features

Unnamed: 0,id,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,target,f_10
0,0,67,0,412,0,0,0,0,0,0,0,0,467


['f_1', 'f_3', 'f_4', 'f_5', 'f_6', 'f_7', 'f_9']