In [229]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split

#### Helper functions

In [230]:
def check_equal_ids(df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
    """Tell whether two frames hold exactly the same collection of ``id`` values.

    Row order is ignored: both ``id`` columns are sorted and then compared
    position by position, so a repeated id must occur equally often in both
    frames for the check to pass.

    Args:
        df1: First frame to compare.
        df2: Second frame to compare.

    Returns:
        ``True`` when both frames have the same number of rows and their sorted
        ``id`` columns match element-wise, ``False`` otherwise.

    Raises:
        KeyError: If the frames have equal length and either one lacks an
            ``id`` column.
    """
    if len(df1) != len(df2):
        return False
    ids1 = df1["id"].sort_values().reset_index(drop=True)
    ids2 = df2["id"].sort_values().reset_index(drop=True)
    # Wrap in bool() so callers receive a builtin bool, as annotated, instead
    # of the numpy.bool_ produced by comparing the numpy sum with an int.
    return bool((ids1 == ids2).sum() == len(ids1))

#### Load data from csv

In [231]:
# Read the three semicolon-separated input tables in one pass; the unpacking
# order matches the order of the file names.
features_df, targets_df, text_features_df = (
    pd.read_csv(csv_name, sep=";")
    for csv_name in ("features.csv", "targets.csv", "text_features.csv")
)

#### Check equal ids

Check if all three datasets represent the same observations

In [232]:
# Compare the ids of the base feature table against each companion table;
# the resulting tuple is (text features match, targets match).
tuple(
    check_equal_ids(features_df, companion_df)
    for companion_df in (text_features_df, targets_df)
)

(True, True)

#### Merge data

We can now merge the datasets on the `id` column.

In [233]:
# Join the three tables on `id`. `validate="one_to_one"` makes pandas raise a
# MergeError if an id is repeated on either side of a join, instead of
# silently multiplying rows in the merged frame.
data = (
    features_df
    .merge(text_features_df, on="id", how="inner", validate="one_to_one")
    .merge(targets_df, on="id", how="inner", validate="one_to_one")
)
data

Unnamed: 0,id,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,target
0,b03a97185bcd4e3297fffcbe7360c64d,F,0.495,0.400,1550.0,0.6445,0.2420,0.1325,0.205,SHORT,1.025113,Measurable process selection theorem and non-a...,1
1,84ab6aeda37c4943847d204c120410df,M,0.630,0.515,1550.0,1.2590,0.4105,0.1970,0.410,SHORT,2.006427,,1
2,dcbb537ee552433780fde6dfb1d941c6,F,0.595,0.465,1550.0,1.0260,0.4645,0.1120,0.305,SHORT,2.430176,On a topology property for moduli space of Kap...,0
3,e9cc5c8b303045a783e11df15389cd39,M,0.360,0.295,1000.0,0.2105,0.0660,0.0525,0.075,SHORT,1.451025,Machine Learning on Sequential Data Using a Re...,0
4,bd8ca64a21fd4edeb3db59ce5db871bc,M,0.600,0.510,1850.0,1.2850,0.6095,0.2745,0.315,SHORT,2.701401,Multirole Logic (Extended Abstract),0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4173,9af44122585b423493ba05828fba8ac3,M,0.645,0.485,1550.0,1.4890,0.5915,0.3120,0.380,SHORT,2.969376,,1
4174,5ccd0a6c62c845838205fb184053294f,M,0.650,0.525,1850.0,1.6220,0.6645,0.3225,0.477,TALL,3.770031,Recurrent Deterministic Policy Gradient Method...,0
4175,f3290aaf37de4c74bf330e11cbe6bbd8,F,0.710,0.565,1950.0,1.6500,0.6380,0.3365,0.565,TALL,4.116161,Multilink Communities of Multiplex Networks,1
4176,b3facf6230fe4adf86b56f161be32725,M,0.595,0.455,1500.0,0.8860,0.4315,0.2010,0.223,SHORT,3.177005,A geometric second-order-rectifiable stratific...,0


#### Train/test split

To avoid leaking information from the test data into the training data, which would distort the model evaluation metrics, we split the data into train and test datasets before any exploratory analysis.

In [234]:
# Hold out 20% of the rows as a test split, stratified on the target column,
# with a fixed seed so the split is reproducible.
train_dataset, test_dataset = train_test_split(
    data,
    test_size=0.2,
    random_state=100,
    stratify=data["target"],
)
# Show the (rows, columns) shape of each split.
tuple(split.shape for split in (train_dataset, test_dataset))

((3342, 13), (836, 13))

#### Input features

In [235]:
# Column groups used by the rest of the notebook.
# f_8 is listed as categorical below, so it is skipped in the numeric range.
numeric_features = [f"f_{i}" for i in range(1, 10) if i != 8]
categorical_features = ["f_0", "f_8"]
# The text column in the merged frame is `f_10` (with an underscore), not `f10`.
text_features = ["f_10"]
target = ["target"]

#### Outliers analysis

Before computing correlations and other outlier-sensitive analyses, we identify and fix the most critical outliers in the numerical features. From the boxplots below, we can see that the only feature with outliers beyond `3*IQR` (twice the distance of the inner fences displayed in the boxplots) is `f_3`, so we fix outliers only for this variable by replacing them with the value of the last observation not considered an outlier. The replacement value is calculated on the train dataset and is also applied to the test dataset.

In [236]:
# Boxplots: one panel per numeric feature on a grid that is `cols` wide.
# The row count is derived from the current feature list so the cell keeps
# working when `numeric_features` changes length (e.g. after a feature is
# removed further down and this cell is re-run).
cols = 4
rows = max(1, -(-len(numeric_features) // cols))  # ceiling division
fig = make_subplots(rows=rows, cols=cols)
for idx, feature in enumerate(numeric_features):
    fig.add_trace(go.Box(y=train_dataset[feature], name=feature), row=idx // cols + 1, col=idx % cols + 1)
fig.show()

# Replace outliers for f_3: values above the cap are set to the cap.
# The same cap is applied to both splits.
F3_UPPER_CAP = 2400
# `.assign` builds a new frame instead of writing a column into the frame
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning.
# Missing values stay missing under `clip`.
train_dataset = train_dataset.assign(f_3=train_dataset["f_3"].clip(upper=F3_UPPER_CAP))
test_dataset = test_dataset.assign(f_3=test_dataset["f_3"].clip(upper=F3_UPPER_CAP))


#### Correlation analysis of inputs

We see that all numerical features are strongly positively correlated, with some having nearly perfect correlation.

In [237]:
# Pairwise correlation between the numeric columns of the training split,
# rendered as a colour-graded table.
numeric_train = train_dataset.loc[:, numeric_features]
corr = numeric_train.corr()
corr.style.background_gradient(cmap="coolwarm")

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_9
f_1,1.0,0.98798,0.90143,0.908338,0.897018,0.905446,0.899032,0.650757
f_2,0.98798,1.0,0.909232,0.908021,0.891577,0.902056,0.906983,0.653372
f_3,0.90143,0.909232,1.0,0.867424,0.833317,0.862582,0.887836,0.600238
f_4,0.908338,0.908021,0.867424,1.0,0.94112,0.942675,0.932871,0.627132
f_5,0.897018,0.891577,0.833317,0.94112,1.0,0.930115,0.882316,0.625964
f_6,0.905446,0.902056,0.862582,0.942675,0.930115,1.0,0.907169,0.642268
f_7,0.899032,0.906983,0.887836,0.932871,0.882316,0.907169,1.0,0.617623
f_9,0.650757,0.653372,0.600238,0.627132,0.625964,0.642268,0.617623,1.0


#### Null values analysis

We see that `f_2` and `f_10` are the two features with the most null values (`~10%` of the dataset samples each). Since `f_2` is nearly identical to `f_1` according to the correlation analysis, we can exclude it, which also resolves its null-value problem. Since we don't know more details about the null values in the `f_0` variable and the number of affected observations is not high, we will discard the observations for which `f_0` is null.

In [238]:
# Null count per column of the training split, shown as a single-row table.
display(pd.DataFrame(train_dataset.isna().sum()).T)

# Drop f_2 from the numeric feature list. The membership guard keeps the cell
# re-runnable: an unguarded list.remove raises ValueError on the second run.
if "f_2" in numeric_features:
    numeric_features.remove("f_2")

# Discard the observations whose f_0 is missing, in both splits.
train_dataset = train_dataset[train_dataset["f_0"].notna()]
test_dataset = test_dataset[test_dataset["f_0"].notna()]

Unnamed: 0,id,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,target
0,0,55,1,331,1,1,1,1,1,1,1,386,0


#### Categorical features

The distribution of values in the categorical features is as follows. We can see that the number of observations in the M, F, and I categories is roughly the same (we assume they represent the category of the paper: math, physics, and informatics).

In [239]:
# Frequency of each level for the two categorical columns: the first is shown
# through display(), the second is the cell's own output.
first_categorical = categorical_features[0]
second_categorical = categorical_features[1]
display(train_dataset.loc[:, first_categorical].value_counts())
train_dataset.loc[:, second_categorical].value_counts()

f_0
M    1195
F    1049
I    1043
Name: count, dtype: int64

f_8
SHORT    2970
TALL      317
Name: count, dtype: int64

#### [Classical] Train dataset with non-null features without most critical outliers and most correlated numerical feature

In [240]:
# Keep only the modelling columns in the training split: numeric features,
# then categorical features, then the target.
train_columns = [*numeric_features, *categorical_features, *target]
train_dataset = train_dataset.loc[:, train_columns]
train_dataset

Unnamed: 0,f_1,f_3,f_4,f_5,f_6,f_7,f_9,f_0,f_8,target
1882,0.700,2150.0,1.9780,0.6675,0.3125,0.710,3.190625,F,TALL,1
4019,0.650,1900.0,1.9500,0.6265,0.4005,0.395,3.241601,M,SHORT,1
2784,0.595,1400.0,0.8520,0.4215,0.2255,0.227,2.583401,M,SHORT,0
3078,0.605,1450.0,0.8840,0.3835,0.1905,0.270,2.565161,M,SHORT,0
3374,0.450,1000.0,0.4110,0.1945,0.1000,0.098,1.840000,I,SHORT,0
...,...,...,...,...,...,...,...,...,...,...
684,0.350,1100.0,0.2090,0.0660,0.0590,0.075,1.767405,I,SHORT,0
2114,0.660,2000.0,1.6760,0.6730,0.4805,0.450,3.563521,F,TALL,1
3722,0.295,700.0,0.1365,0.0575,0.0295,0.035,0.887611,I,SHORT,0
1129,0.465,1200.0,0.5805,0.2550,0.0915,0.184,2.366861,I,SHORT,0


In [241]:
# Keep only the modelling columns in the test split: numeric features,
# then categorical features, then the target.
test_columns = [*numeric_features, *categorical_features, *target]
test_dataset = test_dataset.loc[:, test_columns]
test_dataset

Unnamed: 0,f_1,f_3,f_4,f_5,f_6,f_7,f_9,f_0,f_8,target
2534,0.615,1550.0,1.1375,0.3670,0.2360,0.3700,3.353480,M,SHORT,1
3391,0.350,900.0,0.1980,0.0725,0.0560,0.0600,1.059408,M,SHORT,0
4103,0.555,1450.0,0.8500,0.4165,0.1685,0.2300,1.166784,M,SHORT,0
2185,0.575,1350.0,0.8570,0.4610,0.1470,0.2125,2.983045,F,SHORT,0
3802,0.595,1800.0,1.0530,0.4405,0.1920,0.3900,1.263728,F,SHORT,1
...,...,...,...,...,...,...,...,...,...,...
3489,0.485,1000.0,0.5130,0.2190,0.1075,0.1300,2.482781,I,SHORT,0
1593,0.485,1200.0,0.5155,0.2465,0.1025,0.1470,0.991012,I,SHORT,0
2946,0.270,700.0,0.1060,0.0465,0.0180,0.0360,0.540648,M,SHORT,0
2575,0.370,950.0,0.2655,0.1220,0.0520,0.0800,1.863520,I,SHORT,0
