In [15]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [16]:
# Base directory for the CSV files read in this notebook. The path is relative,
# so it resolves against the kernel's current working directory.
data = Path("../data")

In [55]:
# Load the new-thyroid CSV and print its column / dtype summary.
file_path = data / "original" / "new-thyroid.csv"
df = pd.read_csv(file_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   f       215 non-null    int64  
 1   f.1     215 non-null    float64
 2   f.2     215 non-null    float64
 3   f.3     215 non-null    float64
 4   f.4     215 non-null    float64
 5   class   215 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 10.2 KB


In [56]:
# Number of columns that are neither numeric nor boolean, i.e. the ones treated
# as categorical here. Uses the public select_dtypes API instead of the private
# DataFrame._get_numeric_data helper.
df.select_dtypes(exclude=["number", "bool"]).shape[1]

0

In [62]:
# Number of rows (instances) in the loaded frame.
df.shape[0]

215

In [57]:
# Class-balance ratio: size of the smallest class divided by the size of the
# largest one (1.0 means perfectly balanced).
df["class"].value_counts().pipe(lambda counts: counts.min() / counts.max())

0.2

In [67]:
# For each dataset, extract:
#   - the number of categorical variables
#   - the class distribution
#   - the number of variables

def get_info_df(dataset, performs_well, data_dir=None):
    """Summarise the CSV stored at ``<data_dir>/original/<dataset>.csv``.

    Parameters
    ----------
    dataset : str
        File stem of the CSV to load; also echoed back under ``"dataset"``.
    performs_well : object
        Value stored unchanged under ``"performs_better"``.
    data_dir : str or path-like, optional
        Directory that contains the ``original`` folder. When omitted, the
        notebook-level ``data`` path is used.

    Returns
    -------
    dict
        ``dataset``, ``n_instances`` (row count), ``n_features`` (columns
        other than ``class``), ``n_classes`` (distinct ``class`` values),
        ``class_distribution`` (smallest class count divided by largest class
        count), ``n_categorical_features`` (feature columns that are neither
        numeric nor boolean) and ``performs_better``.

    Raises
    ------
    KeyError
        If the CSV has no ``class`` column.
    """
    base_dir = data if data_dir is None else Path(data_dir)
    df = pd.read_csv(base_dir / f"original/{dataset}.csv")
    features = df.drop(columns=["class"])

    # Count each class once and reuse the result for both ends of the ratio.
    class_counts = df["class"].value_counts()
    class_distribution = class_counts.min() / class_counts.max()

    # Public replacement for the private DataFrame._get_numeric_data():
    # everything that is not numeric or boolean counts as categorical.
    categorical = features.select_dtypes(exclude=["number", "bool"])

    return {
        "dataset": dataset,
        "n_instances": len(df),
        "n_features": features.shape[1],
        "n_classes": df["class"].nunique(),
        "class_distribution": class_distribution,
        "n_categorical_features": categorical.shape[1],
        "performs_better": performs_well,
    }

In [68]:
# Those that are incredibly better than the others
best_datasets = ["diabetes", "german", "twonorm", "threenorm", "ringnorm", "waveform"]
mid_datasets = ["heart", "ionosphere", "australian", "magic04"]
worst_datasets = ["wine", "new-thyroid", "wdbc", "tic-tac-toe", "segment"]

# Each group is paired with the `performs_well` value handed to get_info_df;
# records are appended in best, mid, worst order.
score_by_group = ((2, best_datasets), (1, mid_datasets), (0, worst_datasets))

list_of_dicts = []
for score, group in score_by_group:
    for dataset in group:
        list_of_dicts.append(get_info_df(dataset, performs_well=score))

In [69]:
# Display the raw per-dataset records returned by get_info_df.
list_of_dicts

[{'dataset': 'diabetes',
  'n_instances': 768,
  'n_features': 8,
  'n_classes': 2,
  'class_distribution': 0.536,
  'n_categorical_features': 0,
  'performs_better': 2},
 {'dataset': 'german',
  'n_instances': 1000,
  'n_features': 24,
  'n_classes': 2,
  'class_distribution': 0.42857142857142855,
  'n_categorical_features': 0,
  'performs_better': 2},
 {'dataset': 'twonorm',
  'n_instances': 5000,
  'n_features': 21,
  'n_classes': 2,
  'class_distribution': 1.0,
  'n_categorical_features': 0,
  'performs_better': 2},
 {'dataset': 'threenorm',
  'n_instances': 5000,
  'n_features': 21,
  'n_classes': 2,
  'class_distribution': 1.0,
  'n_categorical_features': 0,
  'performs_better': 2},
 {'dataset': 'ringnorm',
  'n_instances': 5000,
  'n_features': 20,
  'n_classes': 2,
  'class_distribution': 0.9833399444664815,
  'n_categorical_features': 0,
  'performs_better': 2},
 {'dataset': 'waveform',
  'n_instances': 5000,
  'n_features': 21,
  'n_classes': 3,
  'class_distribution': 0.9711

In [71]:
# Tabulate the records: one row per dataset, one column per dictionary key.
pd.DataFrame(list_of_dicts)

Unnamed: 0,dataset,n_instances,n_features,n_classes,class_distribution,n_categorical_features,performs_better
0,diabetes,768,8,2,0.536,0,2
1,german,1000,24,2,0.428571,0,2
2,twonorm,5000,21,2,1.0,0,2
3,threenorm,5000,21,2,1.0,0,2
4,ringnorm,5000,20,2,0.98334,0,2
5,waveform,5000,21,3,0.971108,0,2
6,heart,270,13,2,0.8,0,1
7,ionosphere,351,34,2,0.56,0,1
8,australian,690,14,2,0.801567,0,1
9,magic04,19020,10,2,0.542329,0,1
