In [44]:
import pandas as pd
import json
import os
import ast
from pycaret.classification import *
from sklearn.preprocessing import MultiLabelBinarizer

In [63]:
# Load the problem table from CSV and show its first record as a quick
# look at the available columns.
RESULT_CSV = '../data/result.csv'
data = pd.read_csv(RESULT_CSV)
first_row = data.iloc[0]
print(first_row)

Unnamed: 0                                                       0
title                                               D. Ingenuity-2
TL                                                       2 seconds
ML                                                   256 megabytes
Input            The first line of input contains $$$t$$$ ($$$1...
Output           For each test case, if the required distributi...
Note             Let's consider the first example: the string $...
Statement        Let's imagine the surface of Mars as an infini...
contest                                                       1974
index                                                            D
tags             ['constructive algorithms', 'greedy', 'impleme...
rating                                                         NaN
sample-input     ['\n106NENSNE3WWW6NESSWS2SN2WE4SSNN4WESN2SS4EW...
sample-output    ['\nRRHRRH\nNO\nHRRHRH\nNO\nNO\nRHRH\nRRHH\nRH...
contest-name                         Codeforces Round 946 (Div

In [64]:
# Column groups: plain free-text columns vs. columns that hold a
# stringified Python list of text snippets.
text_columns = ['title', 'Input', 'Output', 'Note', 'Statement']
text_list_columns = ['sample-input', 'sample-output']

# Replace missing values with placeholders so later string handling works.
data[text_columns] = data[text_columns].fillna('missing')
data[text_list_columns] = data[text_list_columns].fillna('[]')

# Strip the leading problem index (e.g. "D. ") from the title.
# str.extract yields NaN for titles that do not match the pattern -- this
# includes the 'missing' placeholder filled in above -- so fall back to the
# unmodified title instead of re-introducing NaN.
title_without_index = data['title'].str.extract(
    r'^[A-Za-z0-9]+\.\s*(.+)', expand=False
)
data['title'] = title_without_index.fillna(data['title'])

# Keep only the first run of digits of the time / memory limit strings
# ("2 seconds" -> 2, "256 megabytes" -> 256).
# NOTE(review): a fractional limit such as "1.5 seconds" would be reduced
# to 1 by this pattern -- confirm whether such values occur in the data.
data['TL'] = data['TL'].str.extract(r'(\d+)', expand=False).astype(int)
data['ML'] = data['ML'].str.extract(r'(\d+)', expand=False).astype(int)

# 'tags' is stored as the repr of a list; parse it back into a real list.
data['tags'] = data['tags'].apply(ast.literal_eval)

# Sample inputs/outputs are stringified lists as well; parse them and join
# the pieces into one space-separated text per row.
for list_column in text_list_columns:
    data[list_column] = data[list_column].apply(
        lambda raw: " ".join(ast.literal_eval(raw))
    )

# Drop the saved CSV index column and the contest name column.
data = data.drop(columns=['Unnamed: 0', 'contest-name'])

In [65]:
# Sanity check: show the first record after the cleaning step.
first_row = data.iloc[0]
print(first_row)

title                                                  Ingenuity-2
TL                                                               2
ML                                                             256
Input            The first line of input contains $$$t$$$ ($$$1...
Output           For each test case, if the required distributi...
Note             Let's consider the first example: the string $...
Statement        Let's imagine the surface of Mars as an infini...
contest                                                       1974
index                                                            D
tags             [constructive algorithms, greedy, implementation]
rating                                                         NaN
sample-input     \n106NENSNE3WWW6NESSWS2SN2WE4SSNN4WESN2SS4EWNN...
sample-output    \nRRHRRH\nNO\nHRRHRH\nNO\nNO\nRHRH\nRRHH\nRH\n...
Name: 0, dtype: object


In [48]:
# One-hot encode the per-row tag lists: one 0/1 indicator column per tag.
mlb = MultiLabelBinarizer()
tags_binarized = mlb.fit_transform(data['tags'])

# Reuse data's index for the indicator frame. pd.concat(axis=1) aligns rows
# by index label, so a frame with a fresh 0..n-1 index would be misaligned
# (and padded with NaN rows) whenever data does not carry the default index.
tags_df = pd.DataFrame(tags_binarized, columns=mlb.classes_, index=data.index)

# Append the indicator columns and remove the original list-valued column.
data = pd.concat([data, tags_df], axis=1).drop(columns='tags')

In [49]:
# Sanity check: show the first record after the tag indicator columns
# have been added.
first_row = data.iloc[0]
print(first_row)

title                                                              Ingenuity-2
TL                                                                           2
ML                                                                         256
Input                        The first line of input contains $$$t$$$ ($$$1...
Output                       For each test case, if the required distributi...
Note                         Let's consider the first example: the string $...
Statement                    Let's imagine the surface of Mars as an infini...
contest                                                                   1974
index                                                                        D
rating                                                                     NaN
sample-input                 \n106NENSNE3WWW6NESSWS2SN2WE4SSNN4WESN2SS4EWNN...
sample-output                \nRRHRRH\nNO\nHRRHRH\nNO\nNO\nRHRH\nRRHH\nRH\n...
*special                                            

In [50]:
# Train and inspect one LightGBM classifier per selected tag, using only the
# first 1000 rows of the prepared table.
#
# Alternatives the author left disabled: looping over every column of
# tags_df instead of the four tags below, passing fix_imbalance=True to
# setup(), using compare_models() instead of a fixed model, and calling
# finalize_model(model) after training.
sub_data = data.iloc[:1000]
for tag in ['binary search', 'dp', 'math', 'shortest paths']:
    # All tag indicator columns except the current target are ignored.
    ignored_features = tags_df.columns.difference([tag]).tolist()
    setup_kwargs = dict(
        data=sub_data,
        target=tag,
        text_features=[*text_columns, *text_list_columns],
        ignore_features=ignored_features,
        session_id=123,
    )
    exp = setup(**setup_kwargs)
    model = create_model('lightgbm')
    evaluate_model(model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,binary search
2,Target type,Binary
3,Original data shape,"(1000, 49)"
4,Transformed data shape,"(1000, 17117)"
5,Transformed train set shape,"(700, 17117)"
6,Transformed test set shape,"(300, 17117)"
7,Ignore features,36
8,Numeric features,4
9,Text features,7


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9,0.7324,0.0,0.0,0.0,0.0,0.0
1,0.8857,0.6213,0.0,0.0,0.0,-0.0256,-0.0401
2,0.9143,0.6145,0.1429,1.0,0.25,0.2308,0.3612
3,0.9,0.6689,0.0,0.0,0.0,0.0,0.0
4,0.8714,0.6855,0.0,0.0,0.0,-0.0261,-0.0432
5,0.8857,0.5645,0.0,0.0,0.0,0.0,0.0
6,0.8857,0.6714,0.0,0.0,0.0,0.0,0.0
7,0.8857,0.6472,0.0,0.0,0.0,0.0,0.0
8,0.8857,0.6532,0.0,0.0,0.0,0.0,0.0
9,0.8857,0.7177,0.0,0.0,0.0,0.0,0.0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Description,Value
0,Session id,123
1,Target,dp
2,Target type,Binary
3,Original data shape,"(1000, 49)"
4,Transformed data shape,"(1000, 16771)"
5,Transformed train set shape,"(700, 16771)"
6,Transformed test set shape,"(300, 16771)"
7,Ignore features,36
8,Numeric features,4
9,Text features,7


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8286,0.7867,0.2667,0.8,0.4,0.328,0.3959
1,0.7857,0.8327,0.2667,0.5,0.3478,0.2336,0.2501
2,0.8286,0.743,0.3333,0.7143,0.4545,0.3684,0.4062
3,0.8143,0.6945,0.2,0.75,0.3158,0.2479,0.3214
4,0.7714,0.7503,0.1333,0.4,0.2,0.104,0.1255
5,0.7571,0.662,0.1875,0.4286,0.2609,0.1414,0.1588
6,0.7143,0.6134,0.25,0.3333,0.2857,0.1117,0.1135
7,0.7143,0.581,0.125,0.25,0.1667,0.0169,0.0183
8,0.7571,0.7211,0.0625,0.3333,0.1053,0.0357,0.0528
9,0.8286,0.6968,0.375,0.75,0.5,0.4101,0.4461


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Description,Value
0,Session id,123
1,Target,math
2,Target type,Binary
3,Original data shape,"(1000, 49)"
4,Transformed data shape,"(1000, 16602)"
5,Transformed train set shape,"(700, 16602)"
6,Transformed test set shape,"(300, 16602)"
7,Ignore features,36
8,Numeric features,4
9,Text features,7


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7857,0.798,0.45,0.6923,0.5455,0.4134,0.4298
1,0.7286,0.679,0.2,0.5714,0.2963,0.1739,0.2108
2,0.7429,0.789,0.35,0.5833,0.4375,0.2841,0.2997
3,0.6714,0.633,0.1,0.2857,0.1481,0.0,0.0
4,0.6857,0.659,0.15,0.375,0.2143,0.061,0.071
5,0.7286,0.712,0.3,0.5455,0.3871,0.2312,0.2483
6,0.7714,0.813,0.3,0.75,0.4286,0.3171,0.3692
7,0.7286,0.78,0.2,0.5714,0.2963,0.1739,0.2108
8,0.7429,0.7512,0.2857,0.6667,0.4,0.2683,0.3073
9,0.7286,0.6696,0.2857,0.6,0.3871,0.24,0.2673


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Description,Value
0,Session id,123
1,Target,shortest paths
2,Target type,Binary
3,Original data shape,"(1000, 49)"
4,Transformed data shape,"(1000, 16691)"
5,Transformed train set shape,"(700, 16691)"
6,Transformed test set shape,"(300, 16691)"
7,Ignore features,36
8,Numeric features,4
9,Text features,7


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9857,0.8986,0.0,0.0,0.0,0.0,0.0
1,0.9857,0.8841,0.0,0.0,0.0,0.0,0.0
2,0.9857,0.9565,0.0,0.0,0.0,0.0,0.0
3,0.9857,0.913,0.0,0.0,0.0,0.0,0.0
4,0.9857,0.5652,0.0,0.0,0.0,0.0,0.0
5,0.9857,1.0,0.0,0.0,0.0,0.0,0.0
6,0.9571,0.625,0.0,0.0,0.0,-0.0194,-0.0206
7,0.9714,0.9559,0.0,0.0,0.0,0.0,0.0
8,0.9714,0.7279,0.0,0.0,0.0,0.0,0.0
9,0.9714,0.5735,0.0,0.0,0.0,0.0,0.0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…