In [1]:
import pandas as pd
import json
import os
import ast
from pycaret.classification import *
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Smaller alternative input (file name suggests a 50% subsample); uncomment to use it.
#data = pd.read_csv('../data/50%_train_data.csv')
data = pd.read_csv('../data/train_data.csv')
# Print the first row as a quick look at the raw columns and their string formats.
print(data.iloc[0])

Unnamed: 0                                                    1287
title                                                H. Squid Game
TL                                                       2 seconds
ML                                                   256 megabytes
Input            The first line contains $$$2$$$ integer $$$n$$...
Output           Print the minimum number of operations Mashtal...
Note             Explanation for the first sample:  In the firs...
Statement        After watching the new over-rated series Squid...
contest                                                       1610
index                                                            H
tags             ['data structures', 'dfs and similar', 'greedy...
rating                                                      3500.0
sample-input     ['\n6 3\n1 1 1 4 4\n1 5\n3 4\n2 6\n', '\n5 3\n...
sample-output                                  ['\n2\n', '\n-1\n']
contest-name                            Codeforces Global Roun

In [3]:
def basic_preprocess(df):
    """Clean a raw problems frame and return the cleaned result as a new DataFrame.

    The input frame is left untouched, so this function can be called again
    on the same raw frame (e.g. when the notebook cell is re-run).

    Steps:
      * missing free-text columns become the string 'missing';
      * missing sample lists become the string '[]';
      * a leading problem index such as 'H. ' is stripped from 'title'
        (titles without such a prefix are kept as they are);
      * the first run of digits in 'TL' and 'ML' is parsed as an int;
      * 'tags' is parsed from its list literal into a Python list;
      * 'sample-input' / 'sample-output' are parsed from their list literals
        and their items joined with a single space;
      * the 'Unnamed: 0' and 'contest-name' columns are dropped if present.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``.

    Raises
    ------
    ValueError
        If a 'TL' or 'ML' value contains no digits (the int cast fails), or
        if a list-literal column holds text that ``ast.literal_eval`` rejects.
    """
    text_columns = ['title', 'Input', 'Output', 'Note', 'Statement']
    text_list_columns = ['sample-input', 'sample-output']

    # Work on a copy so the caller's frame is never modified.
    df = df.copy()
    df[text_columns] = df[text_columns].fillna('missing')
    df[text_list_columns] = df[text_list_columns].fillna('[]')

    # extract() yields NaN when the pattern does not match (for example the
    # 'missing' placeholder); fall back to the unmodified title in that case
    # so no NaN is reintroduced into a text column.
    stripped_title = df['title'].str.extract(r'^[A-Za-z0-9]+\.\s*(.+)', expand=False)
    df['title'] = stripped_title.fillna(df['title'])

    df['TL'] = df['TL'].str.extract(r'(\d+)', expand=False).astype(int)
    df['ML'] = df['ML'].str.extract(r'(\d+)', expand=False).astype(int)
    df['tags'] = df['tags'].apply(ast.literal_eval)
    for column in text_list_columns:
        df[column] = df[column].apply(ast.literal_eval).apply(lambda x: " ".join(x))

    # errors='ignore' tolerates inputs that were saved without these columns.
    return df.drop(columns=['Unnamed: 0', 'contest-name'], errors='ignore')

# Replace the raw training frame with its cleaned version.
data = basic_preprocess(data)

In [4]:
# Print the first row again to check the cleaned values (title, TL, ML, tags, samples).
print(data.iloc[0])

title                                                   Squid Game
TL                                                               2
ML                                                             256
Input            The first line contains $$$2$$$ integer $$$n$$...
Output           Print the minimum number of operations Mashtal...
Note             Explanation for the first sample:  In the firs...
Statement        After watching the new over-rated series Squid...
contest                                                       1610
index                                                            H
tags             [data structures, dfs and similar, greedy, trees]
rating                                                      3500.0
sample-input     \n6 3\n1 1 1 4 4\n1 5\n3 4\n2 6\n \n5 3\n1 1 3...
sample-output                                         \n2\n \n-1\n
Name: 0, dtype: object


In [5]:
def tag_labeling(df, classes=None):
    """Expand the list-valued 'tags' column into one 0/1 column per tag.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'tags' column whose cells are iterables of tag names.
        The frame itself is not modified.
    classes : iterable of str, optional
        Fixed tag vocabulary (and column order) to encode against. When
        omitted, the vocabulary is learned from ``df['tags']``. Pass the
        columns obtained from another frame to give both frames the same
        indicator columns.

    Returns
    -------
    (tags_df, df) : tuple of pandas.DataFrame
        ``tags_df`` holds only the indicator columns; ``df`` is the input
        without 'tags' and with the indicator columns appended. Both share
        the input frame's index.
    """
    mlb = MultiLabelBinarizer(classes=None if classes is None else list(classes))
    tags_binarized = mlb.fit_transform(df['tags'])
    # Reuse the input index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would pair rows wrongly (and add NaN rows) for any frame
    # whose index is not 0..n-1.
    tags_df = pd.DataFrame(tags_binarized, columns=mlb.classes_, index=df.index)
    df = pd.concat([df.drop(columns='tags'), tags_df], axis=1)
    return tags_df, df

# tags_df: only the per-tag indicator columns; data: the frame with 'tags' replaced by them.
tags_df, data = tag_labeling(data)

In [6]:
# Print the first row to confirm the per-tag indicator columns were appended.
print(data.iloc[0])

title                                                               Squid Game
TL                                                                           2
ML                                                                         256
Input                        The first line contains $$$2$$$ integer $$$n$$...
Output                       Print the minimum number of operations Mashtal...
Note                         Explanation for the first sample:  In the firs...
Statement                    After watching the new over-rated series Squid...
contest                                                                   1610
index                                                                        H
rating                                                                  3500.0
sample-input                 \n6 3\n1 1 1 4 4\n1 5\n3 4\n2 6\n \n5 3\n1 1 3...
sample-output                                                     \n2\n \n-1\n
*special                                            

In [7]:
# Columns handed to PyCaret as free-text features.
text_feature_columns = [
    'title',
    'Input',
    'Output',
    'Note',
    'Statement',
    'sample-input',
    'sample-output',
]

# Only the first 1000 rows are used here; assign `data` itself for a full run.
sub_data = data.iloc[:1000]

# One binary classifier per target tag. Only 'dp' is active; other tag sets
# tried were every tag in tags_df.columns and
# ['binary search', 'dp', 'math', 'shortest paths'].
for tag in ['dp']:
    # Every other tag column is a label, not a feature, so hide it from setup().
    ignored_features = tags_df.columns.difference([tag]).tolist()

    # Optional setup flags currently left off: fix_imbalance=True, use_gpu=True.
    exp = setup(
        data=sub_data,
        target=tag,
        text_features=text_feature_columns,
        ignore_features=ignored_features,
        session_id=123,
    )

    # Alternatives not in use: compare_models() to pick a model, and
    # finalize_model(model) after training.
    model = create_model('lightgbm')
    evaluate_model(model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,dp
2,Target type,Binary
3,Original data shape,"(1000, 49)"
4,Transformed data shape,"(1000, 16096)"
5,Transformed train set shape,"(700, 16096)"
6,Transformed test set shape,"(300, 16096)"
7,Ignore features,36
8,Numeric features,4
9,Text features,7


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7714,0.7997,0.0714,0.25,0.1111,0.0244,0.0308
1,0.7714,0.6122,0.0714,0.25,0.1111,0.0244,0.0308
2,0.7857,0.5574,0.2143,0.4286,0.2857,0.1758,0.1905
3,0.8,0.7615,0.1429,0.5,0.2222,0.1463,0.1846
4,0.8143,0.6594,0.2143,0.6,0.3158,0.2353,0.2774
5,0.7571,0.7927,0.0667,0.25,0.1053,0.0165,0.0214
6,0.8,0.7103,0.2667,0.5714,0.3636,0.2632,0.2901
7,0.8286,0.7952,0.2,1.0,0.3333,0.2821,0.4052
8,0.8143,0.7964,0.1333,1.0,0.2353,0.1947,0.3284
9,0.7857,0.6715,0.2,0.5,0.2857,0.186,0.2132


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [8]:
# Smaller alternative input (file name suggests a 50% subsample); uncomment to use it.
#test = pd.read_csv('../data/50%_test_data.csv')
test = pd.read_csv('../data/test_data.csv')
test = basic_preprocess(test)

# Encode the test tags under a separate name so the training `tags_df`
# (used to build ignore_features for the model) is not overwritten.
test_tags_df, test = tag_labeling(test)

# The binarizer above was fitted on the test set, so its columns can differ
# from the training vocabulary. Align the test frame to the training tag
# columns: tags unseen in training are dropped, tags absent from the test set
# are added as all-zero columns.
train_tag_columns = list(tags_df.columns)
test_tag_names = set(test_tags_df.columns)
non_tag_columns = [col for col in test.columns if col not in test_tag_names]
test = test.reindex(columns=non_tag_columns + train_tag_columns, fill_value=0)
test_tags_df = test[train_tag_columns]

In [9]:
# Score the test frame with the trained model; the returned frame's
# 'prediction_label' column is read in the next cell.
predictions = predict_model(model, data=test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.7815,0.7117,0.1718,0.5652,0.2635,0.1761,0.2195


In [10]:
# Attach the predicted labels to the test frame (assignment aligns on index).
y_pred = predictions['prediction_label']
test['Predicted'] = y_pred

# Compare against the tag the model was trained on (`tag`, left over from the
# training loop) instead of a hard-coded column name, so this cell stays
# correct when the training target changes.
wrong_predictions = test[test[tag] != test['Predicted']]
wrong_predictions.head()

Unnamed: 0,title,TL,ML,Input,Output,Note,Statement,contest,index,rating,...,probabilities,schedules,shortest paths,sortings,string suffix structures,strings,ternary search,trees,two pointers,Predicted
3,Last minute enhancements,1,256,The input consists of multiple test cases. The...,"For each test case, you should output a single...","In the first test case, Euterpe can increase t...",Athenaeus has just finished creating his lates...,1466,B,750.0,...,0,0,0,0,0,0,0,0,0,0
4,The Minimum Number of Variables,1,256,The first line contains integer n (1 ≤ n ≤ 23)...,In a single line print a single number — the m...,"In the first sample, you can use two variables...","You've got a positive integer sequence a1, a2,...",279,D,2000.0,...,0,0,0,0,0,0,0,0,0,0
9,Staircases,2,256,The first line contains three integers $$$n$$$...,Print $$$q$$$ integers — the $$$i$$$-th value ...,missing,"You are given a matrix, consisting of $$$n$$$ ...",1598,E,,...,0,0,0,0,0,0,0,0,0,0
10,Tree Painting,2,256,The first line contains an integer $$$n$$$ — t...,Print one integer — the maximum number of poin...,The first example tree is shown in the problem...,You are given a tree (an undirected connected ...,1187,E,,...,0,0,0,0,0,0,0,1,0,0
16,Isolation,3,256,The first line contains two space-separated in...,The first and only line contains the number of...,"In the first sample, the three possible divisi...",Find the number of ways to divide an array $$$...,1129,D,2250.0,...,0,0,0,0,0,0,0,0,0,0
