In [29]:
import pandas as pd
import nltk
import ast
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sympy import sympify
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from pycaret.classification import *

In [21]:
# Load the training set and keep the first 1000 rows. The explicit .copy()
# makes `data` an independent frame, so the column assignments below do not
# write into a slice of the full frame (avoids SettingWithCopyWarning).
data = pd.read_csv('../data/train_data.csv').iloc[:1000].copy()

text_columns = ['title', 'Input', 'Output', 'Note', 'Statement']
text_list_columns = ['sample-input', 'sample-output']
# Missing free text becomes the placeholder 'missing'; missing sample lists
# become the string '[]' so that ast.literal_eval below yields an empty list.
data[text_columns] = data[text_columns].fillna('missing')
data[text_list_columns] = data[text_list_columns].fillna('[]')

# Strip a leading "<alphanumerics>." prefix from the title. Titles that do not
# match the pattern (including the 'missing' placeholder) keep their original
# text instead of being turned into NaN by str.extract.
stripped_title = data['title'].str.extract(r'^[A-Za-z0-9]+\.\s*(.+)', expand=False)
data['title'] = stripped_title.fillna(data['title'])

# Keep only the first run of digits from the TL / ML strings, as integers.
data['TL'] = data['TL'].str.extract(r'(\d+)').astype(int)
data['ML'] = data['ML'].str.extract(r'(\d+)').astype(int)

# 'tags' and the sample columns are stored as Python-literal strings; parse
# them, then flatten each sample list into one space-joined string.
data['tags'] = data['tags'].apply(ast.literal_eval)
for column in text_list_columns:
    data[column] = data[column].apply(ast.literal_eval).apply(" ".join)

data = data.drop(columns=['Unnamed: 0', 'contest-name'])

print(data.iloc[0])

title                                                   Squid Game
TL                                                               2
ML                                                             256
Input            The first line contains $$$2$$$ integer $$$n$$...
Output           Print the minimum number of operations Mashtal...
Note             Explanation for the first sample:  In the firs...
Statement        After watching the new over-rated series Squid...
contest                                                       1610
index                                                            H
tags             [data structures, dfs and similar, greedy, trees]
rating                                                      3500.0
sample-input     \n6 3\n1 1 1 4 4\n1 5\n3 4\n2 6\n \n5 3\n1 1 3...
sample-output                                         \n2\n \n-1\n
Name: 0, dtype: object


In [22]:
# Shared NLTK resources used by preprocess_text.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Return a normalised, space-joined token string for ``text``.

    The text is lower-cased and tokenised with ``word_tokenize``. Tokens that
    are not purely alphanumeric, or that appear in ``stop_words``, are
    discarded. Each remaining token is stemmed with ``stemmer`` and the stem
    is then passed through ``lemmatizer``.
    """
    normalised = (
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    )
    return ' '.join(normalised)

# Normalise each free-text column exactly once. The previous version ran
# preprocess_text over 'Input' twice, so that column was stemmed and
# lemmatised a second time, unlike every other column.
# NOTE(review): 'title' is still left unprocessed here; the duplicated 'Input'
# line may have been intended for 'title' - confirm before adding it.
columns_to_preprocess = ['Input', 'Output', 'Note', 'Statement', 'sample-input', 'sample-output']
for column in columns_to_preprocess:
    data[column] = data[column].apply(preprocess_text)

In [23]:
# Show the first row of `data` as a quick visual check of the current column values.
print(data.iloc[0])

title                                                   Squid Game
TL                                                               2
ML                                                             256
Input            first line contain 2 integ n 1 n 3 number vert...
Output           print minimum number oper mashtali way mashtal...
Note             explan first sampl first oper mashtali choos v...
Statement        watch new seri squid game mashtali soroush dec...
contest                                                       1610
index                                                            H
tags             [data structures, dfs and similar, greedy, trees]
rating                                                      3500.0
sample-input     6 3 1 1 1 4 4 1 5 3 4 2 6 5 3 1 1 3 3 1 2 1 4 1 5
sample-output                                                    2
Name: 0, dtype: object


In [26]:
# Turn the per-row tag lists into one 0/1 indicator column per distinct tag.
mlb = MultiLabelBinarizer()
tags_binarized = mlb.fit_transform(data['tags'])
# Build the indicator frame on data's own index: pd.concat(axis=1) aligns on
# index labels, so a default RangeIndex would misalign rows (and introduce
# NaNs) whenever `data` has been filtered or re-indexed beforehand.
tags_df = pd.DataFrame(tags_binarized, columns=mlb.classes_, index=data.index)
# Append the indicator columns and drop the original list-valued 'tags' column.
data = pd.concat([data, tags_df], axis=1).drop(columns='tags')

In [30]:
# Fit and inspect one binary classifier per selected tag column.
text_feature_columns = [
    'title', 'Input', 'Output', 'Note', 'Statement', 'sample-input', 'sample-output',
]
# To cover every tag, iterate over tags_df.columns instead of this short list.
target_tags = ['binary search', 'dp', 'math', 'shortest paths']

sub_data = data.iloc[:1000]
for tag in target_tags:
    # Every other tag indicator column is excluded from the features, so the
    # current target is the only tag column the experiment sees.
    ignored_features = tags_df.columns.difference([tag]).tolist()
    exp = setup(
        data=sub_data,
        target=tag,
        text_features=list(text_feature_columns),
        ignore_features=ignored_features,
        # fix_imbalance=True,  (left disabled, as before)
        session_id=123,
    )
    # compare_models() and finalize_model(model) remain disabled, as before;
    # only a LightGBM model is created and evaluated for each tag.
    model = create_model('lightgbm')
    evaluate_model(model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,binary search
2,Target type,Binary
3,Original data shape,"(1000, 49)"
4,Transformed data shape,"(1000, 10978)"
5,Transformed train set shape,"(700, 10978)"
6,Transformed test set shape,"(300, 10978)"
7,Ignore features,36
8,Numeric features,4
9,Text features,7


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8857,0.7379,0.0,0.0,0.0,0.0,0.0
1,0.9143,0.7903,0.25,1.0,0.4,0.3713,0.4774
2,0.8714,0.5665,0.0,0.0,0.0,-0.0261,-0.0432
3,0.8714,0.4355,0.0,0.0,0.0,-0.0261,-0.0432
4,0.9,0.6149,0.125,1.0,0.2222,0.202,0.3351
5,0.8857,0.7944,0.125,0.5,0.2,0.1617,0.2079
6,0.8714,0.5383,0.0,0.0,0.0,-0.0261,-0.0432
7,0.8857,0.756,0.0,0.0,0.0,0.0,0.0
8,0.9,0.75,0.125,1.0,0.2222,0.202,0.3351
9,0.8714,0.6653,0.0,0.0,0.0,-0.0261,-0.0432


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Description,Value
0,Session id,123
1,Target,dp
2,Target type,Binary
3,Original data shape,"(1000, 49)"
4,Transformed data shape,"(1000, 11199)"
5,Transformed train set shape,"(700, 11199)"
6,Transformed test set shape,"(300, 11199)"
7,Ignore features,36
8,Numeric features,4
9,Text features,7


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8,0.5676,0.0714,0.5,0.125,0.0789,0.1286
1,0.7857,0.5077,0.2143,0.4286,0.2857,0.1758,0.1905
2,0.7714,0.5153,0.0714,0.25,0.1111,0.0244,0.0308
3,0.7571,0.7105,0.0714,0.2,0.1053,0.0,0.0
4,0.7857,0.7258,0.1429,0.4,0.2105,0.1176,0.1387
5,0.7857,0.7758,0.2667,0.5,0.3478,0.2336,0.2501
6,0.8,0.7915,0.3333,0.5556,0.4167,0.305,0.3195
7,0.8,0.8109,0.2,0.6,0.3,0.216,0.2607
8,0.8,0.7915,0.1333,0.6667,0.2222,0.1624,0.2333
9,0.8,0.6521,0.2667,0.5714,0.3636,0.2632,0.2901


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Description,Value
0,Session id,123
1,Target,math
2,Target type,Binary
3,Original data shape,"(1000, 49)"
4,Transformed data shape,"(1000, 11235)"
5,Transformed train set shape,"(700, 11235)"
6,Transformed test set shape,"(300, 11235)"
7,Ignore features,36
8,Numeric features,4
9,Text features,7


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7143,0.666,0.3,0.5,0.375,0.2045,0.2158
1,0.7429,0.684,0.45,0.5625,0.5,0.3298,0.3335
2,0.7571,0.777,0.35,0.6364,0.4516,0.3121,0.3352
3,0.6571,0.655,0.4,0.4,0.4,0.16,0.16
4,0.7,0.631,0.25,0.4545,0.3226,0.1503,0.1614
5,0.7857,0.795,0.4,0.7273,0.5161,0.3931,0.422
6,0.7143,0.743,0.35,0.5,0.4118,0.2308,0.2372
7,0.8143,0.77,0.45,0.8182,0.5806,0.474,0.5089
8,0.7714,0.767,0.35,0.7,0.4667,0.3412,0.3744
9,0.7857,0.691,0.4762,0.7143,0.5714,0.4361,0.452


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Description,Value
0,Session id,123
1,Target,shortest paths
2,Target type,Binary
3,Original data shape,"(1000, 49)"
4,Transformed data shape,"(1000, 11139)"
5,Transformed train set shape,"(700, 11139)"
6,Transformed test set shape,"(300, 11139)"
7,Ignore features,36
8,Numeric features,4
9,Text features,7


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9857,0.058,0.0,0.0,0.0,0.0,0.0
1,0.9857,0.8116,0.0,0.0,0.0,0.0,0.0
2,0.9857,0.2174,0.0,0.0,0.0,0.0,0.0
3,0.9857,0.7971,0.0,0.0,0.0,0.0,0.0
4,0.9857,0.9855,0.0,0.0,0.0,0.0,0.0
5,0.9857,0.8116,0.0,0.0,0.0,0.0,0.0
6,0.9857,0.3623,0.0,0.0,0.0,0.0,0.0
7,0.9857,0.4203,0.0,0.0,0.0,0.0,0.0
8,0.9857,0.4058,0.0,0.0,0.0,0.0,0.0
9,0.9857,0.913,0.0,0.0,0.0,0.0,0.0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…