In [15]:
import pandas as pd
import nltk
import ast
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sympy import sympify
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from pycaret.classification import *
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import pickle

In [16]:
# Load the raw training CSV and print its column names to see which fields are available.
# NOTE: the same file is read again by the later data-loading cell, which also subsets it.
data = pd.read_csv('../data/train_data.csv')
print(data.columns)

Index(['Unnamed: 0', 'title', 'TL', 'ML', 'Input', 'Output', 'Note',
       'Statement', 'contest', 'index', 'tags', 'rating', 'sample-input',
       'sample-output', 'contest-name'],
      dtype='object')


In [17]:
# Number of leading rows of the training CSV to keep (applied as data[:sub_cnt]).
sub_cnt = 5000
# Change sub_cnt to 10000 or another large value to use all of the training data.
# Text columns that text_embedding() replaces with numeric embedding columns.
embedding_columns = ['Statement']
# Extra columns passed to setup(ignore_features=...) on top of the non-target tag columns.
ignore_features = ['title']

In [18]:
# Reload the training data and keep only the first `sub_cnt` rows.
data = pd.read_csv('../data/train_data.csv')
# Alternative input file, kept for reference:
#data = pd.read_csv('../data/50%_train_data.csv')
# .iloc makes the positional intent explicit; .copy() detaches the subset from the full
# frame so the column assignments done during preprocessing do not write into a slice
# (which triggers SettingWithCopyWarning and ambiguous view/copy semantics).
data = data.iloc[:sub_cnt].copy()

# Columns holding one free-text string per problem.
text_columns = ['title', 'Input', 'Output', 'Note', 'Statement']
# Columns holding the string form of a Python list of strings.
text_list_columns = ['sample-input', 'sample-output']
# Every column that is run through text_preprocess().
text_features = text_columns + text_list_columns
print(data.iloc[0])

Unnamed: 0                                                    1287
title                                                H. Squid Game
TL                                                       2 seconds
ML                                                   256 megabytes
Input            The first line contains $$$2$$$ integer $$$n$$...
Output           Print the minimum number of operations Mashtal...
Note             Explanation for the first sample:  In the firs...
Statement        After watching the new over-rated series Squid...
contest                                                       1610
index                                                            H
tags             ['data structures', 'dfs and similar', 'greedy...
rating                                                      3500.0
sample-input     ['\n6 3\n1 1 1 4 4\n1 5\n3 4\n2 6\n', '\n5 3\n...
sample-output                                  ['\n2\n', '\n-1\n']
contest-name                            Codeforces Global Roun

In [19]:
def basic_preprocess(df, text_cols=None, list_cols=None):
    """Turn the raw problem table into cleaned, typed columns.

    Steps, in order:
      * missing values in ``text_cols`` become ``'missing'`` and in ``list_cols`` ``'[]'``;
      * a leading ``"<index>. "`` prefix (e.g. ``"H. "``) is stripped from ``title``;
        titles that do not carry such a prefix are kept unchanged;
      * ``TL`` and ``ML`` are reduced to the first run of digits and cast to ``int``
        (so ``"2 seconds"`` -> ``2``; a value like ``"1.5 seconds"`` yields ``1``);
      * ``tags`` is parsed from its string form into a Python list;
      * ``sample-input`` / ``sample-output`` are parsed into lists and joined with a space;
      * the ``Unnamed: 0`` and ``contest-name`` columns are dropped when present.

    Args:
        df: Raw DataFrame as read from the CSV. It is not modified.
        text_cols: Free-text column names; defaults to the notebook-level ``text_columns``.
        list_cols: Stringified-list column names; defaults to the notebook-level
            ``text_list_columns``.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        ValueError / SyntaxError: from ``ast.literal_eval`` when a list column does not
            contain a valid Python literal.
        ValueError: when a ``TL`` / ``ML`` value contains no digits (NaN cannot be cast
            to ``int``).
    """
    # Fall back to the notebook-level column lists when none are given explicitly.
    if text_cols is None:
        text_cols = text_columns
    if list_cols is None:
        list_cols = text_list_columns
    text_cols = list(text_cols)
    list_cols = list(list_cols)

    # Work on a copy so the caller's frame (possibly a slice of a larger one) is untouched.
    df = df.copy()
    df[text_cols] = df[text_cols].fillna('missing')
    df[list_cols] = df[list_cols].fillna('[]')

    # str.extract yields NaN when the pattern does not match (e.g. the 'missing'
    # placeholder), so fall back to the unmodified title instead of re-introducing NaN.
    stripped_title = df['title'].str.extract(r'^[A-Za-z0-9]+\.\s*(.+)', expand=False)
    df['title'] = stripped_title.fillna(df['title'])

    for limit_col in ('TL', 'ML'):
        df[limit_col] = df[limit_col].str.extract(r'(\d+)', expand=False).astype(int)

    df['tags'] = df['tags'].apply(ast.literal_eval)
    for list_col in ('sample-input', 'sample-output'):
        df[list_col] = df[list_col].apply(ast.literal_eval).apply(" ".join)

    # errors='ignore' keeps the function usable on frames saved without these columns.
    return df.drop(columns=['Unnamed: 0', 'contest-name'], errors='ignore')

# Clean the training frame and print its first row as a sanity check.
data = basic_preprocess(data)
print(data.iloc[0])

title                                                   Squid Game
TL                                                               2
ML                                                             256
Input            The first line contains $$$2$$$ integer $$$n$$...
Output           Print the minimum number of operations Mashtal...
Note             Explanation for the first sample:  In the firs...
Statement        After watching the new over-rated series Squid...
contest                                                       1610
index                                                            H
tags             [data structures, dfs and similar, greedy, trees]
rating                                                      3500.0
sample-input     \n6 3\n1 1 1 4 4\n1 5\n3 4\n2 6\n \n5 3\n1 1 3...
sample-output                                         \n2\n \n-1\n
Name: 0, dtype: object


In [20]:
def text_preprocess(df, features=None):
    """Normalize the free-text columns into space-separated token strings.

    Each value is lower-cased and tokenized with ``word_tokenize``; only alphanumeric
    tokens that are not English stop words are kept; every kept token is Porter-stemmed
    and then passed through the WordNet lemmatizer; the results are joined with spaces.
    Non-string cells (e.g. NaN) become the empty string instead of raising.

    NOTE(review): this relies on the NLTK stop-word, tokenizer and WordNet resources
    being installed; nothing in the visible notebook downloads them.

    Args:
        df: DataFrame containing the text columns. It is not modified.
        features: Column names to normalize; defaults to the notebook-level
            ``text_features``.

    Returns:
        A new DataFrame with the selected columns replaced by their normalized text.
    """
    if features is None:
        features = text_features

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    def text_transform(text):
        # A missing / non-text cell has nothing to tokenize; `.lower()` would raise on it.
        if not isinstance(text, str):
            return ''
        tokens = word_tokenize(text.lower())
        kept = [word for word in tokens if word.isalnum() and word not in stop_words]
        # Stem first, then lemmatize the stem (same order as before).
        return ' '.join(lemmatizer.lemmatize(stemmer.stem(word)) for word in kept)

    # Copy so the caller's frame is left untouched.
    df = df.copy()
    for feature in features:
        df[feature] = df[feature].apply(text_transform)
    return df

# Normalize the text columns of the training frame and print the first row to inspect the result.
data = text_preprocess(data)
print(data.iloc[0])

title                                                   squid game
TL                                                               2
ML                                                             256
Input            first line contain 2 integ n 1 n 3 number vert...
Output           print minimum number oper mashtali way mashtal...
Note             explan first sampl first oper mashtali choos v...
Statement        watch new seri squid game mashtali soroush dec...
contest                                                       1610
index                                                            H
tags             [data structures, dfs and similar, greedy, trees]
rating                                                      3500.0
sample-input     6 3 1 1 1 4 4 1 5 3 4 2 6 5 3 1 1 3 3 1 2 1 4 1 5
sample-output                                                    2
Name: 0, dtype: object


In [21]:
def text_embedding(df, columns=None):
    """Replace text columns with numeric sentence-embedding columns.

    Every value of each selected column is tokenized (truncated to at most 512 tokens)
    and run through ``sentence-transformers/all-MiniLM-L6-v2``; the embedding is the
    mean of the last hidden state over the token dimension. For a column ``c`` the
    output contains ``c_Emb_0 .. c_Emb_{d-1}`` and no longer contains ``c``.

    Args:
        df: DataFrame containing the text columns. It is not modified.
        columns: Column names to embed; defaults to the notebook-level
            ``embedding_columns``.

    Returns:
        A new DataFrame: the remaining original columns followed by the embedding
        columns, row-aligned with ``df`` regardless of its index.
    """
    if columns is None:
        columns = embedding_columns
    columns = list(columns)

    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    def generate_embeddings(text):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        # Inference only: skip building the autograd graph for every row.
        with torch.no_grad():
            outputs = model(**inputs)
        # One text per call, so no padding tokens take part in the mean.
        return torch.mean(outputs.last_hidden_state, dim=1).numpy().flatten()

    embedding_frames = []
    for feature in columns:
        vectors = df[feature].apply(generate_embeddings)
        # Reuse df's index so the column-wise concat below pairs rows correctly even
        # when the index is not the default 0..n-1 range.
        embedding_df = pd.DataFrame(vectors.to_list(), index=df.index)
        embedding_df.columns = [f'{feature}_Emb_{i}' for i in range(embedding_df.shape[1])]
        embedding_frames.append(embedding_df)

    return pd.concat([df.drop(columns=columns)] + embedding_frames, axis=1)
    
# Replace the embedding columns of the training frame with embedding features and print the first row.
data = text_embedding(data)
print(data.iloc[0])

title                                                       squid game
TL                                                                   2
ML                                                                 256
Input                first line contain 2 integ n 1 n 3 number vert...
Output               print minimum number oper mashtali way mashtal...
                                           ...                        
Statement_Emb_379                                             0.021034
Statement_Emb_380                                             0.157798
Statement_Emb_381                                            -0.061132
Statement_Emb_382                                             -0.09207
Statement_Emb_383                                             0.066913
Name: 0, Length: 396, dtype: object


In [22]:
def tag_labeling(df, classes=None):
    """One-hot encode the ``tags`` list column into one 0/1 column per tag.

    Args:
        df: DataFrame with a ``tags`` column holding a list of tag names per row.
            It is not modified.
        classes: Optional ordered collection of tag names that fixes the output
            columns (e.g. the tag columns obtained from the training data). When
            omitted, the columns are the tags found in ``df``. Tags in ``df`` that are
            not in ``classes`` are left out by the binarizer.

    Returns:
        ``(tags_df, df)``: the indicator columns alone, and a new frame consisting of
        ``df`` without ``tags`` followed by the indicator columns.
    """
    mlb = MultiLabelBinarizer(classes=None if classes is None else list(classes))
    tags_binarized = mlb.fit_transform(df['tags'])
    # Carry df's index over so the column-wise concat pairs rows correctly for any index.
    tags_df = pd.DataFrame(tags_binarized, columns=mlb.classes_, index=df.index)
    df = pd.concat([df.drop(columns='tags'), tags_df], axis=1)
    return tags_df, df

# Expand `tags` into per-tag 0/1 columns; tags_df keeps the indicator columns on their own
# (its column names are reused later to build the ignore list for training).
tags_df, data = tag_labeling(data)
print(data.iloc[0])

title                                                              squid game
TL                                                                          2
ML                                                                        256
Input                       first line contain 2 integ n 1 n 3 number vert...
Output                      print minimum number oper mashtali way mashtal...
                                                  ...                        
string suffix structures                                                    0
strings                                                                     0
ternary search                                                              0
trees                                                                       1
two pointers                                                                0
Name: 0, Length: 432, dtype: object


In [23]:
#preparing test data
test = pd.read_csv('../data/test_data.csv')
test = basic_preprocess(test)
test = text_preprocess(test)
test = text_embedding(test)
# Named test_tags_df rather than `tag`, which the training loop uses as its loop variable.
test_tags_df, test = tag_labeling(test)

# The test tags are binarized independently of the training tags, so the two frames can
# end up with different tag columns (a rare tag may be absent from either side).
# Give the test frame exactly the training tag columns: missing ones are filled with 0,
# tags never seen in training are dropped.
test_without_tags = test.drop(columns=list(test_tags_df.columns))
aligned_test_tags = test_tags_df.reindex(columns=tags_df.columns, fill_value=0)
test = pd.concat([test_without_tags, aligned_test_tags], axis=1)

In [24]:
# Write the preprocessed train and test frames to disk
# (presumably so the modelling cells can reload them without recomputing the embeddings).
data.to_pickle("../data/data.pkl")
test.to_pickle("../data/test.pkl")

In [25]:
# Reload the preprocessed frames from the pickle files.
# NOTE(review): tags_df is not saved alongside them, yet the training cell reads
# tags_df.columns, so the tag_labeling cell must have been run in this kernel as well.
data = pd.read_pickle("../data/data.pkl")
test = pd.read_pickle("../data/test.pkl")

In [29]:
#data = data.iloc[:sub_cnt]
# Train and evaluate one binary LightGBM classifier per tag (only 'dp' at the moment).
for tag in ['dp']:
    print(f'computing {tag}...')
    # All tag columns except the current target, so no other label is used as a feature.
    ignored_features = list(tags_df.columns.difference([tag]))
    exp = setup(data=data, 
                target=tag, 
                # Columns handed to PyCaret as raw text ('Statement' was already replaced by embeddings).
                text_features=['sample-input', 'sample-output', 'Input', 'Output', 'Note'],
                # Non-target tags plus the notebook-level ignore list (['title']).
                ignore_features=ignored_features + ignore_features,
                fix_imbalance=True,
                # Fixed session id — presumably PyCaret's random seed; confirm against its docs.
                session_id=123,
                )
    model = create_model('lightgbm')
    evaluate_model(model)
    # `model` and `predictions` are rebound on every iteration, so after the loop they
    # refer to the last tag only.
    predictions = predict_model(model, data=test)
    print("\n\n")

computing dp...


Unnamed: 0,Description,Value
0,Session id,123
1,Target,dp
2,Target type,Binary
3,Original data shape,"(5000, 432)"
4,Transformed data shape,"(7038, 18043)"
5,Transformed train set shape,"(5538, 18043)"
6,Transformed test set shape,"(1500, 18043)"
7,Ignore features,37
8,Numeric features,388
9,Text features,5


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7886,0.7332,0.274,0.4878,0.3509,0.2363,0.2503
1,0.7857,0.7533,0.2192,0.4706,0.2991,0.192,0.2115
2,0.8114,0.793,0.4247,0.5636,0.4844,0.3718,0.3774
3,0.7914,0.7644,0.3425,0.5,0.4065,0.2853,0.2928
4,0.7943,0.7295,0.3425,0.5102,0.4098,0.2911,0.2995
5,0.8029,0.7452,0.274,0.5556,0.367,0.2658,0.2892
6,0.7857,0.7661,0.3699,0.4821,0.4186,0.29,0.2939
7,0.7971,0.7626,0.3562,0.52,0.4228,0.3049,0.3129
8,0.7914,0.7265,0.3425,0.5,0.4065,0.2853,0.2928
9,0.8343,0.8063,0.4054,0.6818,0.5085,0.4165,0.4368


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.796,0.7769,0.3392,0.59,0.4308,0.3174,0.3354







In [27]:
# Reload the pickled test frame (dropping any in-memory changes made to `test`) and score
# it with `model`, the classifier left bound by the training loop.
test = pd.read_pickle("../data/test.pkl")
predictions = predict_model(model, data=test)

In [28]:
# Store the predicted label next to the true 'dp' label and keep the rows where they disagree.
y_pred = predictions['prediction_label']
test['Predicted'] = y_pred
wrong_predictions = test.loc[test['dp'].ne(test['Predicted'])]
wrong_predictions.head()

Unnamed: 0,title,TL,ML,Input,Output,Note,contest,index,rating,sample-input,...,probabilities,schedules,shortest paths,sortings,string suffix structures,strings,ternary search,trees,two pointers,Predicted
3,last minut enhanc,1,256,input consist multipl test case first line con...,test case output singl line contain precis one...,first test case euterp increas second fifth si...,1466,B,750.0,5 6 1 2 2 2 5 6 2 4 4 6 1 1 3 4 4 5 1 1 6 1 1 ...,...,0,0,0,0,0,0,0,0,0,0
4,minimum number variabl,1,256,first line contain integ n 1 n 23 second line ...,singl line print singl number minimum number v...,first sampl use two variabl b1 b2 perform foll...,279,D,2000.0,51 2 3 6 8 33 6 5 62 4 8 6 10 18,...,0,0,0,0,0,0,0,0,0,0
9,staircas,2,256,first line contain three integ n q 1 n 1000 1 ...,print q integ valu equal number differ stairca...,miss,1598,E,,2 2 8 1 1 1 1 1 1 2 2 1 1 1 2 2 1 1 1 3 4 10 1...,...,0,0,0,0,0,0,0,0,0,0
16,isol,3,256,first line contain two integ n k 1 k n number ...,first line contain number way divid array modulo,first sampl three possibl divis follow 1 1 2 1...,1129,D,2250.0,3 1 1 1 2 5 2 1 1 2 1 3 5 5 1 2 3 4 5,...,0,0,0,0,0,0,0,0,0,0
17,rock push,2,512,first line contain two integ n dimens labyrint...,print singl integ number differ legal path 1 1...,first sampl case ca move henc path consist sin...,1225,E,2750.0,1 1 2 3 r 4 4 r r,...,0,0,0,0,0,0,0,0,0,0
