In [36]:
import pandas as pd
import nltk
import ast
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sympy import sympify
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from pycaret.classification import *
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

In [37]:
# Number of leading rows of the training CSV used by this notebook.
N_TRAIN_ROWS = 300

# `nrows` stops parsing after N_TRAIN_ROWS rows and yields a standalone frame.
# Reading the whole file and slicing it afterwards hands a slice to
# basic_preprocess, whose in-place column assignments pandas may flag with
# SettingWithCopyWarning.
data = pd.read_csv('../data/train_data.csv', nrows=N_TRAIN_ROWS)

# Free-text columns; basic_preprocess fills their missing values with 'missing'.
text_columns = ['title', 'Input', 'Output', 'Note', 'Statement']
# Columns holding Python list literals stored as strings (parsed with
# ast.literal_eval in basic_preprocess); missing values become '[]'.
text_list_columns = ['sample-input', 'sample-output']
# Every column that text_preprocess normalises.
text_features = text_columns + text_list_columns

def basic_preprocess(df, text_cols=None, list_cols=None):
    """Return a structurally cleaned copy of the raw problem table.

    Steps, in order:
      * fill missing values in the free-text columns with ``'missing'`` and in
        the list-literal columns with ``'[]'``;
      * strip the leading ``"<index>. "`` prefix from ``title``;
      * reduce ``TL`` and ``ML`` to the first run of digits they contain, as int;
      * parse ``tags`` and the list-literal columns with ``ast.literal_eval``,
        then join each parsed list-literal column into one space-separated
        string;
      * drop the ``'Unnamed: 0'`` and ``'contest-name'`` columns if present.

    Args:
        df: Raw frame. It is not modified; a copy is processed and returned.
        text_cols: Free-text column names. Defaults to the module-level
            ``text_columns``.
        list_cols: Names of columns holding list literals as strings. Defaults
            to the module-level ``text_list_columns``.

    Returns:
        The cleaned DataFrame.

    Raises:
        ValueError: If a ``TL``/``ML`` value contains no digits (the resulting
            NaN cannot be cast to int).
    """
    text_cols = text_columns if text_cols is None else list(text_cols)
    list_cols = text_list_columns if list_cols is None else list(list_cols)

    # Work on a copy so the caller's frame stays intact and the cell can be
    # re-run without double-processing.
    df = df.copy()

    df[text_cols] = df[text_cols].fillna('missing')
    df[list_cols] = df[list_cols].fillna('[]')

    # Titles without an "<index>. " prefix (including the 'missing'
    # placeholder) do not match the pattern and would become NaN, which later
    # breaks string processing. Fall back to the unmodified title instead.
    stripped_title = df['title'].str.extract(r'^[A-Za-z0-9]+\.\s*(.+)', expand=False)
    df['title'] = stripped_title.fillna(df['title'])

    # expand=False yields a Series rather than a one-column DataFrame.
    df['TL'] = df['TL'].str.extract(r'(\d+)', expand=False).astype(int)
    df['ML'] = df['ML'].str.extract(r'(\d+)', expand=False).astype(int)

    df['tags'] = df['tags'].apply(ast.literal_eval)
    for column in list_cols:
        parsed = df[column].apply(ast.literal_eval)
        df[column] = parsed.apply(lambda parts: " ".join(parts))

    # errors='ignore' tolerates a file that lacks one of these columns.
    return df.drop(columns=['Unnamed: 0', 'contest-name'], errors='ignore')

# Run the structural clean-up on the training frame, then print its first row
# as a quick sanity check.
data = data.pipe(basic_preprocess)
print(data.iloc[0])

title                                                   Squid Game
TL                                                               2
ML                                                             256
Input            The first line contains $$$2$$$ integer $$$n$$...
Output           Print the minimum number of operations Mashtal...
Note             Explanation for the first sample:  In the firs...
Statement        After watching the new over-rated series Squid...
contest                                                       1610
index                                                            H
tags             [data structures, dfs and similar, greedy, trees]
rating                                                      3500.0
sample-input     \n6 3\n1 1 1 4 4\n1 5\n3 4\n2 6\n \n5 3\n1 1 3...
sample-output                                         \n2\n \n-1\n
Name: 0, dtype: object


In [38]:
def text_preprocess(df):
    """Normalise every column listed in ``text_features``.

    Each cell is lower-cased and tokenised; tokens that are not purely
    alphanumeric or that are English stopwords are discarded; the remaining
    tokens are Porter-stemmed, then WordNet-lemmatised, and joined with single
    spaces.

    The columns of ``df`` are overwritten in place and ``df`` is returned.
    """
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()
    wordnet = WordNetLemmatizer()

    def text_transform(text):
        """Return the normalised, space-joined tokens of ``text``."""
        kept = []
        for token in word_tokenize(text.lower()):
            if not token.isalnum() or token in english_stopwords:
                continue
            # Stem first, then lemmatise the stem.
            kept.append(wordnet.lemmatize(porter.stem(token)))
        return ' '.join(kept)

    for column in text_features:
        df[column] = df[column].apply(text_transform)
    return df

# Normalise the text columns of the training frame, then print the first row
# to inspect the result.
data = data.pipe(text_preprocess)
print(data.iloc[0])

title                                                   squid game
TL                                                               2
ML                                                             256
Input            first line contain 2 integ n 1 n 3 number vert...
Output           print minimum number oper mashtali way mashtal...
Note             explan first sampl first oper mashtali choos v...
Statement        watch new seri squid game mashtali soroush dec...
contest                                                       1610
index                                                            H
tags             [data structures, dfs and similar, greedy, trees]
rating                                                      3500.0
sample-input     6 3 1 1 1 4 4 1 5 3 4 2 6 5 3 1 1 3 3 1 2 1 4 1 5
sample-output                                                    2
Name: 0, dtype: object


In [39]:
def text_embedding(df):
    """Replace the long-text columns with mean-pooled transformer embeddings.

    Each cell of ``'Input'``, ``'Output'``, ``'Note'`` and ``'Statement'`` is
    encoded with ``sentence-transformers/all-MiniLM-L6-v2`` (truncated to 512
    tokens) and the last hidden state is averaged over the token axis. For
    every source column the vector components are stored in columns named
    ``'<column>_Emb_<i>'``; the four source columns are removed.

    Args:
        df: Frame containing the four text columns as strings. It is not
            modified.

    Returns:
        A new DataFrame with the same index as ``df``: the remaining original
        columns followed by the embedding columns of each source column.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    def generate_embeddings(text):
        """Return the mean-pooled last hidden state of ``text`` as a 1-D array."""
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        # Inference only: skip building the autograd graph for every row.
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy().flatten()

    embedding_columns = ['Input', 'Output', 'Note', 'Statement']
    # Width of one embedding; needed to shape the result when df has no rows.
    hidden_size = model.config.hidden_size

    embedding_frames = []
    for feature in embedding_columns:
        vectors = [generate_embeddings(text) for text in df[feature]]
        if vectors:
            matrix = np.stack(vectors)
        else:
            matrix = np.empty((0, hidden_size), dtype=np.float32)
        # Reuse df's index so the column-wise concat below lines rows up even
        # when df does not carry a default 0..n-1 index.
        embedding_frames.append(pd.DataFrame(
            matrix,
            index=df.index,
            columns=[f'{feature}_Emb_{i}' for i in range(matrix.shape[1])],
        ))

    # Single concat at the end instead of growing the frame once per column.
    return pd.concat([df.drop(columns=embedding_columns), *embedding_frames], axis=1)
    
# Swap the long-text columns of the training frame for their embedding
# columns, then print the first row.
data = data.pipe(text_embedding)
print(data.iloc[0])

title                squid game
TL                            2
ML                          256
contest                    1610
index                         H
                        ...    
Statement_Emb_379      0.021034
Statement_Emb_380      0.157798
Statement_Emb_381     -0.061132
Statement_Emb_382      -0.09207
Statement_Emb_383      0.066913
Name: 0, Length: 1545, dtype: object


In [40]:
def tag_labeling(df, classes=None):
    """One-hot encode the ``tags`` column into one 0/1 column per tag.

    Args:
        df: Frame whose ``'tags'`` column holds an iterable of tag names per
            row. It is not modified.
        classes: Optional ordered collection of tag names to use as the label
            columns (for example the columns obtained on the training data, so
            another split gets exactly the same label columns). When omitted,
            the labels are derived from the tags present in ``df``.

    Returns:
        ``(tags_df, labeled)`` where ``tags_df`` holds only the indicator
        columns and ``labeled`` is ``df`` without ``'tags'`` followed by those
        indicator columns. Both share the index of ``df``.
    """
    mlb = MultiLabelBinarizer(classes=None if classes is None else list(classes))
    tags_binarized = mlb.fit_transform(df['tags'])
    # Carry df's index over: the column-wise concat aligns on index labels, so
    # a default 0..n-1 index would misalign rows of a filtered or shuffled df.
    tags_df = pd.DataFrame(tags_binarized, columns=mlb.classes_, index=df.index)
    labeled = pd.concat([df.drop(columns='tags'), tags_df], axis=1)
    return tags_df, labeled

# Binarize the training tags: tags_df keeps the indicator columns on their
# own, data gains them in place of the 'tags' column. Print the first row.
tags_df, data = data.pipe(tag_labeling)
print(data.iloc[0])

title                       squid game
TL                                   2
ML                                 256
contest                           1610
index                                H
                               ...    
string suffix structures             0
strings                              0
ternary search                       0
trees                                1
two pointers                         0
Name: 0, Length: 1579, dtype: object


In [41]:
sub_data = data.iloc[:300]

# One binary classifier per target tag. Only 'dp' is trained here; other
# candidates are every column of tags_df, or a subset such as
# ['binary search', 'dp', 'math', 'shortest paths'].
for tag in ['dp']:
    # Every other tag column is a label, not a feature, so keep it out of the
    # model together with the problem 'index' column.
    ignored_features = tags_df.columns.difference([tag]).tolist()
    exp = setup(
        data=sub_data,
        target=tag,
        # The statement-like columns ('Input', 'Output', 'Note', 'Statement')
        # are already embedded, so only the remaining text columns are listed.
        text_features=['title', 'sample-input', 'sample-output'],
        ignore_features=[*ignored_features, 'index'],
        fix_imbalance=True,
        session_id=123,
    )
    # Fixed to logistic regression; compare_models() and finalize_model(model)
    # are not used in this run.
    model = create_model('lr')
    evaluate_model(model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,dp
2,Target type,Binary
3,Original data shape,"(300, 1579)"
4,Transformed data shape,"(414, 2584)"
5,Transformed train set shape,"(324, 2584)"
6,Transformed test set shape,"(90, 2584)"
7,Ignore features,39
8,Numeric features,1540
9,Text features,3


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7143,0.7059,0.5,0.3333,0.4,0.2222,0.2301
1,0.7619,0.8382,0.25,0.3333,0.2857,0.1463,0.1485
2,0.6667,0.7375,0.2,0.25,0.2222,0.0134,0.0136
3,0.7619,0.775,0.2,0.5,0.2857,0.1732,0.1995
4,0.5714,0.6625,0.2,0.1667,0.1818,-0.1053,-0.1061
5,0.6667,0.6375,0.2,0.25,0.2222,0.0134,0.0136
6,0.6667,0.5875,0.0,0.0,0.0,-0.1575,-0.1814
7,0.5714,0.7125,0.2,0.1667,0.1818,-0.1053,-0.1061
8,0.7619,0.4375,0.2,0.5,0.2857,0.1732,0.1995
9,0.7143,0.6,0.6,0.4286,0.5,0.3077,0.3162


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [43]:
# Push the held-out file through the same pipeline as the training data.
test = pd.read_csv('../data/test_data.csv')
test = basic_preprocess(test)
test = text_preprocess(test)
test = text_embedding(test)
# Bind the binarized test tags to their own name: `tag` is the loop variable
# of the training cell (a tag name), not a DataFrame.
test_tags_df, test = tag_labeling(test)
# The label columns above come from a binarizer fitted on the test tags only,
# so a tag that never occurs in this split has no column. Add every training
# label column that is absent as all zeros, so lookups such as test['dp']
# cannot fail. Existing columns and their values are left untouched.
test = test.reindex(columns=test.columns.union(tags_df.columns, sort=False), fill_value=0)

In [44]:
# Score the preprocessed test frame with `model`, the logistic regression
# created in the training cell. The 'prediction_label' column of the result is
# read in the next cell.
predictions = predict_model(model, data=test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7328,0.6608,0.3436,0.399,0.3692,0.2009,0.2019


In [45]:
# Attach the predicted labels to the test frame, then keep the rows where the
# prediction differs from the 'dp' indicator column.
y_pred = predictions['prediction_label']
test['Predicted'] = y_pred
wrong_predictions = test.loc[test['dp'].ne(test['Predicted'])]
wrong_predictions.head()

Unnamed: 0,title,TL,ML,contest,index,rating,sample-input,sample-output,Input_Emb_0,Input_Emb_1,...,probabilities,schedules,shortest paths,sortings,string suffix structures,strings,ternary search,trees,two pointers,Predicted
1,submarin rybinsk sea easi edit,2,256,1195,D1,1000.0,3 12 33 45 2 123 456 1 1 5 1000000000 10000000...,26730 1115598 11 265359409,-0.003349,0.224522,...,0,0,0,0,0,0,0,0,0,1
2,yet anoth array partit task,2,256,1114,B,1250.0,9 2 3 5 2 5 2 4 1 1 3 2 6 1 4 4 1 3 2 2 3 2 1 ...,21 3 5 12 1 3 5 0 1,-0.079452,0.365466,...,0,0,0,1,0,0,0,0,0,1
3,last minut enhanc,1,256,1466,B,750.0,5 6 1 2 2 2 5 6 2 4 4 6 1 1 3 4 4 5 1 1 6 1 1 ...,5 2 6 1 3,0.022157,0.045695,...,0,0,0,0,0,0,0,0,0,0
4,minimum number variabl,1,256,279,D,2000.0,51 2 3 6 8 33 6 5 62 4 8 6 10 18,2 3,-0.076375,0.150557,...,0,0,0,0,0,0,0,0,0,0
9,staircas,2,256,1598,E,,2 2 8 1 1 1 1 1 1 2 2 1 1 1 2 2 1 1 1 3 4 10 1...,5 10 5 2 5 3 1 0 49 35 24 29 49 39 31 23 29 27...,-0.148037,0.294611,...,0,0,0,0,0,0,0,0,0,0
