In [51]:
import pandas as pd
import nltk
import ast
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sympy import sympify
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from pycaret.classification import *

In [52]:
# Load the training set and keep the first 3000 rows as an independent copy.
# Copying matters because later preprocessing assigns columns on this frame;
# doing that on a bare slice triggers pandas' SettingWithCopyWarning.
data = pd.read_csv('../data/train_data.csv').iloc[:3000].copy()

# Free-text columns, and columns that hold a stringified Python list of samples.
text_columns = ['title', 'Input', 'Output', 'Note', 'Statement']
text_list_columns = ['sample-input', 'sample-output']
text_features = text_columns + text_list_columns

def basic_preprocess(df):
    """Return a structurally cleaned copy of a raw problem frame.

    Steps:
      * fill missing free-text cells with 'missing' and missing sample lists
        with '[]';
      * strip a leading "<index>. " prefix from the title (e.g. "H. Squid Game"
        -> "Squid Game"); titles without such a prefix are kept as they are;
      * reduce 'TL' and 'ML' to the first integer they contain;
      * parse 'tags' into a Python list and join each sample list into one
        space-separated string;
      * drop the 'Unnamed: 0' and 'contest-name' columns when present.

    Args:
        df: raw frame containing the text columns, 'TL', 'ML' and 'tags'.

    Returns:
        A new DataFrame; the input frame is left unmodified.

    Raises:
        ValueError: if a 'TL'/'ML' cell contains no digits (the NaN produced by
            the extraction cannot be cast to int).
    """
    df = df.copy()
    df[text_columns] = df[text_columns].fillna('missing')
    df[text_list_columns] = df[text_list_columns].fillna('[]')

    # Extraction yields NaN for titles that lack the "<index>. " prefix
    # (including the 'missing' placeholder); fall back to the original title so
    # downstream text processing never receives NaN.
    stripped_title = df['title'].str.extract(r'^[A-Za-z0-9]+\.\s*(.+)', expand=False)
    df['title'] = stripped_title.fillna(df['title'])

    # NOTE(review): only the first run of digits is kept, so a fractional limit
    # such as "1.5 seconds" would be stored as 1 - confirm this is acceptable.
    for limit_column in ('TL', 'ML'):
        df[limit_column] = df[limit_column].str.extract(r'(\d+)', expand=False).astype(int)

    df['tags'] = df['tags'].apply(ast.literal_eval)
    for list_column in text_list_columns:
        parsed = df[list_column].apply(ast.literal_eval)
        df[list_column] = parsed.apply(lambda parts: " ".join(parts))

    # errors='ignore' keeps the function usable on frames that were saved
    # without the index column or that lack the contest name.
    return df.drop(columns=['Unnamed: 0', 'contest-name'], errors='ignore')

# Run the structural clean-up on the training frame, then print the first row
# as a quick sanity check of the parsed columns.
data = basic_preprocess(data)
print(data.iloc[0])

title                                                   Squid Game
TL                                                               2
ML                                                             256
Input            The first line contains $$$2$$$ integer $$$n$$...
Output           Print the minimum number of operations Mashtal...
Note             Explanation for the first sample:  In the firs...
Statement        After watching the new over-rated series Squid...
contest                                                       1610
index                                                            H
tags             [data structures, dfs and similar, greedy, trees]
rating                                                      3500.0
sample-input     \n6 3\n1 1 1 4 4\n1 5\n3 4\n2 6\n \n5 3\n1 1 3...
sample-output                                         \n2\n \n-1\n
Name: 0, dtype: object


In [53]:
def text_preprocess(df):
    """Return a copy of `df` with every column in `text_features` normalised.

    Each cell is lower-cased, tokenised, stripped of non-alphanumeric tokens
    and English stop words, then stemmed (Porter) and lemmatised (WordNet);
    the surviving tokens are joined with single spaces.

    NOTE: the NLTK stop-word list, tokenizer models and WordNet data must
    already be available locally (e.g. installed via `nltk.download`).

    Args:
        df: frame containing all columns listed in `text_features`.

    Returns:
        A new DataFrame; the input frame is left unmodified.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    def text_transform(text):
        # Missing or non-string cells (e.g. NaN) would fail on `.lower()`;
        # treat them as empty text instead of aborting the whole column.
        if not isinstance(text, str):
            return ''
        tokens = word_tokenize(text.lower())
        kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]
        # Stem first, then lemmatise the stems (same order as before).
        normalised = [lemmatizer.lemmatize(stemmer.stem(tok)) for tok in kept]
        return ' '.join(normalised)

    df = df.copy()
    for feature in text_features:
        df[feature] = df[feature].apply(text_transform)
    return df

# Normalise all text columns of the training frame and print the first row to
# inspect the result.
data = text_preprocess(data)
print(data.iloc[0])

title                                                   squid game
TL                                                               2
ML                                                             256
Input            first line contain 2 integ n 1 n 3 number vert...
Output           print minimum number oper mashtali way mashtal...
Note             explan first sampl first oper mashtali choos v...
Statement        watch new seri squid game mashtali soroush dec...
contest                                                       1610
index                                                            H
tags             [data structures, dfs and similar, greedy, trees]
rating                                                      3500.0
sample-input     6 3 1 1 1 4 4 1 5 3 4 2 6 5 3 1 1 3 3 1 2 1 4 1 5
sample-output                                                    2
Name: 0, dtype: object


In [54]:
def tag_labeling(df):
    """Replace the list-valued 'tags' column with one 0/1 column per tag.

    A MultiLabelBinarizer is fitted on `df['tags']`, so the output only has
    columns for tags that occur in this particular frame.

    Args:
        df: frame whose 'tags' column holds an iterable of tag names per row.

    Returns:
        A new DataFrame with the original columns (minus 'tags') followed by
        the indicator columns; the input frame is left unmodified.
    """
    mlb = MultiLabelBinarizer()
    tags_binarized = mlb.fit_transform(df['tags'])
    # Reuse df's index: with a fresh RangeIndex, concat(axis=1) would align on
    # labels and misplace or NaN-pad rows whenever df is not indexed 0..n-1.
    tags_df = pd.DataFrame(tags_binarized, columns=mlb.classes_, index=df.index)
    return pd.concat([df.drop(columns='tags'), tags_df], axis=1)

# One-hot encode the tag lists of the training frame and print the first row
# to confirm the indicator columns were appended.
data = tag_labeling(data)
print(data.iloc[0])

title                                                               squid game
TL                                                                           2
ML                                                                         256
Input                        first line contain 2 integ n 1 n 3 number vert...
Output                       print minimum number oper mashtali way mashtal...
Note                         explan first sampl first oper mashtali choos v...
Statement                    watch new seri squid game mashtali soroush dec...
contest                                                                   1610
index                                                                        H
rating                                                                  3500.0
sample-input                 6 3 1 1 1 4 4 1 5 3 4 2 6 5 3 1 1 3 3 1 2 1 4 1 5
sample-output                                                                2
*special                                            

In [None]:
sub_data = data.iloc[:3000]

# `tags_df` only exists inside tag_labeling(), so referencing it here fails on
# a fresh kernel. Recover the one-hot tag columns as every column of `data`
# that is not a known input feature.
non_tag_columns = text_features + ['TL', 'ML', 'contest', 'index', 'rating']
tag_columns = data.columns.difference(non_tag_columns)

# One binary classifier is trained per tag. Extend the list (or iterate over
# `tag_columns`) to cover more tags, e.g.
# ['binary search', 'dp', 'math', 'shortest paths'].
for tag in ['dp']:
    # Exclude every other tag column so the model is not trained on the
    # remaining labels.
    ignored_features = list(tag_columns.difference([tag]))
    exp = setup(data=sub_data,
                target=tag,
                text_features=text_features,
                ignore_features=ignored_features + ['index'],
                session_id=123,
                )
    # Logistic regression baseline; `model` keeps the classifier of the last
    # tag in the list and is reused by the prediction cell.
    model = create_model('lr')
    evaluate_model(model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,dp
2,Target type,Binary
3,Original data shape,"(3000, 49)"
4,Transformed data shape,"(3000, 21108)"
5,Transformed train set shape,"(2100, 21108)"
6,Transformed test set shape,"(900, 21108)"
7,Ignore features,37
8,Numeric features,4
9,Text features,7


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [56]:
# Apply the same three preprocessing stages to the held-out set.
test = pd.read_csv('../data/test_data.csv')
test = basic_preprocess(test)
test = text_preprocess(test)
test = tag_labeling(test)
# tag_labeling() fits a fresh binarizer, so the test frame only gets columns
# for tags that occur in the test set. Re-align to the training schema: tags
# unseen in test become all-zero columns and tags unseen in training are
# dropped.
test = test.reindex(columns=data.columns, fill_value=0)

In [57]:
# Score the held-out frame with the classifier left in `model` by the training
# loop; the metrics table is rendered as this cell's output.
predictions = predict_model(model, data=test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7799,0.7135,0.1718,0.5532,0.2622,0.173,0.2142


In [58]:
# Attach the predicted labels to the test frame, then keep only the rows whose
# prediction disagrees with the true 'dp' indicator.
y_pred = predictions['prediction_label']
test['Predicted'] = y_pred
is_wrong = test['dp'].ne(test['Predicted'])
wrong_predictions = test.loc[is_wrong]
wrong_predictions.head()

Unnamed: 0,title,TL,ML,Input,Output,Note,Statement,contest,index,rating,...,probabilities,schedules,shortest paths,sortings,string suffix structures,strings,ternary search,trees,two pointers,Predicted
1,submarin rybinsk sea easi edit,2,256,first line input contain singl integ n 1 n num...,print answer modulo,miss,problem differ next one presenc constraint equ...,1195,D1,1000.0,...,0,0,0,0,0,0,0,0,0,1
3,last minut enhanc,1,256,input consist multipl test case first line con...,test case output singl line contain precis one...,first test case euterp increas second fifth si...,athenaeu finish creat latest music composit pr...,1466,B,750.0,...,0,0,0,0,0,0,0,0,0,0
4,minimum number variabl,1,256,first line contain integ n 1 n 23 second line ...,singl line print singl number minimum number v...,first sampl use two variabl b1 b2 perform foll...,got posit integ sequenc a1 a2 number sequenc d...,279,D,2000.0,...,0,0,0,0,0,0,0,0,0,0
9,staircas,2,256,first line contain three integ n q 1 n 1000 1 ...,print q integ valu equal number differ stairca...,miss,given matrix consist n row column row number t...,1598,E,,...,0,0,0,0,0,0,0,0,0,0
10,tree paint,2,256,first line contain integ n number vertic tree ...,print one integ maximum number point gain play...,first exampl tree shown problem statement,given tree undirect connect acycl graph consis...,1187,E,,...,0,0,0,0,0,0,0,1,0,0
