In [None]:
!pip install pycaret
!pip install catboost
!pip install lightgbm

In [1]:
import pandas as pd
from pycaret.classification import *

In [27]:
# Read the training set, the test set and the submission template, in that order.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/train-augmented.csv',
        '/content/test-augmented.csv',
        '/content/sample_submission.csv',
    )
)

In [28]:
# Free-text columns are removed from both frames before modelling.
text_columns = ['sentence', 'Text_english_translation']

# The auto-named 'Unnamed: N' column is relabelled 'SYM' in each frame.
# NOTE(review): the source column differs between the files
# ('Unnamed: 16' in train, 'Unnamed: 25' in test) - confirm both really hold SYM.
train = train.drop(columns=text_columns).rename(columns={'Unnamed: 16': 'SYM'})
test = test.drop(columns=text_columns).rename(columns={'Unnamed: 25': 'SYM'})

In [29]:
# Configure the PyCaret classification experiment on the training frame.
setup_options = dict(
    target='difficulty',                 # column to predict
    numeric_features=['SYM', 'INTJ'],    # force these two columns to be treated as numeric
    train_size=0.9,
    fold=5,
    pca=False,
    normalize=True,
    silent=True,
    session_id=707,                      # fixed seed for reproducibility
)
experiment = setup(data=train, **setup_options)

Unnamed: 0,Description,Value
0,session_id,707
1,Target,difficulty
2,Target Type,Multiclass
3,Label Encoded,"A1: 0, A2: 1, B1: 2, B2: 3, C1: 4, C2: 5"
4,Original Data,"(4800, 35)"
5,Missing Values,False
6,Numeric Features,33
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [39]:
# Benchmark the available classifiers and show the leaderboard (the recorded
# output below is ordered by Accuracy). The returned estimator is not stored;
# it is only displayed as the cell's output.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.4333,0.8198,0.433,0.4298,0.428,0.3199,0.3209,0.352
catboost,CatBoost Classifier,0.4315,0.8079,0.4313,0.4299,0.4299,0.3177,0.318,17.248
et,Extra Trees Classifier,0.4248,0.8011,0.4246,0.4228,0.4225,0.3097,0.31,0.788
gbc,Gradient Boosting Classifier,0.4197,0.8051,0.4194,0.4206,0.4193,0.3036,0.3038,5.242
lightgbm,Light Gradient Boosting Machine,0.4183,0.7975,0.4181,0.4169,0.4171,0.3019,0.302,0.958
rf,Random Forest Classifier,0.4146,0.8016,0.4144,0.4146,0.4132,0.2975,0.2978,0.928
lda,Linear Discriminant Analysis,0.4118,0.8052,0.4115,0.4194,0.4114,0.2941,0.2954,0.036
ridge,Ridge Classifier,0.4079,0.0,0.4074,0.4066,0.3988,0.2892,0.2916,0.026
ada,Ada Boost Classifier,0.3815,0.7205,0.3807,0.3698,0.3611,0.2575,0.262,0.288
knn,K Neighbors Classifier,0.3606,0.7112,0.3601,0.36,0.3523,0.2326,0.2344,0.272


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=707, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
# Create the model with the CatBoost classifier; per-fold cross-validation
# scores are shown in the output below.
model = create_model('catboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.4433,0.8143,0.443,0.4416,0.4419,0.3319,0.332
1,0.4086,0.7964,0.4085,0.4066,0.4066,0.2903,0.2906
2,0.4259,0.8065,0.426,0.4257,0.4257,0.3111,0.3111
3,0.4444,0.819,0.4442,0.4444,0.4436,0.3333,0.3336
4,0.4352,0.8032,0.4347,0.431,0.4319,0.3222,0.3225
Mean,0.4315,0.8079,0.4313,0.4299,0.4299,0.3177,0.318
SD,0.0132,0.008,0.0131,0.0135,0.0134,0.0159,0.0159


In [48]:
# Bag the CatBoost model to try to get better metrics.
# In the recorded run the mean CV accuracy was essentially unchanged
# (0.4312 bagged vs 0.4315 for the single model).
bagged_model = ensemble_model(model, method = 'Bagging')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.4363,0.8176,0.436,0.4349,0.435,0.3235,0.3237
1,0.4039,0.7959,0.4039,0.4022,0.4025,0.2847,0.2849
2,0.4225,0.8084,0.4225,0.423,0.4221,0.3069,0.3071
3,0.4606,0.8242,0.4603,0.4594,0.4589,0.3527,0.3531
4,0.4329,0.804,0.4325,0.4316,0.4305,0.3194,0.3198
Mean,0.4312,0.81,0.431,0.4302,0.4298,0.3174,0.3177
SD,0.0185,0.01,0.0184,0.0185,0.0183,0.0222,0.0223


In [49]:
# Finalize the bagged model before scoring the test set. finalize_model is
# documented by PyCaret to refit the estimator on the complete dataset,
# including the hold-out split reserved by setup().
final_model = finalize_model(bagged_model)

In [50]:
# Score the test frame with the finalized model; the predicted class is read
# from the returned frame's 'Label' column when building the submission.
predictions = predict_model(final_model, data=test)

In [51]:
# Copy the predicted labels into the submission template by position.
# Assigning the Series directly would align on index and silently insert NaN
# wherever the two indexes differ; assigning the underlying array instead
# raises a ValueError if the row counts do not match.
sample_submission['difficulty'] = predictions['Label'].to_numpy()

In [52]:
# Write the submission file to the current working directory, omitting the
# DataFrame index so only the template's own columns are saved.
sample_submission.to_csv('pycaret-catboost-with-cognates.csv', index=False)