In [1]:
import json
import pandas as pd
from tqdm.notebook import tqdm

data_file = 'dataset_shingles_maria_sample_5'
l = []

with open(data_file) as reader:
    for obj in tqdm(reader):
        try:
            line = json.loads(obj)
            l.append(line)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            continue  # Skip invalid lines


df_data = pd.DataFrame(l)

0it [00:00, ?it/s]

In [2]:
df_tmp = df_data.iloc[:100]

In [3]:
df_tmp

Unnamed: 0,num,hard_target,shingles,soft_target
0,150381,False,"[396334907, 1060847325, 1063332450, 1330563559...",False
1,150383,False,"[396334907, 1060847325, 1063332450, 1330563559...",False
2,150384,False,"[227257338, 329292840, 463346217, 1029363358, ...",True
3,150385,False,"[104601096, 422258184, 549984431, 560494154, 5...",True
4,150387,False,"[2039583, 4137730, 5047731, 6762261, 6993977, ...",True
...,...,...,...,...
95,150542,False,"[3269003, 12741328, 14442492, 16473842, 230843...",False
96,150543,False,"[12096343, 19752179, 20324125, 23799634, 25459...",False
97,150544,False,"[409378754, 1014151031, 1437623850, 1850578463...",True
98,150553,False,"[43447116, 55599410, 81608462, 122942364, 1364...",False


In [5]:
from collections import defaultdict

shingle_counts = defaultdict(lambda: {"b": 0, "c": 0})
for _, row in df_data.iterrows():
    for shingle in row['shingles']:
        shingle_counts[shingle]["c"] += 1
        if row['hard_target'] or row['soft_target']:
            shingle_counts[shingle]["b"] += 1

selected_shingles = {
    shingle: counts
    for shingle, counts in shingle_counts.items()
    if counts["b"] > 1 or counts["c"] > 100
}

shingle_groups = {k: set() for k in range(10)}
for shingle, counts in selected_shingles.items():
    ratio = counts["b"] / counts["c"]
    for k in range(10):
        if 0.1 * k <= ratio < 0.1 * (k + 1):
            shingle_groups[k].add(shingle)
            break

In [6]:
final_table = []
for _, row in df_data.iterrows():
    shingles = set(row['shingles'])
    features = {
        f"feature_{k}": len(shingles.intersection(shingle_groups[k])) / len(shingles) if shingles else 0
        for k in range(10)
    }
    features.update({
        "num": row['num'],
        "soft_target": row['soft_target'],
        "hard_target": row['hard_target']
    })
    final_table.append(features)

In [7]:
final_table

[{'feature_0': 1.0,
  'feature_1': 0.0,
  'feature_2': 0.0,
  'feature_3': 0.0,
  'feature_4': 0.0,
  'feature_5': 0.0,
  'feature_6': 0.0,
  'feature_7': 0.0,
  'feature_8': 0.0,
  'feature_9': 0.0,
  'num': 150381,
  'soft_target': False,
  'hard_target': False},
 {'feature_0': 1.0,
  'feature_1': 0.0,
  'feature_2': 0.0,
  'feature_3': 0.0,
  'feature_4': 0.0,
  'feature_5': 0.0,
  'feature_6': 0.0,
  'feature_7': 0.0,
  'feature_8': 0.0,
  'feature_9': 0.0,
  'num': 150383,
  'soft_target': False,
  'hard_target': False},
 {'feature_0': 0.0,
  'feature_1': 0.0,
  'feature_2': 0.0,
  'feature_3': 0.0,
  'feature_4': 0.0,
  'feature_5': 0.0,
  'feature_6': 0.0,
  'feature_7': 0.0,
  'feature_8': 0.0,
  'feature_9': 0.0,
  'num': 150384,
  'soft_target': True,
  'hard_target': False},
 {'feature_0': 0.0,
  'feature_1': 0.0,
  'feature_2': 0.0,
  'feature_3': 0.0,
  'feature_4': 0.03571428571428571,
  'feature_5': 0.03571428571428571,
  'feature_6': 0.0,
  'feature_7': 0.0,
  'feature_

In [9]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [12]:
final_df = final_table

In [16]:
final_df = pd.DataFrame(final_df)

In [18]:
final_df

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,num,soft_target,hard_target
0,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,150381,False,False
1,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,150383,False,False
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,150384,True,False
3,0.000000,0.000000,0.000000,0.000000,0.035714,0.035714,0.0,0.000000,0.000000,0.00000,150385,True,False
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000484,0.0,0.001935,0.006289,0.02806,150387,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,188241,False,False
19996,0.015453,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,188243,False,False
19997,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,188247,False,False
19998,0.007236,0.100579,0.029667,0.051375,0.054269,0.011577,0.0,0.000000,0.000724,0.00000,188250,False,False


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import numpy as np

final_df['hard_target_correct'] = final_df['hard_target']
final_df['soft_target_correct'] = final_df['soft_target']

X = final_df[['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
              'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9']]
y_hard = final_df['hard_target_correct'].astype(int)
y_soft = final_df['soft_target_correct'].astype(int)

X_train_hard, X_test_hard, y_train_hard, y_test_hard = train_test_split(X, y_hard, test_size=0.3, random_state=42)
X_train_soft, X_test_soft, y_train_soft, y_test_soft = train_test_split(X, y_soft, test_size=0.3, random_state=42)

def train_and_evaluate_model(X_train, X_test, y_train, y_test, model_name):
    results = {}

    cat_model = CatBoostClassifier(verbose=0)
    cat_model.fit(X_train, y_train)
    y_pred_cat = cat_model.predict(X_test)
    y_prob_cat = cat_model.predict_proba(X_test)[:, 1]

    results['CatBoost'] = {
        'Accuracy': accuracy_score(y_test, y_pred_cat),
        'ROC-AUC': roc_auc_score(y_test, y_prob_cat),
        'Precision': precision_score(y_test, y_pred_cat),
        'Recall': recall_score(y_test, y_pred_cat),
    }

    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_test)
    y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

    results['XGBoost'] = {
        'Accuracy': accuracy_score(y_test, y_pred_xgb),
        'ROC-AUC': roc_auc_score(y_test, y_prob_xgb),
        'Precision': precision_score(y_test, y_pred_xgb),
        'Recall': recall_score(y_test, y_pred_xgb),
    }

    print(f"\nMetrics for {model_name}:")
    for model, metrics in results.items():
        print(f"{model}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.4f}")

    return results

hard_results = train_and_evaluate_model(X_train_hard, X_test_hard, y_train_hard, y_test_hard, "Hard Target")

soft_results = train_and_evaluate_model(X_train_soft, X_test_soft, y_train_soft, y_test_soft, "Soft Target")

Parameters: { "use_label_encoder" } are not used.




Metrics for Hard Target:
CatBoost:
  Accuracy: 0.9620
  ROC-AUC: 0.9179
  Precision: 0.8030
  Recall: 0.5524
XGBoost:
  Accuracy: 0.9623
  ROC-AUC: 0.9222
  Precision: 0.8022
  Recall: 0.5601

Metrics for Soft Target:
CatBoost:
  Accuracy: 0.9072
  ROC-AUC: 0.9135
  Precision: 0.9022
  Recall: 0.7081
XGBoost:
  Accuracy: 0.9067
  ROC-AUC: 0.9134
  Precision: 0.9054
  Recall: 0.7028


Parameters: { "use_label_encoder" } are not used.

