In [1]:
!pip install catboost



In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, log_loss
import matplotlib.pyplot as plt
import csv
from tabulate import tabulate

def load_data_to_csv(input_filename, output_filename):
    """Convert a JSON-lines shingle dataset into a one-hot encoded CSV.

    Every non-blank line of ``input_filename`` is parsed as a JSON object
    that is indexed with the keys ``'shingles'`` (an iterable of hashable
    values) and ``'target'``. The output CSV has a header row, a ``label``
    column (1 when ``target`` is falsy, 0 otherwise) and one 0/1 column
    ``shingle_<i>`` per distinct shingle.

    Column indices are assigned in order of first appearance of each
    shingle in the input, so the same input file always yields the same
    column layout (iterating a ``set`` of strings does not guarantee that
    across interpreter runs).

    Lines that are not valid JSON are reported on stdout and skipped.

    Args:
        input_filename: Path of the UTF-8 JSON-lines file to read.
        output_filename: Path of the CSV file to create or overwrite.

    Raises:
        KeyError: If a parsed JSON object lacks ``'shingles'`` or
            ``'target'``.
    """
    records = []
    # Maps each distinct shingle to its 0-based feature column, numbered in
    # first-seen order; dicts preserve insertion order, so this is stable.
    shingle_indices = {}

    with open(input_filename, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue
            # Only the parse itself is guarded: malformed lines are skipped,
            # while structurally wrong records still raise as before.
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSON decoding error: {e}")
                continue

            shingles = record['shingles']
            label = 1 if not record['target'] else 0
            records.append((shingles, label))
            for shingle in shingles:
                shingle_indices.setdefault(shingle, len(shingle_indices))

    feature_count = len(shingle_indices)

    with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(
            ['label'] + [f'shingle_{i}' for i in range(feature_count)]
        )

        for shingles, label in records:
            row = [0] * (feature_count + 1)
            row[0] = label
            for shingle in shingles:
                # +1 because column 0 holds the label.
                row[shingle_indices[shingle] + 1] = 1
            writer.writerow(row)

def load_csv_data(filename):
    """Read a CSV with a ``label`` column and split it into arrays.

    Args:
        filename: Path of the CSV file to read with pandas.

    Returns:
        A ``(X, y)`` tuple: ``X`` holds the values of every column except
        ``label``; ``y`` holds the values of the ``label`` column.
    """
    frame = pd.read_csv(filename)
    # pop() removes the label column from the local frame and hands it back,
    # leaving only the feature columns behind.
    targets = frame.pop('label').values
    features = frame.values
    return features, targets

# Input JSON-lines dataset and the intermediate one-hot CSV derived from it.
json_filename = '/content/dataset_shingles_maria_sample.json'
csv_filename = '/content/dataset_shingles.csv'

load_data_to_csv(json_filename, csv_filename)
X, y = load_csv_data(csv_filename)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

train_pool = Pool(X_train, y_train)
val_pool = Pool(X_val, y_val)

model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    loss_function='Logloss',
    eval_metric='Accuracy',
    verbose=0,
)
model.fit(
    train_pool,
    eval_set=val_pool,
    use_best_model=False,
    plot=False,
    verbose=0,
)

# Per-iteration metric histories recorded during fit, keyed by dataset name
# and then by metric name.
evals_result = model.get_evals_result()
learn_metrics = evals_result['learn']
validation_metrics = evals_result['validation']

train_losses = learn_metrics['Logloss']
val_losses = validation_metrics['Logloss']
train_accuracies = learn_metrics['Accuracy']
val_accuracies = validation_metrics['Accuracy']

# One table row per iteration, numbered from 1.
table_rows = [
    [number, *values]
    for number, values in enumerate(
        zip(train_losses, val_losses, train_accuracies, val_accuracies),
        start=1,
    )
]
print(tabulate(
    table_rows,
    headers=["Epoch", "Train Loss", "Val Loss", "Train Accuracy", "Val Accuracy"],
    floatfmt=".4f",
))

plt.figure(figsize=(12, 5))

# Each entry describes one side-by-side panel:
# (y-axis label, title, [(series, legend label), ...]).
panels = [
    (
        'Log Loss',
        'Log Loss over Epochs',
        [(train_losses, 'Train Loss'), (val_losses, 'Val Loss')],
    ),
    (
        'Accuracy',
        'Accuracy over Epochs',
        [(train_accuracies, 'Train Accuracy'), (val_accuracies, 'Val Accuracy')],
    ),
]

for position, (y_label, title, curves) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    for series, legend_label in curves:
        plt.plot(range(1, len(series) + 1), series, label=legend_label)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()

plt.tight_layout()
plt.show()
