In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_curve, auc, roc_curve
import matplotlib.pyplot as plt
import csv

def load_data_to_csv(input_filename, output_filename):
    """Convert a JSON-lines shingle dataset into a dense one-hot CSV.

    Each non-blank line of ``input_filename`` is parsed as a JSON object that
    must provide ``'shingles'`` (an iterable of hashable values) and
    ``'target'``. The output CSV has a ``label`` column followed by one
    ``shingle_<i>`` indicator column per distinct shingle.

    Column indices are assigned in first-seen order, so the same input file
    always produces the same column layout (iterating a ``set`` does not
    guarantee that across interpreter runs).

    Args:
        input_filename: Path to the UTF-8 JSON-lines input file.
        output_filename: Path of the CSV file to create or overwrite.

    Raises:
        KeyError: If a successfully parsed record lacks ``'shingles'`` or
            ``'target'``.

    Lines that are not valid JSON are reported on stdout and skipped.
    """
    records = []
    # shingle -> zero-based feature column; dicts preserve insertion order.
    shingle_indices = {}

    with open(input_filename, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSON decoding error: {e}")
                continue

            shingles = record['shingles']
            # NOTE(review): the label is 1 when 'target' is falsy, i.e. the
            # polarity is inverted relative to the raw field - confirm intended.
            label = 1 if not record['target'] else 0
            for shingle in shingles:
                shingle_indices.setdefault(shingle, len(shingle_indices))
            records.append((shingles, label))

    n_features = len(shingle_indices)

    with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['label'] + [f'shingle_{i}' for i in range(n_features)])

        for shingles, label in records:
            # Column 0 holds the label; feature columns are shifted by one.
            row = [label] + [0] * n_features
            for shingle in shingles:
                row[shingle_indices[shingle] + 1] = 1
            writer.writerow(row)

def load_csv_data(filename):
    """Read a CSV with a ``label`` column and split it into features and labels.

    Args:
        filename: Path to a CSV file whose header includes ``label``.

    Returns:
        A ``(X, y)`` tuple of NumPy arrays: ``X`` holds every column except
        ``label`` (one row per CSV row) and ``y`` holds the ``label`` column.
    """
    frame = pd.read_csv(filename)
    # pop() removes the label column from the frame and hands it back, leaving
    # only the feature columns behind.
    targets = frame.pop('label').values
    features = frame.values
    return features, targets


# --- Configuration ----------------------------------------------------------
json_filename = '/content/dataset_shingles_maria_sample.json'
csv_filename = '/content/dataset_shingles.csv'
SEED = 42          # shared by the train/validation split and the models
SAMPLE_STEP = 200  # increment between training-subset sizes


def make_classifier():
    """Return a fresh, unfitted CatBoost classifier with this notebook's settings."""
    return CatBoostClassifier(
        iterations=100,
        learning_rate=0.1,
        loss_function='Logloss',
        eval_metric='Accuracy',
        random_seed=SEED,
        verbose=0,
    )


# --- Load data --------------------------------------------------------------
load_data_to_csv(json_filename, csv_filename)
X, y = load_csv_data(csv_filename)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)

# The validation pool is identical for every fit, so build it once.
val_pool = Pool(X_val, y_val)

# --- Sample elasticity: validation accuracy vs. training-set size -----------
# Always finish with the full training set; this also keeps the list non-empty
# when there are SAMPLE_STEP rows or fewer.
n_train = len(X_train)
sample_sizes = list(range(SAMPLE_STEP, n_train, SAMPLE_STEP)) + [n_train]
elasticity_results = []

for sample_size in sample_sizes:
    # Prefix of the (already shuffled) training split.
    X_train_sample = X_train[:sample_size]
    y_train_sample = y_train[:sample_size]

    model = make_classifier()
    train_pool = Pool(X_train_sample, y_train_sample)
    model.fit(train_pool, eval_set=val_pool, use_best_model=False, plot=False, verbose=0)

    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    elasticity_results.append((sample_size, accuracy))

sample_sizes, accuracies = zip(*elasticity_results)

fig, ax = plt.subplots()
ax.plot(sample_sizes, accuracies, marker='o')
ax.set_xlabel("Sample Size")
ax.set_ylabel("Validation Accuracy")
ax.set_title("Sample Elasticity Curve")
plt.show()

# --- Refit at the best sample size ------------------------------------------
# Ties resolve to the smallest sample size (first maximum in the list).
optimal_sample_size = sample_sizes[accuracies.index(max(accuracies))]
X_train_optimal = X_train[:optimal_sample_size]
y_train_optimal = y_train[:optimal_sample_size]

# Use a brand-new model instead of whatever instance the loop left behind.
model = make_classifier()
model.fit(X_train_optimal, y_train_optimal, eval_set=val_pool, use_best_model=False, plot=False, verbose=0)

# --- Evaluate ---------------------------------------------------------------
# NOTE(review): the same validation split chose optimal_sample_size and is
# scored here, so these metrics are optimistic; a separate test split would
# give an unbiased estimate.
y_pred_probs = model.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_pred_probs)

precision, recall, _ = precision_recall_curve(y_val, y_pred_probs)
pr_auc = auc(recall, precision)

fpr, tpr, _ = roc_curve(y_val, y_pred_probs)

fig, (ax_pr, ax_roc) = plt.subplots(1, 2, figsize=(12, 5))

ax_pr.plot(recall, precision, label=f'PR AUC = {pr_auc:.2f}')
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title("Precision-Recall Curve")
ax_pr.legend()

ax_roc.plot(fpr, tpr, label=f'ROC AUC = {roc_auc:.2f}')
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("ROC Curve")
ax_roc.legend()

fig.tight_layout()
plt.show()


JSON decoding error: Unterminated string starting at: line 1 column 106523 (char 106522)


KeyboardInterrupt: 