In [1]:
import json
import os

input_file = '/content/dataset_shingles_maria_sample.json'
output_dir = '/content/parts'
output_base = 'split_file_'
records_per_file = 200

os.makedirs(output_dir, exist_ok=True)

with open(input_file, 'r') as file:
    file_count = 0
    records = []

    for line in file:
        try:
            record = json.loads(line)
            records.append(record)

            if len(records) == records_per_file:
                output_file = os.path.join(output_dir, f'{output_base}{file_count}.jsonl')
                with open(output_file, 'w') as out_file:
                    for rec in records:
                        json.dump(rec, out_file)
                        out_file.write('\n')
                file_count += 1
                records = []
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            continue

    if records:
        output_file = os.path.join(output_dir, f'{output_base}{file_count}.jsonl')
        with open(output_file, 'w') as out_file:
            for rec in records:
                json.dump(rec, out_file)
                out_file.write('\n')


Error decoding JSON: Unterminated string starting at: line 1 column 27704 (char 27703)


In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from catboost import CatBoostClassifier
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score
import glob
import matplotlib.pyplot as plt

file_paths = sorted(glob.glob('/content/parts/split_file_*.jsonl'))
incremental_accuracies = []
num_files_loaded = []

all_data_train = []

param_grid = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5]
}

base_model = CatBoostClassifier(auto_class_weights='Balanced', verbose=0)

all_data_full = []

for file_path in file_paths:
    data = pd.read_json(file_path, lines=True)
    reshaped_data = [{'shingle_id': int(shingle), 'target': row['target']} for _, row in data.iterrows() for shingle in row['shingles']]
    reshaped_df = pd.DataFrame(reshaped_data)
    all_data_full.append(reshaped_df)

final_data = pd.concat(all_data_full, ignore_index=True)

X_final = final_data[['shingle_id']]
y_final = final_data['target']
_, X_test_final, _, y_test_final = train_test_split(X_final, y_final, test_size=0.2, random_state=42)

for file_idx, file_path in enumerate(file_paths):
    try:
        data = pd.read_json(file_path, lines=True)
        reshaped_data = [{'shingle_id': int(shingle), 'target': row['target']} for _, row in data.iterrows() for shingle in row['shingles']]
        reshaped_df = pd.DataFrame(reshaped_data)

        all_data_train.append(reshaped_df)
        current_data = pd.concat(all_data_train, ignore_index=True)

        X_train = current_data[['shingle_id']]
        y_train = current_data['target']

        grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, scoring='accuracy', cv=3)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        y_pred_final = best_model.predict(X_test_final)
        y_pred_proba_final = best_model.predict_proba(X_test_final)[:, 1]
        accuracy = accuracy_score(y_test_final, y_pred_final)
        roc_auc = roc_auc_score(y_test_final, y_pred_proba_final)
        precision = precision_score(y_test_final, y_pred_final)
        recall = recall_score(y_test_final, y_pred_final)

        incremental_accuracies.append(accuracy)
        num_files_loaded.append(file_idx + 1)

        print(f"After loading {file_idx + 1} files:")
        print(f"Best Params: {grid_search.best_params_}")
        print(f"Accuracy: {accuracy:.4f}, ROC-AUC: {roc_auc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    except Exception as e:
        print(f"Error processing file {file_path}: {e}. Moving on to the next file.")

plt.figure(figsize=(8, 6))
plt.plot(num_files_loaded, incremental_accuracies, marker='o', color='b', label="Accuracy")
plt.title("Accuracy vs Number of Files Loaded")
plt.xlabel("Number of Files Loaded (Training Data Size)")
plt.ylabel("Accuracy on Full Test Set")
plt.legend()
plt.show()


After loading 1 files:
Best Params: {'depth': 4, 'iterations': 50, 'l2_leaf_reg': 1, 'learning_rate': 0.01}
Accuracy: 0.4351, ROC-AUC: 0.5089, Precision: 0.1264, Recall: 0.5974
After loading 2 files:
Best Params: {'depth': 8, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.1}
Accuracy: 0.5220, ROC-AUC: 0.5166, Precision: 0.1297, Recall: 0.4963
After loading 3 files:
Best Params: {'depth': 4, 'iterations': 50, 'l2_leaf_reg': 1, 'learning_rate': 0.1}
Accuracy: 0.4832, ROC-AUC: 0.5091, Precision: 0.1273, Recall: 0.5371
After loading 4 files:
Best Params: {'depth': 4, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.1}
Accuracy: 0.5283, ROC-AUC: 0.5157, Precision: 0.1294, Recall: 0.4861
After loading 5 files:
Best Params: {'depth': 4, 'iterations': 50, 'l2_leaf_reg': 1, 'learning_rate': 0.01}
Accuracy: 0.5852, ROC-AUC: 0.5070, Precision: 0.1276, Recall: 0.3986
After loading 6 files:
Best Params: {'depth': 6, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.1}
Accuracy