In [2]:
import json
import csv
from collections import defaultdict

def count_shingles(filename):
    bad_texts_shingle_counts = defaultdict(int)
    all_texts_shingle_counts = defaultdict(int)

    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip():
                try:
                    data = json.loads(line)

                    for shingle in data['shingles']:
                        all_texts_shingle_counts[shingle] += 1

                    if not data['target']:
                        for shingle in data['shingles']:
                            bad_texts_shingle_counts[shingle] += 1

                except json.JSONDecodeError as e:
                    print(f"Ошибка декодирования JSON: {e}")

    return bad_texts_shingle_counts, all_texts_shingle_counts

def save_to_csv(bad_texts_counts, all_texts_counts, output_filename):
    with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)

        writer.writerow(['Shingle', 'Count in bad texts', 'Count in all texts'])

        all_shingles = set(all_texts_counts.keys()).union(bad_texts_counts.keys())
        for shingle in all_shingles:
            writer.writerow([
                shingle,
                bad_texts_counts.get(shingle, 0),
                all_texts_counts.get(shingle, 0)
            ])

filename = '/content/dataset_shingles_maria_sample.json'
bad_texts_counts, all_texts_counts = count_shingles(filename)

output_filename = 'shingle_counts.csv'
save_to_csv(bad_texts_counts, all_texts_counts, output_filename)

Ошибка декодирования JSON: Unterminated string starting at: line 1 column 31068 (char 31067)


In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [4]:
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from catboost import CatBoostClassifier
import numpy as np

def load_data(filename):
    X = []
    y = []

    # Чтение данных из файла
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip():
                try:
                    data = json.loads(line)
                    X.append(data['shingles'])  # Шинглы - признаки
                    y.append(int(data['target']))  # Target (0 или 1)
                except json.JSONDecodeError as e:
                    print(f"Ошибка декодирования JSON: {e}")

    return X, y

filename = '/content/dataset_shingles_maria_sample.json'
X, y = load_data(filename)

mlb = MultiLabelBinarizer()
X_transformed = mlb.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)


model = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=False)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = np.mean(y_pred == y_test)
print(f"Точность модели: {accuracy:.4f}")


Ошибка декодирования JSON: Unterminated string starting at: line 1 column 5322 (char 5321)
Точность модели: 0.9438


In [5]:
import json
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from catboost import CatBoostClassifier
import numpy as np

def calculate_shingle_counts(filename):
    bad_shingle_counts = defaultdict(int)
    all_shingle_counts = defaultdict(int)

    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip():
                try:
                    data = json.loads(line)
                    target = data['target']
                    shingles = data['shingles']
                    for shingle in shingles:
                        all_shingle_counts[shingle] += 1
                        if not target:
                            bad_shingle_counts[shingle] += 1
                except json.JSONDecodeError as e:
                    print(f"Ошибка декодирования JSON: {e}")

    return bad_shingle_counts, all_shingle_counts

def load_data_with_counts(filename, bad_shingle_counts, all_shingle_counts):
    X = []
    y = []

    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip():
                try:
                    data = json.loads(line)
                    shingles = data['shingles']
                    target = data['target']

                    # Count in bad texts / Count in all texts
                    shingle_features = []
                    for shingle in shingles:
                        bad_count = bad_shingle_counts[shingle]
                        all_count = all_shingle_counts[shingle]
                        ratio = bad_count / all_count if all_count > 0 else 0
                        shingle_features.append(ratio)

                    X.append(shingle_features)
                    y.append(int(target))
                except json.JSONDecodeError as e:
                    print(f"Ошибка декодирования JSON: {e}")

    return X, y


filename = '/content/dataset_shingles_maria_sample.json'
bad_shingle_counts, all_shingle_counts = calculate_shingle_counts(filename)


X, y = load_data_with_counts(filename, bad_shingle_counts, all_shingle_counts)

max_shingles = max(len(x) for x in X)
X_padded = [x + [0] * (max_shingles - len(x)) for x in X]


X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

model = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=False)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)


accuracy = np.mean(y_pred == y_test)
print(f"Точность модели: {accuracy:.4f}")


Ошибка декодирования JSON: Unterminated string starting at: line 1 column 836 (char 835)
Ошибка декодирования JSON: Unterminated string starting at: line 1 column 836 (char 835)
Точность модели: 0.9988
