In [1]:
import json
import csv
from collections import defaultdict

def count_shingle_occurrences(filename):
    """Count, per shingle, how many texts and how many "bad" texts contain it.

    The file is read as JSON lines: every non-blank line is parsed as one
    JSON object with a ``'target'`` field and a ``'shingles'`` field.  A
    record whose ``'target'`` is falsy is counted as a bad text.  Shingles
    are de-duplicated per record, so each count is a number of texts, not a
    number of occurrences.

    Lines that are not valid JSON, and records that lack one of the two
    fields (or are not JSON objects at all), are reported on stdout and
    skipped; they do not contribute to any of the returned numbers.

    Args:
        filename: Path of the UTF-8 encoded JSON-lines file.

    Returns:
        A tuple ``(shingle_bad_counts, shingle_total_counts, total_texts,
        bad_texts)`` where the first two items are ``defaultdict(int)``
        mappings from shingle to number of bad texts / all texts containing
        it, and the last two are the number of counted records and the
        number of those that were bad.
    """
    shingle_bad_counts = defaultdict(int)
    shingle_total_counts = defaultdict(int)
    total_texts = 0
    bad_texts = 0

    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSON decoding error: {e}")
                continue

            # Validate the record before touching any counter, so a broken
            # record can neither abort the run nor be half-counted.
            try:
                is_bad = not data['target']
                unique_shingles = set(data['shingles'])
            except (KeyError, TypeError) as e:
                print(f"Skipping malformed record: {e!r}")
                continue

            total_texts += 1
            if is_bad:
                bad_texts += 1

            for shingle in unique_shingles:
                shingle_total_counts[shingle] += 1
                if is_bad:
                    shingle_bad_counts[shingle] += 1

    return shingle_bad_counts, shingle_total_counts, total_texts, bad_texts

def classify_shingles_and_evaluate(filename, shingle_bad_counts, shingle_total_counts, total_texts, bad_texts):
    """Classify every record of a JSON-lines file and print the accuracy.

    For each record the phishing likelihood is the mean, over the entries of
    its ``'shingles'`` list, of ``bad_count / total_count`` for that shingle;
    shingles with no total count contribute 0 to the sum but still count in
    the denominator.  A record is predicted as phishing (1) when that
    likelihood is strictly greater than the base rate
    ``bad_texts / total_texts``, and the prediction is compared with the
    label derived from ``'target'`` (falsy target means phishing).

    A record with an empty shingle list gets likelihood 0.  Lines that are
    not valid JSON and records missing a required field are reported on
    stdout and skipped without being counted as predictions.

    Args:
        filename: Path of the UTF-8 encoded JSON-lines file to evaluate.
        shingle_bad_counts: Mapping shingle -> number of bad texts with it.
            Only read, never modified.
        shingle_total_counts: Mapping shingle -> number of texts with it.
        total_texts: Number of texts the counts were collected from.
        bad_texts: Number of those texts that were bad.

    Returns:
        None.  The accuracy is printed as ``Accuracy: <percent>``.
    """
    # The decision threshold is the share of bad texts in the counted data,
    # not a fixed 50%.  With no counted texts there is no base rate, so use 0.
    threshold = bad_texts / total_texts if total_texts else 0.0

    correct_predictions = 0
    total_predictions = 0

    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSON decoding error: {e}")
                continue

            try:
                shingles = data['shingles']
                actual = 1 if not data['target'] else 0
                shingle_count = len(shingles)
                # .get() keeps the lookup read-only (indexing a defaultdict
                # would insert zero entries into the caller's mapping) and
                # the truthiness filter skips unknown or zero-count shingles.
                score = sum(
                    shingle_bad_counts.get(shingle, 0) / shingle_total_counts[shingle]
                    for shingle in shingles
                    if shingle_total_counts.get(shingle, 0)
                )
            except (KeyError, TypeError) as e:
                print(f"Skipping malformed record: {e!r}")
                continue

            total_predictions += 1
            # No shingles means no evidence: treat the likelihood as 0
            # instead of dividing by zero.
            phishing_likelihood = score / shingle_count if shingle_count else 0.0

            prediction = 1 if phishing_likelihood > threshold else 0
            if prediction == actual:
                correct_predictions += 1

    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    print(f"Accuracy: {accuracy:.2%}")

def save_to_csv(shingle_bad_counts, shingle_total_counts, output_filename):
    """Write the per-shingle counts to a CSV file.

    The file starts with the header row ``Shingle, Count in phishing texts,
    Count in all texts`` followed by one row per shingle present in either
    mapping; a count missing from one mapping is written as 0.  Rows are
    ordered by the string form of the shingle so that repeated runs over the
    same counts produce the same file.

    Args:
        shingle_bad_counts: Mapping shingle -> count in phishing texts.
        shingle_total_counts: Mapping shingle -> count in all texts.
        output_filename: Path of the CSV file to create or overwrite.
    """
    all_shingles = shingle_total_counts.keys() | shingle_bad_counts.keys()

    with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Shingle', 'Count in phishing texts', 'Count in all texts'])
        # Set iteration order is not stable between runs; sort the rows.
        # key=str keeps the sort well-defined even for non-string shingles.
        writer.writerows(
            [
                shingle,
                shingle_bad_counts.get(shingle, 0),
                shingle_total_counts.get(shingle, 0),
            ]
            for shingle in sorted(all_shingles, key=str)
        )

# Input (JSON lines of shingled texts) and output (CSV report) locations.
filename = '/content/dataset_shingles_maria_sample.json'
output_filename = 'shingle_counts.csv'

# Pass 1: collect per-shingle counts and the overall text totals.
(
    shingle_bad_counts,
    shingle_total_counts,
    total_texts,
    bad_texts,
) = count_shingle_occurrences(filename)

# Dump the counts before running the evaluation pass.
save_to_csv(
    shingle_bad_counts=shingle_bad_counts,
    shingle_total_counts=shingle_total_counts,
    output_filename=output_filename,
)

# Pass 2: score the same file with those counts and print the accuracy.
classify_shingles_and_evaluate(
    filename=filename,
    shingle_bad_counts=shingle_bad_counts,
    shingle_total_counts=shingle_total_counts,
    total_texts=total_texts,
    bad_texts=bad_texts,
)


JSON decoding error: Expecting ',' delimiter: line 1 column 5817 (char 5816)
JSON decoding error: Expecting value: line 1 column 5497 (char 5496)
Accuracy: 94.23%
