In [9]:
import json
import pandas as pd
from tqdm.notebook import tqdm

# Input is read as newline-delimited JSON: each line is parsed as one record.
data_file = 'dataset_shingles_maria_sample_5'
l = []  # parsed records, in file order

# JSON text is UTF-8 by specification, so do not depend on the platform's
# default text encoding when opening the file.
with open(data_file, encoding='utf-8') as reader:
    for obj in tqdm(reader):
        # A blank line carries no record; skip it instead of reporting it
        # as a decode error.
        if not obj.strip():
            continue
        try:
            l.append(json.loads(obj))
        except json.JSONDecodeError as e:
            # Best-effort load: report the malformed line and keep going.
            print(f"Error decoding JSON: {e}")


# One row per successfully parsed record.
df_data = pd.DataFrame(l)

0it [00:00, ?it/s]

Error decoding JSON: Unterminated string starting at: line 1 column 19248 (char 19247)


In [7]:
# Work on the first 100 rows only (fewer if the frame is shorter).
df_tmp = df_data.head(100)

In [8]:
# Bare expression: the notebook renders df_tmp as this cell's output.
df_tmp

Unnamed: 0,num,hard_target,shingles,soft_target
0,150381,False,"[396334907, 1060847325, 1063332450, 1330563559...",False
1,150383,False,"[396334907, 1060847325, 1063332450, 1330563559...",False
2,150384,False,"[227257338, 329292840, 463346217, 1029363358, ...",True
3,150385,False,"[104601096, 422258184, 549984431, 560494154, 5...",True
4,150387,False,"[2039583, 4137730, 5047731, 6762261, 6993977, ...",True
...,...,...,...,...
95,150542,False,"[3269003, 12741328, 14442492, 16473842, 230843...",False
96,150543,False,"[12096343, 19752179, 20324125, 23799634, 25459...",False
97,150544,False,"[409378754, 1014151031, 1437623850, 1850578463...",True
98,150553,False,"[43447116, 55599410, 81608462, 122942364, 1364...",False


In [26]:
from collections import defaultdict

# A shingle is kept as "bad" when more than this share of its occurrences
# come from rows where hard_target or soft_target is set.
BAD_SHINGLE_RATIO = 0.6


def _count_shingle_targets(shingle_lists, hard_flags, soft_flags):
    """Tally, per shingle, its occurrences and its occurrences in flagged rows.

    A row counts as flagged when its hard target or its soft target is truthy.

    Args:
        shingle_lists: iterable of per-row shingle collections.
        hard_flags: iterable of per-row hard_target values.
        soft_flags: iterable of per-row soft_target values.

    Returns:
        defaultdict mapping shingle -> {"total": int, "false_target": int}.
    """
    counts = defaultdict(lambda: {"total": 0, "false_target": 0})
    for row_shingles, hard, soft in zip(shingle_lists, hard_flags, soft_flags):
        flagged = bool(hard or soft)  # evaluated once per row, not per shingle
        for shingle in row_shingles:
            entry = counts[shingle]
            entry["total"] += 1
            if flagged:
                entry["false_target"] += 1
    return counts


def _bad_shingle_fraction(row_shingles, bad_set):
    """Return the share of a row's distinct shingles that belong to bad_set.

    Both numerator and denominator use distinct shingles, so a row made up
    entirely of bad shingles scores 1.0 even if it repeats some of them.
    An empty (falsy) row scores 0.
    """
    if not row_shingles:
        return 0
    distinct = set(row_shingles)
    return len(distinct & bad_set) / len(distinct)


# NOTE(review): the ratios are fitted on every row of df_data and df_tmp is
# scored against them below. If df_tmp is a subset of df_data, the scored rows'
# own labels feed the ratios (label leakage) - confirm this is intended before
# trusting any metric computed from `intersection`.
shingle_counts = _count_shingle_targets(
    df_data['shingles'], df_data['hard_target'], df_data['soft_target']
)

# shingle -> flagged-occurrence ratio, restricted to ratios above the cut-off.
bad_shingles = {}
for shingle, counts in shingle_counts.items():
    ratio = counts["false_target"] / counts["total"]
    if ratio > BAD_SHINGLE_RATIO:
        bad_shingles[shingle] = ratio

# Single-column frame of the selected shingles, plus a set for O(1) lookups.
bad_shingles_df = pd.DataFrame({"shingle": list(bad_shingles)})
bad_shingles_set = set(bad_shingles)

# One record per row of df_tmp, adding its bad-shingle share.
final_table = [
    {
        "num": num,
        "shingles": shingles,
        "soft_target": soft_target,
        "hard_target": hard_target,
        "intersection": _bad_shingle_fraction(shingles, bad_shingles_set),
    }
    for num, shingles, soft_target, hard_target in zip(
        df_tmp['num'], df_tmp['shingles'], df_tmp['soft_target'], df_tmp['hard_target']
    )
]

final_df = pd.DataFrame(final_table)



       num                                           shingles  soft_target  \
0   150381  [396334907, 1060847325, 1063332450, 1330563559...        False   
1   150383  [396334907, 1060847325, 1063332450, 1330563559...        False   
2   150384  [227257338, 329292840, 463346217, 1029363358, ...         True   
3   150385  [104601096, 422258184, 549984431, 560494154, 5...         True   
4   150387  [2039583, 4137730, 5047731, 6762261, 6993977, ...         True   
..     ...                                                ...          ...   
95  150542  [3269003, 12741328, 14442492, 16473842, 230843...        False   
96  150543  [12096343, 19752179, 20324125, 23799634, 25459...        False   
97  150544  [409378754, 1014151031, 1437623850, 1850578463...         True   
98  150553  [43447116, 55599410, 81608462, 122942364, 1364...        False   
99  150554  [87343846, 100823462, 191356912, 194972896, 21...        False   

    hard_target  intersection  
0         False      0.000000  

In [27]:
# Bare expression: the notebook renders final_df as this cell's output.
final_df

Unnamed: 0,num,shingles,soft_target,hard_target,intersection
0,150381,"[396334907, 1060847325, 1063332450, 1330563559...",False,False,0.000000
1,150383,"[396334907, 1060847325, 1063332450, 1330563559...",False,False,0.000000
2,150384,"[227257338, 329292840, 463346217, 1029363358, ...",True,False,1.000000
3,150385,"[104601096, 422258184, 549984431, 560494154, 5...",True,False,1.000000
4,150387,"[2039583, 4137730, 5047731, 6762261, 6993977, ...",True,False,0.999516
...,...,...,...,...,...
95,150542,"[3269003, 12741328, 14442492, 16473842, 230843...",False,False,0.000000
96,150543,"[12096343, 19752179, 20324125, 23799634, 25459...",False,False,0.000000
97,150544,"[409378754, 1014151031, 1437623850, 1850578463...",True,False,1.000000
98,150553,"[43447116, 55599410, 81608462, 122942364, 1364...",False,False,0.000000


In [28]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score

# Rows whose bad-shingle share exceeds this value are predicted positive.
intersection_threshold = 0.9


def _score_target(frame, target_col, pred_col):
    """Return (accuracy, ROC-AUC, precision, recall) for one target column.

    Accuracy, precision and recall compare `target_col` with the thresholded
    predictions in `pred_col`; ROC-AUC ranks rows by the raw `intersection`
    score. ROC-AUC is NaN when the target holds fewer than two classes,
    because the score is undefined in that case.
    """
    y_true = frame[target_col]
    y_pred = frame[pred_col]
    accuracy = accuracy_score(y_true, y_pred)
    if y_true.nunique() > 1:
        roc_auc = roc_auc_score(y_true, frame['intersection'])
    else:
        # Only one class present: report NaN instead of aborting the cell.
        roc_auc = float("nan")
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    return accuracy, roc_auc, precision, recall


def _print_metrics(header, accuracy, roc_auc, precision, recall):
    """Print one metrics section, each value rounded to three decimals."""
    print(header)
    print(f"Accuracy: {accuracy:.3f}")
    print(f"ROC-AUC: {roc_auc:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")


# Both targets are predicted by the same rule; the two columns are kept
# separately so either can be inspected on final_df afterwards.
over_threshold = (final_df['intersection'] > intersection_threshold).astype(int)
final_df['pred_hard'] = over_threshold
final_df['pred_soft'] = over_threshold

hard_accuracy, hard_roc_auc, hard_precision, hard_recall = _score_target(
    final_df, 'hard_target', 'pred_hard'
)
soft_accuracy, soft_roc_auc, soft_precision, soft_recall = _score_target(
    final_df, 'soft_target', 'pred_soft'
)

_print_metrics("Metrics for hard_target:", hard_accuracy, hard_roc_auc, hard_precision, hard_recall)
_print_metrics("\nMetrics for soft_target:", soft_accuracy, soft_roc_auc, soft_precision, soft_recall)


Metrics for hard_target:
Accuracy: 0.710
ROC-AUC: 0.783
Precision: 0.125
Recall: 0.800

Metrics for soft_target:
Accuracy: 0.950
ROC-AUC: 0.986
Precision: 1.000
Recall: 0.865
