In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the MusicBrainz sample. TID is parsed as an integer; every other
# declared column is kept as text. With keep_default_na=False and
# na_values=[''], only empty fields are read as missing, and those are then
# replaced by the '-' placeholder.
dataset = pd.read_csv(
    "../13-HeapGlobal/musicbrainz-200-A01.csv.dapo",
    sep=",",
    encoding="utf-8",
    keep_default_na=False,
    na_values=[''],
    dtype={
        'TID': int,
        **{
            text_column: str
            for text_column in (
                'CID', 'CTID', 'SourceID', 'id', 'number', 'title',
                'length', 'artist', 'album', 'year', 'language',
            )
        },
    },
).fillna('-')
dataset.head(2)

Unnamed: 0,TID,CID,CTID,SourceID,id,number,title,length,artist,album,year,language
0,1,1,1,1,WoM5452845,0,L'enfant aux yeux d'Italie (De vous à elle en ...,03:39,Daniel Balavoine,De vous à elle en passsant par moi,1975,-
1,2,32522,4,2,MBox36398328-HH,11,Silver Forest - 双剣乱舞 みょん＆me,229,-,Silver Forest 2006-2012 BEST1,12,Japanese


In [16]:
import uuid
import json

# Select roughly 50% of records at random: each row is kept independently
# with probability 0.5. A seeded generator is used so that the selection,
# and therefore the exported ground truth and perturbed files, is the same
# on every Restart & Run All.
rng = np.random.default_rng(42)
selected_indices = dataset.index[rng.random(len(dataset)) < 0.5]
df_selected = dataset.loc[selected_indices]

def perturb_record_with_id(record, original_id):
    """Create two perturbed copies of a record, each tagged with a new id.

    The first copy has its ``artist`` value lower-cased; the second has the
    character ``"1"`` appended to its ``year`` value. Both values are coerced
    to ``str`` first, so a non-string year (e.g. an int) no longer raises
    ``TypeError`` on concatenation. Missing values are not treated specially;
    callers are expected to have replaced them beforehand.

    Parameters
    ----------
    record : mapping-like with ``copy()`` and item assignment
        Must contain the keys ``'artist'`` and ``'year'`` (a pandas Series
        row or a dict both work). The input itself is not modified.
    original_id : any value usable in an f-string
        Identifier of the source record; embedded in the new ids.

    Returns
    -------
    list of (str, record) tuples
        ``[("novo1<original_id>", artist_variant),
        ("novo2<original_id>", year_variant)]``. Each variant also carries
        its new id under the ``'TID2'`` key.
    """
    artist_variant = record.copy()
    year_variant = record.copy()

    artist_variant['artist'] = str(artist_variant['artist']).lower()
    # Same str() coercion as for artist above: "+=" on a non-string year
    # would raise TypeError.
    year_variant['year'] = str(year_variant['year']) + "1"

    new_id1 = f"novo1{original_id}"
    new_id2 = f"novo2{original_id}"

    artist_variant['TID2'] = new_id1
    year_variant['TID2'] = new_id2

    return [(new_id1, artist_variant), (new_id2, year_variant)]

# ground_truth maps every synthetic id to the TID of the record it was
# derived from; perturbed_records collects the perturbed rows in order.
ground_truth = {}
perturbed_records = []

for _, source_row in df_selected.iterrows():
    source_tid = source_row['TID']
    for variant_id, variant in perturb_record_with_id(source_row, source_tid):
        ground_truth[variant_id] = source_tid
        perturbed_records.append(variant)

# Write the ground truth as a two-column CSV (header first, then one
# "<new id>,<original TID>" line per perturbed record).
with open('ground_truth_music_brainz.csv', 'w') as truth_file:
    truth_file.write("novoidmusic1,antigoidmusic2\n")
    truth_file.writelines(
        f"{variant_id},{source_tid}\n"
        for variant_id, source_tid in ground_truth.items()
    )

# Assemble the perturbed rows into a frame and save it without the index.
df_perturbed = pd.DataFrame(perturbed_records)
df_perturbed.to_csv('music_brainz-simple-mutated.csv', index=False)

In [17]:
# Compare the (rows, columns) of the perturbed frame with the source dataset.
df_perturbed.shape, dataset.shape

((193470, 13), (193750, 12))

In [18]:
# Preview the first perturbed rows (two variants per selected source record).
df_perturbed.head()

Unnamed: 0,TID,CID,CTID,SourceID,id,number,title,length,artist,album,year,language,TID2
0,1,1,1,1,WoM5452845,0,L'enfant aux yeux d'Italie (De vous à elle en ...,03:39,daniel balavoine,De vous à elle en passsant par moi,1975.0,-,novo11
0,1,1,1,1,WoM5452845,0,L'enfant aux yeux d'Italie (De vous à elle en ...,03:39,Daniel Balavoine,De vous à elle en passsant par moi,19751.0,-,novo21
5,6,3,1,5,4489993,10,Your Grace,166000,kathy troccoli,Comfort,2005.0,English,novo16
5,6,3,1,5,4489993,10,Your Grace,166000,Kathy Troccoli,Comfort,20051.0,English,novo26
9,10,5,1,4,160109-A070,6,006-Try_(acoustic),unknown,neil young,"2008-02-15: Le Grand Rex, Paris, France (unknown)",,Eng.,novo110


In [20]:
# Preview the first source rows for comparison with the perturbed preview.
dataset.head()

Unnamed: 0,TID,CID,CTID,SourceID,id,number,title,length,artist,album,year,language
0,1,1,1,1,WoM5452845,0,L'enfant aux yeux d'Italie (De vous à elle en ...,03:39,Daniel Balavoine,De vous à elle en passsant par moi,1975,-
1,2,32522,4,2,MBox36398328-HH,11,Silver Forest - 双剣乱舞 みょん＆me,229,-,Silver Forest 2006-2012 BEST1,12,Japanese
2,3,53749,2,3,4382873MB-01,2,shabnavard - Chavoush 2,27.183,شهرام ناظری,-,-,Persian
3,4,2,1,3,unk.,17,Mustard Gas - There and Back Again Lane,2.15,Action Painting!,-,'95,English
4,5,31368,3,4,215214-A048,6,006-Immer bis ich reier',3m 48sec,Nordwand,Das Pinke Album (unknown),,Ger.


In [26]:
# Per-column duplicate report: for every column of `dataset`, print the total,
# distinct and null counts, how many distinct values repeat, how many rows are
# "extra" copies of a repeated value, and the most frequent repeated values.
print("Duplicate Analysis for All Columns")
print("=" * 80)

for column in dataset.columns:
    series = dataset[column]
    frequency = series.value_counts()
    # Keep only values that occur at least twice.
    repeated = frequency[frequency > 1]

    print(f"\nColumn: {column}")
    print("-" * 80)
    print(f"Total values: {len(series)}")
    print(f"Unique values: {series.nunique()}")
    print(f"Null values: {series.isnull().sum()}")
    print(f"Values appearing more than once: {len(repeated)}")
    # Occurrences beyond the first one of each repeated value.
    print(f"Total duplicate entries: {repeated.sum() - len(repeated)}")

    # value_counts() is sorted by frequency, so head(10) is the top 10.
    if not repeated.empty:
        print("\nTop 10 most frequent duplicate values:")
        print(repeated.head(10))

Duplicate Analysis for All Columns

Column: TID
--------------------------------------------------------------------------------
Total values: 193750
Unique values: 193750
Null values: 0
Values appearing more than once: 0
Total duplicate entries: 0

Column: CID
--------------------------------------------------------------------------------
Total values: 193750
Unique values: 100000
Null values: 0
Values appearing more than once: 50000
Total duplicate entries: 93750

Top 10 most frequent duplicate values:
CID
73510    5
12843    5
42053    5
12821    5
48318    5
13582    5
12823    5
50160    5
88321    5
12825    5
Name: count, dtype: int64

Column: CTID
--------------------------------------------------------------------------------
Total values: 193750
Unique values: 5
Null values: 0
Values appearing more than once: 5
Total duplicate entries: 193745

Top 10 most frequent duplicate values:
CTID
1    100000
2     50000
3     25000
4     12500
5      6250
Name: count, dtype: int64

Co

# Modificando NCVR

In [2]:
# Read the tab-separated, latin1-encoded voter file into a DataFrame and
# display it.
data = pd.read_csv(
    './ncvoter42.txt',
    encoding='latin1',
    sep='\t',
)
data

Unnamed: 0,county_id,county_desc,voter_reg_num,ncid,last_name,first_name,middle_name,name_suffix_lbl,status_cd,voter_status_desc,...,sanit_dist_abbrv,sanit_dist_desc,rescue_dist_abbrv,rescue_dist_desc,munic_dist_abbrv,munic_dist_desc,dist_1_abbrv,dist_1_desc,vtd_abbrv,vtd_desc
0,42,HALIFAX,100702,BZ61829,AARON,RHANDA,FAITH THORA,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-8,RR 8
1,42,HALIFAX,107751,BZ75895,AASBY,BRANDON,,,A,ACTIVE,...,,,,,,,7.0,PROSECUTORIAL DISTRICT 7,RR-9,RR 9
2,42,HALIFAX,104490,BZ73617,ABAKKAL,NOURA,,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10
3,42,HALIFAX,97201,BZ68595,ABBAN,SHANNAN,JANAY,,I,INACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-1,RR 1
4,42,HALIFAX,68003,BZ45167,ABBOTT,AMY,CLARY,,D,DENIED,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40837,42,HALIFAX,107670,BZ45511,ZSEBEHAZY,ROBERT,JOHN,,A,ACTIVE,...,,,,,,,7.0,PROSECUTORIAL DISTRICT 7,WEL-3,WEL 3
40838,42,HALIFAX,106555,BZ74992,ZUBAIDI,AFAF,MOHAMED SALEH,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-1,RR 1
40839,42,HALIFAX,21018,BZ10671,ZUCKER,DIANNE,,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10
40840,42,HALIFAX,21011,BZ10666,ZUCKER,ROBERT,N,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10


In [3]:
# Render the row labelled 0 as plain text so that every column is listed.
data.loc[0].to_string()

'county_id                                               42\ncounty_desc                                        HALIFAX\nvoter_reg_num                                       100702\nncid                                               BZ61829\nlast_name                                            AARON\nfirst_name                                          RHANDA\nmiddle_name                                    FAITH THORA\nname_suffix_lbl                                        NaN\nstatus_cd                                                A\nvoter_status_desc                                   ACTIVE\nreason_cd                                               AV\nvoter_status_reason_desc                          VERIFIED\nres_street_address                  1327  WASHINGTON ST   \nres_city_desc                               ROANOKE RAPIDS\nstate_cd                                                NC\nzip_code                                           27870.0\nmail_addr1                             

In [9]:
# Select roughly 50% of the voter records at random: each row is kept
# independently with probability 0.5. The generator is seeded so the same
# rows are selected on every Restart & Run All, which keeps the exported
# ground truth and perturbed files reproducible.
rng = np.random.default_rng(42)
selected_indices = data.index[rng.random(len(data)) < 0.5]
df_selected = data.loc[selected_indices]

In [10]:
# Display the randomly selected subset of voter records.
df_selected


Unnamed: 0,county_id,county_desc,voter_reg_num,ncid,last_name,first_name,middle_name,name_suffix_lbl,status_cd,voter_status_desc,...,sanit_dist_abbrv,sanit_dist_desc,rescue_dist_abbrv,rescue_dist_desc,munic_dist_abbrv,munic_dist_desc,dist_1_abbrv,dist_1_desc,vtd_abbrv,vtd_desc
2,42,HALIFAX,104490,BZ73617,ABAKKAL,NOURA,,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10
4,42,HALIFAX,68003,BZ45167,ABBOTT,AMY,CLARY,,D,DENIED,...,,,,,,,,,,
6,42,HALIFAX,65983,BZ43147,ABBOTT,DAMIEN,S,,A,ACTIVE,...,,,,,,,7.0,PROSECUTORIAL DISTRICT 7,LIT-1,LIT 1
7,42,HALIFAX,92176,BZ65026,ABBOTT,ERICA,DENEEN,,I,INACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-7,RR 7
8,42,HALIFAX,83239,BZ58220,ABBOTT,JOSIE,EDWARDS,,I,INACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,WEL-3,WEL 3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40836,42,HALIFAX,102061,BZ71995,ZOTOS,KELSEY,ANN-MARIE,,I,INACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-5,RR 5
40837,42,HALIFAX,107670,BZ45511,ZSEBEHAZY,ROBERT,JOHN,,A,ACTIVE,...,,,,,,,7.0,PROSECUTORIAL DISTRICT 7,WEL-3,WEL 3
40838,42,HALIFAX,106555,BZ74992,ZUBAIDI,AFAF,MOHAMED SALEH,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-1,RR 1
40839,42,HALIFAX,21018,BZ10671,ZUCKER,DIANNE,,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10


In [25]:
# Display the selected subset again (same expression as the earlier
# `df_selected` cell).
df_selected

Unnamed: 0,county_id,county_desc,voter_reg_num,ncid,last_name,first_name,middle_name,name_suffix_lbl,status_cd,voter_status_desc,...,sanit_dist_abbrv,sanit_dist_desc,rescue_dist_abbrv,rescue_dist_desc,munic_dist_abbrv,munic_dist_desc,dist_1_abbrv,dist_1_desc,vtd_abbrv,vtd_desc
2,42,HALIFAX,104490,BZ73617,ABAKKAL,NOURA,,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10
4,42,HALIFAX,68003,BZ45167,ABBOTT,AMY,CLARY,,D,DENIED,...,,,,,,,,,,
6,42,HALIFAX,65983,BZ43147,ABBOTT,DAMIEN,S,,A,ACTIVE,...,,,,,,,7.0,PROSECUTORIAL DISTRICT 7,LIT-1,LIT 1
7,42,HALIFAX,92176,BZ65026,ABBOTT,ERICA,DENEEN,,I,INACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-7,RR 7
8,42,HALIFAX,83239,BZ58220,ABBOTT,JOSIE,EDWARDS,,I,INACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,WEL-3,WEL 3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40836,42,HALIFAX,102061,BZ71995,ZOTOS,KELSEY,ANN-MARIE,,I,INACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-5,RR 5
40837,42,HALIFAX,107670,BZ45511,ZSEBEHAZY,ROBERT,JOHN,,A,ACTIVE,...,,,,,,,7.0,PROSECUTORIAL DISTRICT 7,WEL-3,WEL 3
40838,42,HALIFAX,106555,BZ74992,ZUBAIDI,AFAF,MOHAMED SALEH,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-1,RR 1
40839,42,HALIFAX,21018,BZ10671,ZUCKER,DIANNE,,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10


In [None]:
import uuid
import json

def perturb_record_with_id(record, original_id):
    """Create two perturbed copies of a voter record, each tagged with a new id.

    The first copy has its ``last_name`` lower-cased; the second has
    ``age_at_year_end`` incremented by 1. A ``last_name`` that is not a
    string (e.g. NaN/None for a missing value) is left untouched instead of
    raising ``AttributeError``.

    NOTE(review): a function with this same name is defined earlier in the
    notebook for the MusicBrainz data; this definition shadows it once run.

    Parameters
    ----------
    record : mapping-like with ``copy()`` and item assignment
        Must contain the keys ``'last_name'`` and ``'age_at_year_end'``
        (a pandas Series row or a dict both work). The input is not modified.
    original_id : any value usable in an f-string
        Identifier of the source record; embedded in the new ids.

    Returns
    -------
    list of (str, record) tuples
        ``[("novo1<original_id>", name_variant),
        ("novo2<original_id>", age_variant)]``. Each variant also carries
        its new id under the ``'id'`` key.
    """
    name_variant = record.copy()
    age_variant = record.copy()

    last_name = name_variant['last_name']
    # Only strings can be lower-cased; missing names are floats (NaN) or None.
    if isinstance(last_name, str):
        name_variant['last_name'] = last_name.lower()
    age_variant['age_at_year_end'] += 1

    new_id1 = f"novo1{original_id}"
    new_id2 = f"novo2{original_id}"

    name_variant['id'] = new_id1
    age_variant['id'] = new_id2

    return [(new_id1, name_variant), (new_id2, age_variant)]

# ground_truth maps every synthetic id to the ncid of the record it was
# derived from; perturbed_records collects the perturbed rows in order.
ground_truth = {}
perturbed_records = []

for idx, record in df_selected.iterrows():
    original_id = record['ncid']
    perturbed = perturb_record_with_id(record, original_id)
    for new_id, perturbed_record in perturbed:
        ground_truth[new_id] = original_id
        perturbed_records.append(perturbed_record)

# Export ground truth to CSV as "<new id>,<original id>" lines.
# A header row is written first: without one, a default pd.read_csv() treats
# the first pair as column names and that pair is lost from the ground truth.
# The dict keys are the new ids, so the loop unpacks (new_id, original_id).
with open('ground_truth.csv', 'w') as f:
    f.write("new_id,original_id\n")
    for new_id, original_id in ground_truth.items():
        f.write(f"{new_id},{original_id}\n")

# Create DataFrame for perturbed records and export to CSV
df_perturbed = pd.DataFrame(perturbed_records)
df_perturbed.to_csv('perturbed_records.csv', index=False)

In [38]:
# Display the ground-truth mapping.
# NOTE(review): the saved output is keyed by original id, whereas the visible
# perturbation cell stores new id -> original id; the output was presumably
# produced by an earlier version of that cell — re-run to confirm.
ground_truth

{'BZ73617': 'novo2BZ73617', 'BZ45167': 'novo2BZ45167'}

In [22]:
# Write the perturbed records to CSV without the index column. This repeats
# the export already done at the end of the perturbation cell.
df_perturbed.to_csv('perturbed_records.csv', index=False)

In [23]:
# Display the perturbed voter records.
# NOTE(review): the saved output shows 'id' values built from voter_reg_num,
# while the visible perturbation cell builds them from ncid; the output is
# presumably stale — re-run to confirm.
df_perturbed

Unnamed: 0,county_id,county_desc,voter_reg_num,ncid,last_name,first_name,middle_name,name_suffix_lbl,status_cd,voter_status_desc,...,sanit_dist_desc,rescue_dist_abbrv,rescue_dist_desc,munic_dist_abbrv,munic_dist_desc,dist_1_abbrv,dist_1_desc,vtd_abbrv,vtd_desc,id
2,42,HALIFAX,104490,BZ73617,abakkal,NOURA,,,A,ACTIVE,...,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10,novo1104490
2,42,HALIFAX,104490,BZ73617,ABAKKAL,NOURA,,,A,ACTIVE,...,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10,novo2104490
4,42,HALIFAX,68003,BZ45167,abbott,AMY,CLARY,,D,DENIED,...,,,,,,,,,,novo168003
4,42,HALIFAX,68003,BZ45167,ABBOTT,AMY,CLARY,,D,DENIED,...,,,,,,,,,,novo268003
6,42,HALIFAX,65983,BZ43147,abbott,DAMIEN,S,,A,ACTIVE,...,,,,,,7.0,PROSECUTORIAL DISTRICT 7,LIT-1,LIT 1,novo165983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40838,42,HALIFAX,106555,BZ74992,ZUBAIDI,AFAF,MOHAMED SALEH,,A,ACTIVE,...,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-1,RR 1,novo2106555
40839,42,HALIFAX,21018,BZ10671,zucker,DIANNE,,,A,ACTIVE,...,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10,novo121018
40839,42,HALIFAX,21018,BZ10671,ZUCKER,DIANNE,,,A,ACTIVE,...,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10,novo221018
40841,42,HALIFAX,110115,EH271079,zumbrunnen,DANIEL,,JR,A,ACTIVE,...,,,,ENF,TOWN OF ENFIELD,7.0,PROSECUTORIAL DISTRICT 7,ENF-3,ENF 3,novo1110115


In [None]:
# NOTE(review): `truth` is not assigned in any visible cell; presumably it was
# read back from the exported ground-truth CSV in a cell that has since been
# removed. The saved output shows a data pair used as the column header,
# which suggests the file was read without header=None — confirm or drop
# this cell.
truth

Unnamed: 0,104490,novo2104490
0,68003,novo268003
1,65983,novo265983
2,92176,novo292176
3,83239,novo283239
4,106195,novo2106195
...,...,...
20440,102061,novo2102061
20441,107670,novo2107670
20442,106555,novo2106555
20443,21018,novo221018


In [11]:
def perturb_record(record):
    """Return two perturbed copies of a voter record.

    The first copy has its ``last_name`` lower-cased; the second has
    ``age_at_year_end`` incremented by 1. A ``last_name`` that is not a
    string (e.g. NaN/None for a missing value) is left untouched instead of
    raising ``AttributeError``.

    Parameters
    ----------
    record : mapping-like with ``copy()`` and item assignment
        Must contain the keys ``'last_name'`` and ``'age_at_year_end'``
        (a pandas Series row or a dict both work). The input is not modified.

    Returns
    -------
    list
        ``[name_variant, age_variant]``.
    """
    name_variant = record.copy()
    age_variant = record.copy()

    last_name = name_variant['last_name']
    # Only strings can be lower-cased; missing names are floats (NaN) or None.
    if isinstance(last_name, str):
        name_variant['last_name'] = last_name.lower()
    age_variant['age_at_year_end'] += 1

    return [name_variant, age_variant]

# Flatten the two perturbed variants of every selected row into one list,
# preserving row order (variant 1 then variant 2 for each row).
perturbed_records = [
    variant
    for _, row in df_selected.iterrows()
    for variant in perturb_record(row)
]

# Step 3: build DataFrame B from the perturbed records
df_b = pd.DataFrame(perturbed_records)


In [12]:
# Display the perturbed voter records (DataFrame B).
df_b

Unnamed: 0,county_id,county_desc,voter_reg_num,ncid,last_name,first_name,middle_name,name_suffix_lbl,status_cd,voter_status_desc,...,sanit_dist_abbrv,sanit_dist_desc,rescue_dist_abbrv,rescue_dist_desc,munic_dist_abbrv,munic_dist_desc,dist_1_abbrv,dist_1_desc,vtd_abbrv,vtd_desc
2,42,HALIFAX,104490,BZ73617,abakkal,NOURA,,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10
2,42,HALIFAX,104490,BZ73617,ABAKKAL,NOURA,,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10
4,42,HALIFAX,68003,BZ45167,abbott,AMY,CLARY,,D,DENIED,...,,,,,,,,,,
4,42,HALIFAX,68003,BZ45167,ABBOTT,AMY,CLARY,,D,DENIED,...,,,,,,,,,,
6,42,HALIFAX,65983,BZ43147,abbott,DAMIEN,S,,A,ACTIVE,...,,,,,,,7.0,PROSECUTORIAL DISTRICT 7,LIT-1,LIT 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40838,42,HALIFAX,106555,BZ74992,ZUBAIDI,AFAF,MOHAMED SALEH,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-1,RR 1
40839,42,HALIFAX,21018,BZ10671,zucker,DIANNE,,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10
40839,42,HALIFAX,21018,BZ10671,ZUCKER,DIANNE,,,A,ACTIVE,...,SAN,RR SANITARY DISTRICT,,,RR,CITY OF ROANOKE RAPIDS,7.0,PROSECUTORIAL DISTRICT 7,RR-10,RR 10
40841,42,HALIFAX,110115,EH271079,zumbrunnen,DANIEL,,JR,A,ACTIVE,...,,,,,ENF,TOWN OF ENFIELD,7.0,PROSECUTORIAL DISTRICT 7,ENF-3,ENF 3


In [16]:
# Save the perturbed records (DataFrame B) as CSV, without the index column.
df_b.to_csv('ncvoter42_perturbed.csv', index=False)

In [14]:
# Save the voter table loaded from the tab-separated file as a
# comma-separated CSV, without the index column.
data.to_csv('ncvoter42.csv', index=False)
