In [69]:
import pandas as pd
import re

In [70]:
dataset_path = r"C:\Users\Lian4ik\Desktop\derm_project\data\dataset.csv"
metadata_path = r"C:\Users\Lian4ik\Desktop\derm_project\data\metadata.csv"

In [71]:
dataset_df = pd.read_csv(dataset_path)
metadata_df = pd.read_csv(metadata_path)

In [72]:
# Extract the core ID from 'image_id' in dataset.csv
dataset_df['core_id'] = dataset_df['image_id'].apply(lambda x: re.match(r"ISIC_\d+", x).group())


In [73]:
dataset_df.head()

Unnamed: 0,image_id,target,core_id
0,ISIC_0015290_0.JPG,0,ISIC_0015290
1,ISIC_0015291_0.JPG,0,ISIC_0015291
2,ISIC_0015292_0.JPG,0,ISIC_0015292
3,ISIC_0015293_0.JPG,0,ISIC_0015293
4,ISIC_0015294_0.JPG,0,ISIC_0015294


In [74]:
# Rename 'isic_id' to 'core_id' in metadata.csv for merging
metadata_df = metadata_df.rename(columns={'isic_id': 'core_id'})

In [75]:
metadata_df.head()

Unnamed: 0,core_id,copyright_licence,full_url,full_size,256_url,256_size,pixels_x,pixels_y,image_type,concomitant_biopsy,sex,anatom_site_general,benign_malignant,diagnosis_1,diagnosis_confirm_type,age_approx,lesion_id,patient_id
0,ISIC_1064919,CC-BY-NC,https://content.isic-archive.com/e11a451e-56d3...,153259,https://content.isic-archive.com/eed95197-f4fc...,3302,1872,1053,dermoscopic,False,male,lower extremity,benign,Benign,serial imaging showing no change,50.0,IL_3752689,IP_8330575
1,ISIC_2708737,CC-BY,https://content.isic-archive.com/7fd26d90-cbbd...,1690118,https://content.isic-archive.com/d0ea6212-39f0...,5031,6000,4000,dermoscopic,False,female,posterior torso,benign,Benign,serial imaging showing no change,65.0,IL_2374200,IP_8051701
2,ISIC_1944865,CC-BY,https://content.isic-archive.com/d74f790a-4fdd...,1141325,https://content.isic-archive.com/2bfbd1eb-6bca...,3271,6000,4000,dermoscopic,False,female,lower extremity,benign,Benign,serial imaging showing no change,45.0,IL_1790311,IP_5889408
3,ISIC_0032083,CC-BY-NC,https://content.isic-archive.com/bd3a61e1-10b8...,19579,https://content.isic-archive.com/e0531cf3-420a...,3862,600,450,dermoscopic,False,female,,benign,Benign,serial imaging showing no change,55.0,IL_1497139,
4,ISIC_0028959,CC-BY-NC,https://content.isic-archive.com/7b8adcca-2e9f...,26583,https://content.isic-archive.com/787e7f62-20b8...,6426,600,450,dermoscopic,False,male,anterior torso,benign,Benign,serial imaging showing no change,45.0,IL_0372697,


In [76]:
# Select only relevant columns from metadata.csv
metadata_df = metadata_df[['core_id', 'sex', 'age_approx']]

In [77]:
metadata_df.head()

Unnamed: 0,core_id,sex,age_approx
0,ISIC_1064919,male,50.0
1,ISIC_2708737,female,65.0
2,ISIC_1944865,female,45.0
3,ISIC_0032083,female,55.0
4,ISIC_0028959,male,45.0


In [78]:
# Merge the dataframes on 'core_id'
merged_df = pd.merge(dataset_df, metadata_df, on='core_id', how='left')

In [79]:
merged_df.head()

Unnamed: 0,image_id,target,core_id,sex,age_approx
0,ISIC_0015290_0.JPG,0,ISIC_0015290,female,30.0
1,ISIC_0015291_0.JPG,0,ISIC_0015291,female,35.0
2,ISIC_0015292_0.JPG,0,ISIC_0015292,male,60.0
3,ISIC_0015293_0.JPG,0,ISIC_0015293,female,45.0
4,ISIC_0015294_0.JPG,0,ISIC_0015294,female,30.0


In [80]:
# Drop 'core_id' after merging to restore original structure
merged_df = merged_df.drop(columns=['core_id'])

In [81]:
merged_df.head()

Unnamed: 0,image_id,target,sex,age_approx
0,ISIC_0015290_0.JPG,0,female,30.0
1,ISIC_0015291_0.JPG,0,female,35.0
2,ISIC_0015292_0.JPG,0,male,60.0
3,ISIC_0015293_0.JPG,0,female,45.0
4,ISIC_0015294_0.JPG,0,female,30.0


In [82]:
# Check for missing values in 'sex' and 'age' columns
missing_values = merged_df[['sex', 'age_approx']].isnull().sum()

# Display results
print("Missing values in merged dataset:")
print(missing_values)

Missing values in merged dataset:
sex           2060
age_approx    2253
dtype: int64


In [83]:
# Remove all rows with missing values
merged_df_cleaned = merged_df.dropna()

# Save the cleaned dataset
cleaned_data_path = r"C:\Users\Lian4ik\Desktop\derm_project\data\merged_cleaned_dataset.csv"
merged_df_cleaned.to_csv(cleaned_data_path, index=False)



In [84]:
merged_df_cleaned.head()

Unnamed: 0,image_id,target,sex,age_approx
0,ISIC_0015290_0.JPG,0,female,30.0
1,ISIC_0015291_0.JPG,0,female,35.0
2,ISIC_0015292_0.JPG,0,male,60.0
3,ISIC_0015293_0.JPG,0,female,45.0
4,ISIC_0015294_0.JPG,0,female,30.0


In [85]:
# Check for missing values in 'sex' and 'age' columns
missing_values = merged_df_cleaned[['sex', 'age_approx']].isnull().sum()

# Display results
print("Missing values in merged dataset:")
print(missing_values)

Missing values in merged dataset:
sex           0
age_approx    0
dtype: int64


In [86]:
merged_df_cleaned.shape[0]

70248

In [87]:
# Convert `age_approx` to integers
merged_df_cleaned["age_approx"] = merged_df_cleaned["age_approx"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_cleaned["age_approx"] = merged_df_cleaned["age_approx"].astype(int)


In [88]:
# Save the merged dataset
merged_df_cleaned.to_csv(r"C:\Users\Lian4ik\Desktop\derm_project\data\merged_cleaned_dataset.csv", index=False)