# Art Auction Price Prediction - Data Exploration
## Initial Dataset Analysis

### Import Libraries

In [37]:
# !pip install pandas numpy seaborn matplotlib

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

plt.style.use('seaborn-v0_8')
%matplotlib inline

### Dataset Configuration

In [39]:
file_list = [
    'artDataset.csv',
    'artists.csv', 
    'artistsBAT.csv',
    'artworks.csv'
]

artist_dfs = [
    'artworks_andy_warhol.csv',
    'artworks_claude_monet.csv', 
    'artworks_david_hockney.csv',
    'artworks_gerhard_richter.csv',
    'artworks_gerhard_richter1.csv',
    'artworks_jean_michel_basquiat.csv',
    'artworks_mark_rothko.csv',
    'artworks_piccaso4.csv',
    'artworks_zao_wou_ki.csv'
]

base_path = "../data/row/"
artist_files_full = [os.path.join(base_path, file_name) for file_name in artist_dfs]

print("COMBINING ARTIST DATASETS")
print("=" * 60)

def combine_artist_datasets(file_paths):
    """
    Combine all individual artist datasets into one unified dataframe
    """
    all_dfs = []
    
    for file_path in file_paths:
        try:
            file_name = os.path.basename(file_path)
            df = pd.read_csv(file_path)
            
            artist_name = file_name.replace('artworks_', '').replace('.csv', '').replace('_', ' ').title()
            df['artist'] = artist_name  # Add artist column
            
            df.columns = [col.lower().strip() for col in df.columns]
            
            print(f"✓ {artist_name}: {df.shape[0]} artworks")
            all_dfs.append(df)
            
        except Exception as e:
            print(f"✗ Error reading {file_path}: {e}")
    
    if all_dfs:
        combined_df = pd.concat(all_dfs, ignore_index=True)
        return combined_df
    else:
        return pd.DataFrame()

combined_artists_df = combine_artist_datasets(artist_files_full)

output_path = os.path.join("../data/processed/", "all_artists_artworks.csv")

combined_artists_df.to_csv(output_path, index=False)
print(f"\n✓ Combined dataset saved to: {output_path}")

file_list.append("all_artists_artworks.csv")
print(f"✓ Added to file_list")

file_list_with_path = [os.path.join(base_path, file_name) for file_name in file_list]

COMBINING ARTIST DATASETS
✓ Andy Warhol: 320 artworks
✓ Claude Monet: 42 artworks
✓ David Hockney: 79 artworks
✓ Gerhard Richter: 115 artworks
✓ Gerhard Richter1: 115 artworks
✓ Jean Michel Basquiat: 120 artworks
✓ Mark Rothko: 26 artworks
✓ Piccaso4: 246 artworks
✓ Zao Wou Ki: 81 artworks

✓ Combined dataset saved to: ../data/processed/all_artists_artworks.csv
✓ Added to file_list


### Comprehensive Dataset Analysis Function

In [40]:
def analyze_dataset(file_name):
    """
    Comprehensive analysis of a single dataset
    """
    print("=" * 80)
    print(f"ANALYZING: {file_name}")
    print("=" * 80)
    
    try:
        df = pd.read_csv(file_name)
        
        print("\n1. BASIC INFORMATION:")
        print(f"   Shape: {df.shape} (rows: {df.shape[0]}, columns: {df.shape[1]})")
        
        print("\n2. DATA TYPES SUMMARY:")
        dtype_summary = df.dtypes.value_counts()
        for dtype, count in dtype_summary.items():
            print(f"   {dtype}: {count} columns")
        
        print("\n3. COLUMNS DETAIL:")
        for i, (col, dtype) in enumerate(zip(df.columns, df.dtypes), 1):
            print(f"   {i:2d}. {col:<30} {dtype}")
        
        print("\n4. MISSING VALUES ANALYSIS:")
        missing_data = df.isnull().sum()
        total_cells = np.prod(df.shape)
        total_missing = missing_data.sum()
        
        print(f"   Total missing values: {total_missing}/{total_cells} ({total_missing/total_cells*100:.2f}%)")
        
        missing_cols = missing_data[missing_data > 0]
        if len(missing_cols) > 0:
            print("   Columns with missing values:")
            for col, missing_count in missing_cols.items():
                percentage = (missing_count / len(df)) * 100
                print(f"     {col}: {missing_count} missing ({percentage:.1f}%)")
        else:
            print("   No missing values found!")
        
        print("\n5. SAMPLE DATA (first 3 rows):")
        display(df.head(3))
        
        numerical_cols = df.select_dtypes(include=[np.number]).columns
        if len(numerical_cols) > 0:
            print("\n6. NUMERICAL COLUMNS STATISTICS:")
            print(df[numerical_cols].describe())
        
        categorical_cols = df.select_dtypes(include=['object']).columns
        if len(categorical_cols) > 0:
            print("\n7. CATEGORICAL COLUMNS OVERVIEW:")
            for col in categorical_cols:
                unique_count = df[col].nunique()
                print(f"   {col}: {unique_count} unique values")
                if unique_count <= 10:  # Show values if not too many
                    print(f"     Values: {df[col].unique()}")
        
        return df
        
    except Exception as e:
        print(f"ERROR reading {file_name}: {e}")
        return None

### Execute Analysis on All Datasets

In [41]:
dataframes = {}

for file_name in file_list_with_path:
    df = analyze_dataset(file_name)
    if df is not None:
        # Store dataframe in dictionary using filename as key (without .csv extension)
        key_name = file_name.replace('.csv', '').replace("../data/processed/", '')
        dataframes[key_name] = df
        output_filename = f"{os.path.basename(file_name)}"
        output_path = os.path.join("../data/processed/", output_filename)
        df.to_csv(output_path, index=False)
        print(f"✓ Saved: {output_path}")


    
    print("\n" + "="*80 + "\n")

print(f"Successfully loaded {len(dataframes)} datasets")

ANALYZING: ../data/row/artDataset.csv

1. BASIC INFORMATION:
   Shape: (754, 9) (rows: 754, columns: 9)

2. DATA TYPES SUMMARY:
   object: 8 columns
   int64: 1 columns

3. COLUMNS DETAIL:
    1. Unnamed: 0                     int64
    2. price                          object
    3. artist                         object
    4. title                          object
    5. yearCreation                   object
    6. signed                         object
    7. condition                      object
    8. period                         object
    9. movement                       object

4. MISSING VALUES ANALYSIS:
   Total missing values: 1/6786 (0.01%)
   Columns with missing values:
     artist: 1 missing (0.1%)

5. SAMPLE DATA (first 3 rows):


Unnamed: 0.1,Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement
0,0,28.500 USD,Tommaso Ottieri,Bayreuth Opera,2021,Signed on verso,This work is in excellent condition.,Contemporary,Baroque
1,1,3.000 USD,Pavel Tchelitchew,Drawings of the Opera,First Half 20th Century,Signed and titled,Not examined out of frame.No obvious signs of ...,Post-War,Surrealism
2,2,5.000 USD,Leo Gabin,Two on Sidewalk,2016,"Signed, titled and dated on verso",This work is in excellent condition.,Contemporary,Abstract



6. NUMERICAL COLUMNS STATISTICS:
       Unnamed: 0
count  754.000000
mean   376.500000
std    217.805341
min      0.000000
25%    188.250000
50%    376.500000
75%    564.750000
max    753.000000

7. CATEGORICAL COLUMNS OVERVIEW:
   price: 108 unique values
   artist: 454 unique values
   title: 679 unique values
   yearCreation: 136 unique values
   signed: 390 unique values
   condition: 376 unique values
   period: 5 unique values
     Values: ['Contemporary' 'Post-War' 'Modern' '19th Century' '[nan]']
   movement: 34 unique values
✓ Saved: ../data/processed/artDataset.csv


ANALYZING: ../data/row/artists.csv

1. BASIC INFORMATION:
   Shape: (15091, 6) (rows: 15091, columns: 6)

2. DATA TYPES SUMMARY:
   object: 3 columns
   float64: 2 columns
   int64: 1 columns

3. COLUMNS DETAIL:
    1. Artist ID                      int64
    2. Name                           object
    3. Nationality                    object
    4. Gender                         object
    5. Birth Year       

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year
0,1,Robert Arneson,American,Male,1930.0,1992.0
1,2,Doroteo Arnaiz,Spanish,Male,1936.0,
2,3,Bill Arnold,American,Male,1941.0,



6. NUMERICAL COLUMNS STATISTICS:
          Artist ID    Birth Year   Death Year
count  15091.000000  11237.000000  4579.000000
mean   18297.556027   1930.852719  1974.287399
std    16632.963898     34.531997    31.153665
min        1.000000   1730.000000  1795.000000
25%     4195.500000   1910.000000  1958.000000
50%     8593.000000   1936.000000  1980.000000
75%    33088.500000   1956.000000  1998.000000
max    67695.000000   2012.000000  2017.000000

7. CATEGORICAL COLUMNS OVERVIEW:
   Name: 15039 unique values
   Nationality: 125 unique values
   Gender: 3 unique values
     Values: ['Male' 'Female' nan 'male']
✓ Saved: ../data/processed/artists.csv


ANALYZING: ../data/row/artistsBAT.csv

1. BASIC INFORMATION:
   Shape: (50, 8) (rows: 50, columns: 8)

2. DATA TYPES SUMMARY:
   object: 6 columns
   int64: 2 columns

3. COLUMNS DETAIL:
    1. id                             int64
    2. name                           object
    3. years                          object
    4. genre   

Unnamed: 0,id,name,years,genre,nationality,bio,wikipedia,paintings
0,0,Amedeo Modigliani,1884 - 1920,Expressionism,Italian,Amedeo Clemente Modigliani (Italian pronunciat...,http://en.wikipedia.org/wiki/Amedeo_Modigliani,193
1,1,Vasiliy Kandinskiy,1866 - 1944,"Expressionism,Abstractionism",Russian,Wassily Wassilyevich Kandinsky (Russian: Васи́...,http://en.wikipedia.org/wiki/Wassily_Kandinsky,88
2,2,Diego Rivera,1886 - 1957,"Social Realism,Muralism",Mexican,Diego María de la Concepción Juan Nepomuceno E...,http://en.wikipedia.org/wiki/Diego_Rivera,70



6. NUMERICAL COLUMNS STATISTICS:
             id   paintings
count  50.00000   50.000000
mean   24.50000  168.920000
std    14.57738  157.451105
min     0.00000   24.000000
25%    12.25000   81.000000
50%    24.50000  123.000000
75%    36.75000  191.750000
max    49.00000  877.000000

7. CATEGORICAL COLUMNS OVERVIEW:
   name: 50 unique values
   years: 50 unique values
   genre: 31 unique values
   nationality: 17 unique values
   bio: 50 unique values
   wikipedia: 50 unique values
✓ Saved: ../data/processed/artistsBAT.csv


ANALYZING: ../data/row/artworks.csv

1. BASIC INFORMATION:
   Shape: (130262, 21) (rows: 130262, columns: 21)

2. DATA TYPES SUMMARY:
   object: 12 columns
   float64: 8 columns
   int64: 1 columns

3. COLUMNS DETAIL:
    1. Artwork ID                     int64
    2. Title                          object
    3. Artist ID                      object
    4. Name                           object
    5. Date                           object
    6. Medium            

Unnamed: 0,Artwork ID,Title,Artist ID,Name,Date,Medium,Dimensions,Acquisition Date,Credit,Catalogue,...,Classification,Object Number,Diameter (cm),Circumference (cm),Height (cm),Length (cm),Width (cm),Depth (cm),Weight (kg),Duration (s)
0,2,"Ferdinandsbrücke Project, Vienna, Austria, Ele...",6210,Otto Wagner,1896,Ink and cut-and-pasted painted pages on paper,"19 1/8 x 66 1/2"" (48.6 x 168.9 cm)",1996-04-09,Fractional and promised gift of Jo Carole and ...,Y,...,Architecture,885.1996,,,48.6,,168.9,,,
1,3,"City of Music, National Superior Conservatory ...",7470,Christian de Portzamparc,1987,Paint and colored pencil on print,"16 x 11 3/4"" (40.6 x 29.8 cm)",1995-01-17,Gift of the architect in honor of Lily Auchinc...,Y,...,Architecture,1.1995,,,40.6401,,29.8451,,,
2,4,"Villa near Vienna Project, Outside Vienna, Aus...",7605,Emil Hoppe,1903,"Graphite, pen, color pencil, ink, and gouache ...","13 1/2 x 12 1/2"" (34.3 x 31.8 cm)",1997-01-15,Gift of Jo Carole and Ronald S. Lauder,Y,...,Architecture,1.1997,,,34.3,,31.8,,,



6. NUMERICAL COLUMNS STATISTICS:
          Artwork ID  Diameter (cm)  Circumference (cm)    Height (cm)  \
count  130262.000000    1399.000000           10.000000  111893.000000   
mean    82501.371636      23.248939           44.868020      37.712992   
std     58124.331702      45.460079           28.631604      48.151347   
min         2.000000       0.635000            9.900000       0.000000   
25%     34171.250000       7.900000           23.500000      18.100000   
50%     69541.500000      13.700000           36.000000      27.940056   
75%    128955.750000      24.782500           71.125000      44.450100   
max    218011.000000     914.400000           83.800000    9140.000000   

       Length (cm)     Width (cm)    Depth (cm)    Weight (kg)  Duration (s)  
count   736.000000  111003.000000  11443.000000     298.000000  3.084000e+03  
mean     89.117417      38.176838     18.291359    1248.278691  7.830060e+03  
std     329.717487      67.250118     57.703925   11856.456824

### Dataset Summary Comparison

In [42]:
print("DATASET SUMMARY COMPARISON")
print("=" * 100)

summary_data = []
for name, df in dataframes.items():
    summary_data.append({
        'Dataset': name.split('/')[-1],
        'Rows': df.shape[0],
        'Columns': df.shape[1],
        'Numerical Cols': len(df.select_dtypes(include=[np.number]).columns),
        'Categorical Cols': len(df.select_dtypes(include=['object']).columns),
        'Total Missing': df.isnull().sum().sum(),
        'Missing %': (df.isnull().sum().sum() / np.prod(df.shape)) * 100
    })

summary_df = pd.DataFrame(summary_data)
display(summary_df)

DATASET SUMMARY COMPARISON


Unnamed: 0,Dataset,Rows,Columns,Numerical Cols,Categorical Cols,Total Missing,Missing %
0,artDataset,754,9,1,8,1,0.014736
1,artists,15091,6,3,3,19926,22.006494
2,artistsBAT,50,8,2,6,0,0.0
3,artworks,130262,21,9,12,839439,30.686836
