### MACHINE LEARNING PROJECT

___
### PREREQUISITES

To process images I'll be using OpenCV library. It is important to take a look at the [docs][1] before running the next cell  
as you may want to use another OpenCV package. For this project I'll use the *'main modules package'*

[1]:https://pypi.org/project/opencv-python/

In [2]:
# !pip install opencv-python

In [3]:
### TODO - .yaml

___
### IMPORTS

In [1]:
# Modules used for data handling / test
import os
import csv
import cv2
import pathlib
import time
import pickle

from utils import get_collection, show_collection, nameof, mklist


# Modules used for EDA
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib.patches import Rectangle


# Modules used for image processing
import cv2

from collections import Counter
from utils import crop_img, chi_osc, extract_img_data, get_img_rgb
from utils import resize_img, reduce_col_palette, whitespace


# Modules used for ML
from sklearn.cluster import KMeans
from utils import color_quant

from utils import data_report

In [5]:
### TODO ### Import a class from a module

# For a better pd.DataFrame visualization
class display(object):
    '''This class was found in 'Python Data Science Handbook' by jakevdp (Jake Vanderplas),
    which you can access though his GitHub repository
    (https://github.com/jakevdp/PythonDataScienceHandbook)'''
    
    template = '''<div style="float: left; padding: 10px;">
                  <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
                  </div>'''
    
    def __init__(self, *args):
        self.args = args
        
    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_()) for a in self.args)
    
    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a)) for a in self.args)

In [6]:
# Modules settings
%matplotlib inline

___
### UPDATE UTILS

In [2]:
# This cell only needs to be executed to update utils
# if modified after been imported

%run utils

___
### BASIC EDA

In [8]:
raw_data = pd.read_csv('./data/raw_museum/raw_data.csv', names=['img_ID', 'artist', 'height', 'width'])

raw_data.shape, raw_data.head()

((5233, 4),
                 img_ID      artist  height  width
 0  9223372032559824886  caravaggio     559    474
 1               186636  caravaggio     900    863
 2               186724  caravaggio     800    541
 3               186639  caravaggio    3239   4501
 4               186671  caravaggio     912   1200)

In [9]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5233 entries, 0 to 5232
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   img_ID  5233 non-null   int64 
 1   artist  5233 non-null   object
 2   height  5233 non-null   int64 
 3   width   5233 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 163.7+ KB


In [10]:
raw_data.nunique()

img_ID    5233
artist      14
height    1609
width     1510
dtype: int64

In [11]:
# Get all paths to .jpg files
extensions = ['.jpg']

raw_museum = get_collection(path='./images/raw_museum', extensions=extensions)
big_museum = get_collection(path='./images/big_museum', extensions=extensions)
large_museum = get_collection(path='./images/large_museum', extensions=extensions)
medium_museum = get_collection(path='./images/medium_museum', extensions=extensions)
small_museum = get_collection(path='./images/small_museum', extensions=extensions)

collections = [big_museum, large_museum, medium_museum, small_museum]

print(f'{len(raw_museum)} images in raw_museum')
print(f'{len(big_museum)} images in big_museum')
print(f'{len(large_museum)} images in large_museum')
print(f'{len(medium_museum)} images in medium_museum')
print(f'{len(small_museum)} images in small_museum:')

5468 images in raw_collection
4276 images in large_collection
4276 images in mid_collection
4276 images in mid_sqr_collection
4276 images in low_sqr_collection:


In [12]:
# Get all paths to .csv files
extensions = ['.csv']

big_museum = get_collection(path='./images/big_data', extensions=extensions)
large_museum = get_collection(path='./images/large_data', extensions=extensions)
medium_museum = get_collection(path='./images/medium_data', extensions=extensions)
small_museum = get_collection(path='./images/small_data', extensions=extensions)

# Turn paths into str
big_museum = [str(i) for i in big_data]
large_museum = [str(i) for i in large_data]
medium_museum = [str(i) for i in medium_data]
small_museum = [str(i) for i in small_data]

# Build museums DataFrame
columns_names=['img_ID', 'artist', 'height', 'width', 'whitespace', 'chiaroscuro',
               'color_01', 'color_02', 'color_03', 'color_04', 'color_05', 
               'color_06', 'color_07', 'color_08', 'color_09', 'color_10']

big_museum = pd.concat((pd.read_csv(file, names=columns_names) for file in big_museum), ignore_index=True)
large_museum = pd.concat((pd.read_csv(file, names=columns_names) for file in large_museum), ignore_index=True)
medium_museum = pd.concat((pd.read_csv(file, names=columns_names) for file in medium_museum), ignore_index=True)
small_museum = pd.concat((pd.read_csv(file, names=columns_names) for file in small_museum), ignore_index=True)

museums = [big_museum, large_museum, medium_museum, small_museum]

In [13]:
[museum.shape for museum in museums]

[(4815, 16), (4649, 16), (4476, 16), (4600, 16)]

In [14]:
[museum['img_ID'].nunique() for museum in museums]

[4815, 4649, 4476, 4600]

I'll keep only the common images to all museums

In [15]:
A = big_museum['img_ID'].isin(large_museum['img_ID'])
B = big_museum['img_ID'].isin(medium_museum['img_ID'])
C = big_museum['img_ID'].isin(small_museum['img_ID'])

big_museum = big_museum[A & B & C]

In [None]:
A = large_museum['img_ID'].isin(big_museum['img_ID'])
B = large_museum['img_ID'].isin(medium_museum['img_ID'])
C = large_museum['img_ID'].isin(small_museum['img_ID'])

large_museum = large_museum[A & B & C]

In [None]:
A = medium_museum['img_ID'].isin(big_museum['img_ID'])
B = medium_museum['img_ID'].isin(large_museum['img_ID'])
C = medium_museum['img_ID'].isin(small_museum['img_ID'])

medium_museum = medium_museum[A & B & C]

In [None]:
A = small_museum['img_ID'].isin(big_museum['img_ID'])
B = small_museum['img_ID'].isin(large_museum['img_ID'])
C = small_museum['img_ID'].isin(medium_museum['img_ID'])

small_museum = small_museum[A & B & C]

In [19]:
# Update museums
museums = [big_museum, large_museum, medium_museum, small_museum]

[museum.shape for museum in museums]

[(4271, 16), (4271, 16), (4271, 16), (4271, 16)]

I'll not only drop the rows but also the images

In [20]:
# IMG_ID are not int64 type but object type, so I'll cast it
for museum in museums:
    museum['img_ID'] = museum['img_ID'].astype(str, errors='ignore')

In [21]:
big_museum.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4271 entries, 0 to 4637
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   img_ID       4271 non-null   object 
 1   artist       4271 non-null   object 
 2   height       4271 non-null   int64  
 3   width        4271 non-null   int64  
 4   whitespace   4271 non-null   float64
 5   chiaroscuro  4271 non-null   float64
 6   color_01     4271 non-null   object 
 7   color_02     4271 non-null   object 
 8   color_03     4271 non-null   object 
 9   color_04     4271 non-null   object 
 10  color_05     4271 non-null   object 
 11  color_06     4271 non-null   object 
 12  color_07     4271 non-null   object 
 13  color_08     4271 non-null   object 
 14  color_09     4271 non-null   object 
 15  color_10     4271 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory usage: 567.2+ KB


In [22]:
for museum in museums:
    museum_name = nameof(museum, globals())
    del_images = 0
    
    for image in museum:
        img_ID = str(image).split('/')[-1].split('.')[0]
        
        # I can use any museum as all have the same images
        if big_museum['img_ID'][big_museum['img_ID'].str.contains(img_ID)].any():
            continue
        elif os.path.exists(image):
            os.remove(image)
            del_images += 1
        
    print(f'{del_images} images deleted from {museum_name}')

0 images deleted from large_collection
0 images deleted from mid_collection
0 images deleted from mid_sqr_collection
0 images deleted from low_sqr_collection


In [23]:
# Update museums
# Get all paths to .jpg files
extensions = ['.jpg']

raw_museum = get_collection(path='./images/raw_museum', extensions=extensions)
big_museum = get_collection(path='./images/big_museum', extensions=extensions)
large_museum = get_collection(path='./images/large_museum', extensions=extensions)
medium_museum = get_collection(path='./images/medium_museum', extensions=extensions)
small_museum = get_collection(path='./images/small_museum', extensions=extensions)

collections = [big_museum, large_museum, medium_museum, small_museum]

print(f'{len(raw_museum)} images in raw_museum')
print(f'{len(big_museum)} images in big_museum')
print(f'{len(large_museum)} images in large_museum')
print(f'{len(medium_museum)} images in medium_museum')
print(f'{len(small_museum)} images in small_museum:')

5468 images in raw_collection
4276 images in large_collection
4276 images in mid_collection
4276 images in mid_sqr_collection
4276 images in low_sqr_collection:


___

Let's take a closer look to the datasets

In [24]:
l_repo = data_report(big_museum)
m_repo = data_report(large_museum)
ms_repo = data_report(medium_museum)
ls_repo = data_report(small_museum)

display('l_repo', 'm_repo', 'ms_repo', 'ls_repo')

COL_N,img_ID,artist,height,width,whitespace,chiaroscuro,color_01,color_02,color_03,color_04,color_05,color_06,color_07,color_08,color_09,color_10
DATA_TYPE,object,object,int64,int64,float64,float64,object,object,object,object,object,object,object,object,object,object
MISSINGS (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UNIQUE_VALUES,4271,14,1,442,248,3871,87,83,88,87,88,90,90,89,94,94
CARDIN (%),100.0,0.33,0.02,10.35,5.81,90.63,2.04,1.94,2.06,2.04,2.06,2.11,2.11,2.08,2.2,2.2

COL_N,img_ID,artist,height,width,whitespace,chiaroscuro,color_01,color_02,color_03,color_04,color_05,color_06,color_07,color_08,color_09,color_10
DATA_TYPE,object,object,int64,int64,float64,float64,object,object,object,object,object,object,object,object,object,object
MISSINGS (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02
UNIQUE_VALUES,4271,14,1,1,2,2380,75,67,77,77,75,76,76,83,74,75
CARDIN (%),100.0,0.33,0.02,0.02,0.05,55.72,1.76,1.57,1.8,1.8,1.76,1.78,1.78,1.94,1.73,1.76

COL_N,img_ID,artist,height,width,whitespace,chiaroscuro,color_01,color_02,color_03,color_04,color_05,color_06,color_07,color_08,color_09,color_10
DATA_TYPE,object,object,int64,int64,float64,float64,object,object,object,object,object,object,object,object,object,object
MISSINGS (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02
UNIQUE_VALUES,4271,14,1,1,2,2380,74,73,80,79,73,78,77,79,82,77
CARDIN (%),100.0,0.33,0.02,0.02,0.05,55.72,1.73,1.71,1.87,1.85,1.71,1.83,1.8,1.85,1.92,1.8

COL_N,img_ID,artist,height,width,whitespace,chiaroscuro,color_01,color_02,color_03,color_04,color_05,color_06,color_07,color_08,color_09,color_10
DATA_TYPE,object,object,int64,int64,float64,float64,object,object,object,object,object,object,object,object,object,object
MISSINGS (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.05,0.09,0.19,0.49
UNIQUE_VALUES,4271,14,1,1,2,1527,74,69,69,68,67,72,70,75,80,77
CARDIN (%),100.0,0.33,0.02,0.02,0.05,35.75,1.73,1.62,1.62,1.59,1.57,1.69,1.64,1.76,1.87,1.8


In [25]:
with open('./data/big_museum/big_museum_clean.csv', "w", newline="") as datafile:
    writer = csv.writer(datafile)            
    writer.writerows(big_museum)

with open('./data/large_museum/large_museum_clean.csv', "w", newline="") as datafile:
    writer = csv.writer(datafile)            
    writer.writerows(large_museum)

with open('./data/medium_museum/medium_museum._cleancsv', "w", newline="") as datafile:
    writer = csv.writer(datafile)            
    writer.writerows(medium_museum)

with open('./data/small_museum/small_museum_clean.csv', "w", newline="") as datafile:
    writer = csv.writer(datafile)            
    writer.writerows(small_museum)