# Machine Learning: Paintings Classification Problem

## 0. Introduction: 
In this project we try to accurately predict which artist painted a given painting. To this end, we train three different classifiers and test them on a dataset of paintings by four different painters: Salvador Dali, Claude Monet, Pablo Picasso and H. Rembrandt.


In [None]:
import numpy as np
from numpy import asarray
import matplotlib.pyplot as plt
import glob
import os
import pandas as pd
import cv2
from skimage.filters import sobel
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import time
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import joblib
import urllib.request
import validators

## 1. Data Processing

### 1.1 Convert Images

This code segment contains all methods that process the original images: each image is resized to a given size (SIZE var.) and then converted to greyscale, after which the images are stored in a numpy array. There is also the option to replace the original images with the resized ones, for easier manipulation at a later time and smaller storage space. Finally, the data_process method is used for preparing the data for the training of the model, while the image_process method takes a single image as its only argument and is used later for demo purposes.

In [None]:
def data_process(string_path, max_per_class=500):
    """Load, resize and grey-scale the images found under each class folder.

    Every directory matched by ``string_path`` is treated as one class: the
    directory name becomes the label of all ``*.jpg`` files inside it.

    Args:
        string_path: Glob pattern matching the class directories,
            e.g. ``'RawData/train/*'``.
        max_per_class: Maximum number of image files taken from each
            directory.

    Returns:
        A ``(train_images, train_labels)`` tuple of numpy arrays holding the
        grey-scale images and the directory-name label of each image.
        Both arrays are empty when nothing matches ``string_path``.
    """
    SIZE = 64  # set the image size conversion

    train_images = []
    train_labels = []

    # Sorted so that the image/label order does not depend on the file system.
    for dir_path in sorted(glob.glob(string_path)):
        # Use os.path instead of splitting on '\\', which only worked with
        # Windows-style paths; normpath drops a possible trailing separator.
        label = os.path.basename(os.path.normpath(dir_path))

        img_paths = sorted(glob.glob(os.path.join(dir_path, '*.jpg')))
        for img_path in img_paths[:max_per_class]:
            img = cv2.imread(img_path, cv2.IMREAD_COLOR)
            if img is None:
                # cv2.imread signals an unreadable file by returning None;
                # skip it instead of crashing inside cv2.resize.
                print(f'Skipping unreadable image: {img_path}')
                continue
            img = cv2.resize(img, (SIZE, SIZE))
            # cv2.imwrite(img_path, img)   # Use this to replace original images with resized ones
            # convert to greyscale
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            train_images.append(img)
            train_labels.append(label)

    train_images = np.array(train_images)
    train_labels = np.array(train_labels)

    return train_images, train_labels

In [None]:
# Load both image sets and divide the pixel values by 255.

# Images used for training/testing/validation
x_train, train_labels = data_process('RawData/train/*')
x_train = np.true_divide(x_train, 255.0)
x_train.shape

# Images used for the demonstration
x_demo, demo_labels = data_process('RawData/demo/*')
x_demo = np.true_divide(x_demo, 255.0)

### 1.2 Extract features from the images & create the feature matrix

This segment contains the functions that are used to extract features from the images. The input is a numpy array containing the original images (as transformed and prepared by the previous functions). The feature_extractor() method takes the numpy array as input and calculates 4 different Gabor features per image, as well as the hog features of each image and its pixel values. Its output is a pandas dataframe with each feature as a column (plus a hog_size var that is used later for processing the final feature matrix).

In [None]:
# feature extraction function
def feature_extractor(dataset):
    """Compute per-pixel feature columns for every image in ``dataset``.

    For each image the following columns are produced, one row per pixel:
    ``Pixel_Value`` (the flattened image), ``Gabor1`` .. ``Gabor4`` (the
    image filtered with four Gabor kernels) and ``hog`` (the HOG descriptor
    of the image resized to 64x128, zero-padded to the number of pixels).

    Args:
        dataset: Array of images indexed along the first axis.

    Returns:
        A ``(image_dataset, hog_size)`` tuple: the per-image dataframes
        stacked vertically, and the shape tuple of the last HOG descriptor.
        For an empty ``dataset`` an empty dataframe and ``(0,)`` are
        returned.

    Raises:
        ValueError: If the HOG descriptor is longer than the number of
            pixels of an image, so it cannot be stored in the ``hog`` column.
    """
    if dataset.shape[0] == 0:
        return pd.DataFrame(), (0,)

    # The Gabor kernels do not depend on the image: build them once.
    ksize = 9
    lamda = np.pi / 4.
    gamma = 0.5
    gabor_kernels = []
    for theta_step in range(2):
        theta = theta_step / 4. * np.pi
        for sigma in range(1, 3):  # range(1,3) default
            gabor_kernels.append(
                cv2.getGaborKernel((ksize, ksize), sigma, theta, lamda, gamma, 0, ktype=cv2.CV_32F)
            )

    per_image_frames = []
    hog_size = (0,)
    for img in dataset:  # one more dimension if the images are in color
        # Pixel values
        pixel_values = img.reshape(-1)
        columns = {'Pixel_Value': pixel_values}

        # Gabor: columns are labelled Gabor1, Gabor2, etc.
        # NOTE(review): the output depth is CV_8UC3 although callers in this
        # file pass images divided by 255.0; confirm this is intended.
        for num, kernel in enumerate(gabor_kernels, start=1):
            fimg = cv2.filter2D(img, cv2.CV_8UC3, kernel)
            columns['Gabor' + str(num)] = fimg.reshape(-1)

        # Hog (the visualisation image is not needed, only the descriptor)
        resized_img = cv2.resize(img, (64, 128))
        fd = hog(resized_img, orientations=9, pixels_per_cell=(8, 8),
                 cells_per_block=(2, 2))
        hog_size = fd.shape

        if fd.size > pixel_values.size:
            raise ValueError(
                f'HOG descriptor ({fd.size} values) does not fit in an image '
                f'of {pixel_values.size} pixels'
            )
        # Zero-pad the descriptor so it has one value per pixel row.
        hog_column = np.zeros(pixel_values.size)
        hog_column[:fd.size] = fd
        columns['hog'] = hog_column

        per_image_frames.append(pd.DataFrame(columns))

    # A single concat avoids re-copying the growing dataframe on every image.
    image_dataset = pd.concat(per_image_frames, axis=0)
    return image_dataset, hog_size

This segment contains the method that is used to create the final feature matrix. It takes as input the pandas dataframe previously created by the feature_extractor method and reshapes it by flattening the columns and concatenating them. The end result is a pandas dataframe that contains the extracted features as a row for each image. That is, each row is essentially the feature vector corresponding to an image.

In [None]:
def create_feature_matrix(x_train, image_features, hog_size=3780):
    """Flatten the per-pixel feature table into one feature vector per image.

    ``image_features`` is read in consecutive chunks of 64*64 rows, one chunk
    per image. Each output row is the concatenation of the chunk's
    ``Pixel_Value``, ``Gabor1`` .. ``Gabor4`` columns followed by the first
    ``hog_size`` values of its ``hog`` column.

    Args:
        x_train: Array of images; only its first dimension (the number of
            images) is used.
        image_features: Dataframe with the columns listed above and at least
            64*64 rows per image.
        hog_size: Number of leading ``hog`` values to keep per image, given
            as an int or as a shape tuple such as ``(3780,)``.

    Returns:
        A dataframe with one row per image and integer column labels.
        An empty dataframe is returned when ``x_train`` holds no images.

    Raises:
        ValueError: If ``image_features`` has fewer than 64*64 rows per image.
    """
    SIZE = 64
    pixels_per_image = SIZE * SIZE
    image_numbers = x_train.shape[0]

    if isinstance(hog_size, (tuple, list)):
        hog_length = int(hog_size[0])
    else:
        hog_length = int(hog_size)

    if image_numbers == 0:
        return pd.DataFrame()

    rows_needed = image_numbers * pixels_per_image
    if len(image_features) < rows_needed:
        raise ValueError(
            f'image_features has {len(image_features)} rows, '
            f'expected at least {rows_needed}'
        )

    dense_columns = ['Pixel_Value', 'Gabor1', 'Gabor2', 'Gabor3', 'Gabor4']
    dense = image_features[dense_columns].iloc[:rows_needed].to_numpy(dtype=float)
    # (rows, columns) -> (image, pixel, column) -> (image, column, pixel),
    # then flatten so each image row holds its columns one after the other.
    dense = dense.reshape(image_numbers, pixels_per_image, len(dense_columns))
    dense = dense.transpose(0, 2, 1).reshape(image_numbers, -1)

    # Only the leading hog_length entries of each chunk are kept.
    hog_values = image_features['hog'].iloc[:rows_needed].to_numpy(dtype=float)
    hog_values = hog_values.reshape(image_numbers, pixels_per_image)[:, :hog_length]

    # Built in one step instead of growing a dataframe row by row.
    feature_matrix = pd.DataFrame(np.hstack([dense, hog_values]))
    return feature_matrix

Use the above functions to convert the .jpg images in train/demo folders to a feature matrix corresponding to each, then export the feature matrix to .csv files for future use. Export also a PCA reduced feature matrix for visualization. In total, after we run the next 4 cells we should have generated four distinct files in our main project directory: 
1. feature_matrix.csv
2. feature_matrix_pca.csv 
3. feature_matrix_demo.csv
4. feature_matrix_demo_pca.csv

In [None]:
# Data for training

# Extract the per-pixel feature table from the training images.
image_features, hog_size = feature_extractor(x_train)
# Flatten it into one feature vector (row) per image.
feature_matrix = create_feature_matrix(x_train, image_features)

# Add the painter label of every image as the target column.
feature_matrix['painter'] = train_labels
# reset_index returns a new dataframe; without the assignment the call had no effect.
feature_matrix = feature_matrix.reset_index(drop=True)

# Export the feature matrix to a .csv file
feature_matrix.to_csv('feature_matrix.csv', index=False)

In [None]:
#feature_matrix.head()

### 1.3 Train Data PCA & Visualization

In [None]:
# PCA for visualization (run this once; afterwards the exported dataset can be loaded)

# Load the feature matrix exported in the previous section.
feature_matrix = pd.read_csv('feature_matrix.csv')

# Separate the feature columns from the painter labels.
X = feature_matrix.drop(columns='painter')
y = feature_matrix['painter']

# Standardise the features, then reduce them to three PCA components.
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_norm)

# Name the components and attach the painter column again.
feature_matrix_pca = pd.DataFrame(
    X_pca,
    columns=['pca_1', 'pca_2', 'pca_3'],
).assign(painter=y)

feature_matrix_pca.to_csv('feature_matrix_pca.csv', index=False)

In [None]:
# 3-D scatter plot of the PCA-reduced dataset, coloured by painter.
fig = px.scatter_3d(
    feature_matrix_pca,
    x='pca_1',
    y='pca_2',
    z='pca_3',
    color='painter',
)
fig.show()

### 1.4 Create Demo Feature Matrix

In [None]:
# Data for DEMO

# Extract the per-pixel feature table from the demo images.
image_features, hog_size = feature_extractor(x_demo)
# Flatten it into one feature vector (row) per image.
feature_matrix_demo = create_feature_matrix(x_demo, image_features)

# Add the painter label of every image as the target column.
feature_matrix_demo['painter'] = demo_labels
# reset_index returns a new dataframe; without the assignment the call had no effect.
feature_matrix_demo = feature_matrix_demo.reset_index(drop=True)

# Export the demo feature matrix to a .csv file
feature_matrix_demo.to_csv('feature_matrix_demo.csv', index=False)

### 1.5 Demo Data PCA & Visualization

In [None]:
# PCA for demo visualization

# feature_matrix_demo = pd.read_csv('feature_matrix_demo.csv')  # LOAD if you have already created it in the previous section

# Separate the feature columns from the painter labels.
X = feature_matrix_demo.drop(columns='painter')
y = feature_matrix_demo['painter']

# Standardise the features, then reduce them to three PCA components.
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_norm)

# Name the components and attach the painter column again.
feature_matrix_pca = pd.DataFrame(
    X_pca,
    columns=['pca_1', 'pca_2', 'pca_3'],
).assign(painter=y)

feature_matrix_pca.to_csv('feature_matrix_demo_pca.csv', index=False)

In [None]:
# 3-D scatter plot of the PCA-reduced demo dataset, coloured by painter.
fig = px.scatter_3d(
    feature_matrix_pca,
    x='pca_1',
    y='pca_2',
    z='pca_3',
    color='painter',
)
fig.show()

### 1.6 Feature selection

In this section we import the feature matrix that we created and apply scikit-learn's SelectKBest method (with the chi-squared score) in order to rank the features and select the best ones. We use a basic random forest classifier as a benchmark.

In [None]:
# Reload the exported feature matrix from disk (use this if it was already created in a previous section).
feature_matrix = pd.read_csv('feature_matrix.csv')

In [None]:
# Rank feature-subset sizes: for k = 1000, 2000, ... keep the k best features
# (chi-squared score) and cross-validate a random forest on them.
X = feature_matrix.drop(columns='painter')
y = feature_matrix['painter']

from sklearn.feature_selection import SelectKBest, chi2
# cross_val_score was used below without ever being imported (NameError).
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Hold-out split with standardised features. These variables are not used by
# the loop below, which works on the unscaled X: chi2 requires non-negative
# feature values, and standardised features can be negative.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Collected as [mean cross-val score in percent, number of features] pairs.
score_features = []

n_features = X.shape[1]
for i in range(1000, n_features, 1000):
    rf = RandomForestClassifier()
    # Selecting inside the pipeline re-fits SelectKBest on the training folds
    # only, so the held-out fold does not leak into the feature ranking.
    model = make_pipeline(SelectKBest(chi2, k=i), rf)
    scores = cross_val_score(model, X, y, cv=3)
    # Convert to percent before rounding to avoid artefacts such as 56.779999.
    scores = round(scores.mean() * 100, 2)
    score_features.append([scores, i])
    print(f'Mean score for {i} features is {scores}')

In [None]:
# Turn the [score, k] pairs into a 2-D array: column 0 = score, column 1 = k.
score_features = np.array(score_features)
print(score_features[:, 0])

# argmax looks at the score column only and returns a single row (the first
# one on ties). The previous np.where over the whole array could match several
# rows, which made int(...) fail, and it also shadowed the builtin max.
best_row = int(np.argmax(score_features[:, 0]))
bestK = int(score_features[best_row, 1])

print(f"The best feature number that yielded the highest accuracy based on the selectKBest: {bestK}")
# Re-fit the selector with the winning k on the full dataset.
select = SelectKBest(score_func=chi2, k=bestK)
z = select.fit_transform(X, y)

In [None]:
# Wrap the selected feature columns in a dataframe, attach the painter
# labels and export the result to a .csv file.
feature_matrix_kbest = pd.DataFrame(z).assign(painter=y)
feature_matrix_kbest.tail()
feature_matrix_kbest.to_csv('feature_matrix_kbest.csv', index=False)

In [None]:
# Preview the first rows of the reduced feature matrix.
feature_matrix_kbest.head()

In [None]:
# Scatter plot of the cross-validation score (x) against the number of
# selected features (y); `labels` overrides the default axis titles.
fig = px.scatter(
    x=score_features[:, 0],
    y=score_features[:, 1],
    labels={'x': 'cross-val score (cv=3)', 'y': 'number of features'},
)
fig.show()