In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from functools import reduce
import cv2 
import math
from mpl_toolkits.axes_grid1 import ImageGrid
from sklearn.decomposition import PCA

In [2]:
# When launched from inside notebooks/, step up one level so that the
# relative data/ paths used below resolve from the parent directory.
_cwd_parent, _cwd_name = os.path.split(os.getcwd())
if _cwd_name == 'notebooks':
    os.chdir(_cwd_parent)
print(os.getcwd())

/home/fitzaudoen/apps/oculardisease_recog


In [3]:
# Apply seaborn's styling, then switch to the 'talk' plotting context.
# NOTE(review): keep this order -- sns.set() presumably also resets the context,
# so calling it after set_context would undo 'talk'; confirm against seaborn docs.
sns.set()
sns.set_context('talk')

In [4]:
# Load the dataset table. The path is relative, so it is resolved against the
# current working directory of the kernel.
df = pd.read_csv('data/full_df.csv')

In [6]:
def diagnosis_identifier(row):
    '''
    Build a compact diagnosis code for one record.

    Each of the label columns N, D, G, C, A, H, M, O whose value converts to
    the integer 1 contributes its letter, always in that fixed order.

    Keyword Arguments:
    row -- mapping-like record (e.g. a dataframe row) indexable by the eight
        label keys

    Returns:
    The concatenated letters of all flagged labels, or the placeholder 'NAN'
    when none of them is flagged.
    '''
    label_keys = ('N', 'D', 'G', 'C', 'A', 'H', 'M', 'O')
    flagged = [key for key in label_keys if int(row[key]) == 1]
    return ''.join(flagged) if flagged else 'NAN'

# Tag every record with its combined diagnosis code (computed row by row).
df['diag_id'] = df.apply(diagnosis_identifier, axis=1)

In [8]:
def extract_img_features(fn, imgdir='data/preprocessed_images'):
    '''
    Load a preprocessed fundus image and flatten it into a single row vector.

    Keyword Arguments:
    fn -- filename of the image, relative to imgdir
    imgdir -- directory containing the preprocessed images

    Returns:
    Array of shape (1, 512*512*3) holding the pixel values read by cv2.imread.

    Raises:
    ValueError -- if the image cannot be read at all, or if it does not hold
        exactly 512*512*3 values (i.e. it is not 512 px by 512 px).
    '''
    path = os.path.join(imgdir, fn)
    img = cv2.imread(path)

    # cv2.imread reports a missing or undecodable file by returning None rather
    # than raising. Check for it explicitly so the error names the real problem
    # instead of blaming the image size.
    if img is None:
        raise ValueError('Unable to read image: {}'.format(path))

    try:
        # Only a size mismatch is expected here; anything else should propagate.
        return img.reshape(1, 512*512*3)
    except ValueError as err:
        raise ValueError('Unable to reshape image. Ensure the image is 512 px by 512 px') from err


def df_to_arrays(df, imgdir='data/preprocessed_images'):
    '''
    Split a dataframe of image records into its raw values and a matrix of
    flattened pixel data, one image row per dataframe row.

    Keyword Arguments:
    df -- dataframe of image data. Must have a 'filename' column. Images must
        conform to the requirements of the extract_img_features function
    imgdir -- location of the images referenced in the dataframe filename column

    Returns:
    data_arr: the dataframe's values (df.values), rows in the original order
    img_arr: image data in row vector form, stacked in the same row order;
        None when the dataframe has no rows

    Raises:
    ValueError -- if the dataframe has no 'filename' column
    '''

    # Extract data from dataframe
    data_arr = df.values
    data_cols = list(df.columns.values)
    # Make sure filename is passed
    if 'filename' not in data_cols:
        raise ValueError('Dataframe must contain a filename column.')

    # Resolve the filename position once instead of rebuilding a mask per row.
    fn_idx = data_cols.index('filename')

    # Load every image first and stack once at the end; concatenating inside
    # the loop would re-copy the whole accumulated array on each iteration.
    img_rows = [extract_img_features(row[fn_idx], imgdir) for row in data_arr]
    img_arr = np.concatenate(img_rows) if img_rows else None

    return data_arr, img_arr

In [12]:
# Work on a 500-row random subset to keep image loading and PCA tractable.
# A fixed random_state makes a fresh "Restart & Run All" draw the same rows.
data, images = df_to_arrays(df.sample(500, random_state=42))

In [18]:
# Distinct diagnosis codes present in the sample. The column is looked up by
# name rather than by a hard-coded position, so it survives column changes.
np.unique(data[:, list(df.columns).index('diag_id')])

array(['A', 'AH', 'AO', 'C', 'CO', 'D', 'DA', 'DC', 'DCO', 'DG', 'DGO',
       'DH', 'DM', 'DMO', 'DO', 'G', 'GA', 'GH', 'GM', 'GO', 'H', 'HO',
       'M', 'MO', 'N', 'O'], dtype=object)

In [20]:
# PCA configured with a fractional n_components.
# NOTE(review): per scikit-learn, a float in (0, 1) is expected to keep just
# enough components to explain that share of the variance (70% here) --
# confirm for the installed version.
pca = PCA(n_components=0.7)

In [None]:
# Fit the PCA on the flattened image rows returned by df_to_arrays
# (one row per sampled image).
pca.fit(images)