<a href="https://colab.research.google.com/github/neekonsu/eqgen_ipynb/blob/master/Correlation_Between_Equine_Deafness_and_Facial_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Initial Setup


###Import main libraries

In [0]:
# import main ML libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
# for image processing, we import openCV
import cv2
# for interfacing with filesystem
import os
# for using randomness
from random import shuffle
# for listing progress bars where we need them
from tqdm import tqdm
# for converting png to jpg
from PIL import Image
# for building recursive file array
from glob import glob 
# FORMAT MATPLOTLIB
%matplotlib inline

###Clone Git Repository (batch)

In [2]:
try:
  assert os.listdir('/content/eqgen_batch')
except:
  !git clone https://github.com/neekonsu/eqgen_batch.git

Cloning into 'eqgen_batch'...
remote: Enumerating objects: 93, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (93/93), done.[K
remote: Total 93 (delta 0), reused 93 (delta 0), pack-reused 0[K
Unpacking objects: 100% (93/93), done.


###Set initial variables

In [0]:
# paths to raw data
batch_path = '/content/eqgen_batch'
false_path = '/content/eqgen_batch/0'
true_path =  '/content/eqgen_batch/1'
# feature - label variables
x = []
y = []

### **Authenticate with google**

In [0]:
# from google.colab import drive
# # Mount google drive filesystem to /content/gdrive/My Drive/*
# drive.mount('/content/gdrive')

###Init drive service (v3)

In [5]:
print('Hearing horses (0)')
!ls '/content/eqgen_batch/0'

Hearing horses (0)
abigail-verberne-1179842-unsplash.jpg
adam-jang-314067-unsplash.jpg
alexandra-zota-1242250-unsplash.jpg
alex-blajan-119661-unsplash.jpg
alice-donovan-rouse-115131-unsplash.jpg
annie-spratt-207738-unsplash.jpg
annie-spratt-587023-unsplash.jpg
carlee-dittemore-1226247-unsplash.jpg
courtney-miller-1230955-unsplash.jpg
crystal-huff-757104-unsplash.jpg
danny-gallegos-355559-unsplash.jpg
darius-soodmand-118384-unsplash.jpg
florian-van-duyn-387776-unsplash.jpg
gene-devine-476058-unsplash.jpg
ghost-presenter-425367-unsplash.jpg
james-coleman-1210076-unsplash.jpg
joanne-o-keefe-270785-unsplash.jpg
liam-martens-716280-unsplash.jpg
lily-banse-460652-unsplash.jpg
lukas-l-452028-unsplash.jpg
melanie-hartshorn-348846-unsplash.jpg
michael-mroczek-115985-unsplash.jpg
nicolas-j-leclercq-1141089-unsplash.jpg
paolo-nicolello-1171866-unsplash.jpg
peter-kisteman-329830-unsplash.jpg
randy-fath-667618-unsplash.jpg
randy-fath-736175-unsplash.jpg
revolt-386471-unsplash.jpg
samantha-scholl-15

In [6]:
print('Deaf horses (1)')
!ls '/content/eqgen_batch/1'

Deaf horses (1)
'1655930_10152876159715960_1332774283920715435_n (1).jpg'
 1655930_10152876159715960_1332774283920715435_n.jpg
 16962_1201287838262_1410553281_30489563_4324513_n-300x225.jpg
 19139_1309272405249_1033766008_30956817_4480404_n-225x300.jpg
 25272_1344971737710_1033766008_31050554_2864086_n-300x238.jpg
'37852155_10156550080655960_5003683379976077312_o (1).jpg'
 37852155_10156550080655960_5003683379976077312_o.jpg
'37853970_10156550080000960_4414984096887865344_n (1).png'
 37853970_10156550080000960_4414984096887865344_n.png
 41706122_10156677046735960_5167040244876312576_n.jpg
 42297474_10156687915410960_1788544808642412544_n.png
 42749174_10156704324275960_6505795329294598144_n.png
'44032542_10156743316045960_6073517715744620544_n (1).jpg'
 44032542_10156743316045960_6073517715744620544_n.jpg
 44395302_10156758646395960_5481495541013544960_n.jpg
'45742689_10156808706615960_8187578173060284416_o (1).jpg'
 45742689_10156808706615960_8187578173060284416_o.jpg
 45754048_101568

#Handle Data

##def one-hot encoder function

In [7]:
# (1) indicates deafness, (0) indicates hearing; encoded in onehot
def ohl(e):
  if e == 1:
    ohl = np.array([1,0])
  elif e == 0:
    ohl = np.array([0,1])
  return ohl
# checking function behavior
print("deaf:")
print(ohl(1))

print("hearing:")
print(ohl(0))

deaf:
[1 0]
hearing:
[0 1]


##def image filetype homogenizer (to .jpg)

###.jpeg -> .jpg

In [8]:
# homogenize hearing horses (0)
for filename in tqdm(os.listdir(false_path)):
  # set iterator count for refactored filenames
  i = 0
  # convert .jpeg to .jpg for consistency and shorter filename length
  if filename.endswith(".jpeg"):
    # split filename by base and extention (base.ext -> 'base', 'ext')
    # send filename components to respective variables
    pre, ext = os.path.splitext(filename)
    # rename file as iterator.jpg
    # I am now realizing that the pre, ext filename split was never used
    os.rename(os.path.join(false_path, filename), str(i) + ".jpg")
#   # increment iterator
  i += 1
# print refactored filelist to confirm refactoring success
print(os.listdir(false_path))

100%|██████████| 31/31 [00:00<00:00, 13251.47it/s]

['liam-martens-716280-unsplash.jpg', 'alexandra-zota-1242250-unsplash.jpg', 'peter-kisteman-329830-unsplash.jpg', 'crystal-huff-757104-unsplash.jpg', 'courtney-miller-1230955-unsplash.jpg', 'michael-mroczek-115985-unsplash.jpg', 'samantha-scholl-157435-unsplash.jpg', 'gene-devine-476058-unsplash.jpg', 'darius-soodmand-118384-unsplash.jpg', 'alice-donovan-rouse-115131-unsplash.jpg', 'alex-blajan-119661-unsplash.jpg', 'james-coleman-1210076-unsplash.jpg', 'annie-spratt-207738-unsplash.jpg', 'nicolas-j-leclercq-1141089-unsplash.jpg', 'randy-fath-667618-unsplash.jpg', 'danny-gallegos-355559-unsplash.jpg', 'randy-fath-736175-unsplash.jpg', 'revolt-386471-unsplash.jpg', 'ghost-presenter-425367-unsplash.jpg', 'florian-van-duyn-387776-unsplash.jpg', 'adam-jang-314067-unsplash.jpg', 'lily-banse-460652-unsplash.jpg', 'joanne-o-keefe-270785-unsplash.jpg', 'vincent-botta-276146-unsplash.jpg', 'annie-spratt-587023-unsplash.jpg', 'carlee-dittemore-1226247-unsplash.jpg', 'waqas-arshad-1158480-unsplas




In [9]:
# homogenize deaf horses (1)
for filename in tqdm(os.listdir(true_path)):
  # set iterator count for refactored filenames
  i = 0
  # convert .jpeg to .jpg for consistency and shorter filename length
  if filename.endswith(".jpeg"):
    # split filename by base and extention (base.ext -> 'base', 'ext')
    # send filename components to respective variables
    pre, ext = os.path.splitext(filename)
    # rename file as iterator.jpg
    # I am now realizing that the pre, ext filename split was never used
    os.rename(os.path.join(true_path, filename), str(i) + ".jpg")
#   # increment iterator
  i += 1
# print refactored filelist to confirm refactoring success
print(os.listdir(true_path))

100%|██████████| 67/67 [00:00<00:00, 6590.64it/s]

['withpearl81014.jpg', '45783547_10156808706545960_418658615851745280_o (1).jpg', '37853970_10156550080000960_4414984096887865344_n (1).png', 'indy809.jpg', 'Blue - Deaf Horse.jpg', '46384937_10156827654140960_4352674463704154112_o.jpg', '19139_1309272405249_1033766008_30956817_4480404_n-225x300.jpg', '942208_10151663561390960_818164739_n (1).jpg', '37852155_10156550080655960_5003683379976077312_o (1).jpg', '44395302_10156758646395960_5481495541013544960_n.jpg', '46454524_10156827654155960_4596089175404445696_o.jpg', '942208_10151663561390960_818164739_n.jpg', '48418750_10156910169320960_302738908821585920_n.jpg', 'hqdefault.jpg', '1655930_10152876159715960_1332774283920715435_n (1).jpg', '41706122_10156677046735960_5167040244876312576_n.jpg', '46118114_10156808706765960_8705940662232547328_o.jpg', '42749174_10156704324275960_6505795329294598144_n.png', 'pearl2jogcart.jpg', 'karatsteppinout5413.jpg', '88A.jpg', '45801763_10156808706670960_5762628556723060736_o.jpg', '42297474_101566879




###.png -> .jpg

In [10]:
# homogenize deaf horses (1)

# construct file array
pngs = glob('/content/eqgen_batch/1/*.png')
# iterate array^^ for .png -> .jpg conversion
print('converting .png: \n')
for j in tqdm(pngs):
  # open image for editing
  img = cv2.imread(j)
  # write new filetype
  cv2.imwrite(j[:-3] + 'jpg', img)
# print refactored filelist to confirm refactoring success
print(pngs)
# remove.png residue
print('removing duplicates: \n')
for i in tqdm(pngs):
  os.remove(i)
print('.png ->.jpg done.')

  0%|          | 0/4 [00:00<?, ?it/s]

converting .png: 



100%|██████████| 4/4 [00:00<00:00, 36.69it/s]
100%|██████████| 4/4 [00:00<00:00, 3946.65it/s]

['/content/eqgen_batch/1/37853970_10156550080000960_4414984096887865344_n (1).png', '/content/eqgen_batch/1/42749174_10156704324275960_6505795329294598144_n.png', '/content/eqgen_batch/1/42297474_10156687915410960_1788544808642412544_n.png', '/content/eqgen_batch/1/37853970_10156550080000960_4414984096887865344_n.png']
removing duplicates: 

.png ->.jpg done.





###Refactor Images

In [11]:
# check paths

#(0)
print("hearing path:")
print(os.listdir(false_path))
# (1)
print("deaf path:")
print(os.listdir(true_path))

hearing path:
['liam-martens-716280-unsplash.jpg', 'alexandra-zota-1242250-unsplash.jpg', 'peter-kisteman-329830-unsplash.jpg', 'crystal-huff-757104-unsplash.jpg', 'courtney-miller-1230955-unsplash.jpg', 'michael-mroczek-115985-unsplash.jpg', 'samantha-scholl-157435-unsplash.jpg', 'gene-devine-476058-unsplash.jpg', 'darius-soodmand-118384-unsplash.jpg', 'alice-donovan-rouse-115131-unsplash.jpg', 'alex-blajan-119661-unsplash.jpg', 'james-coleman-1210076-unsplash.jpg', 'annie-spratt-207738-unsplash.jpg', 'nicolas-j-leclercq-1141089-unsplash.jpg', 'randy-fath-667618-unsplash.jpg', 'danny-gallegos-355559-unsplash.jpg', 'randy-fath-736175-unsplash.jpg', 'revolt-386471-unsplash.jpg', 'ghost-presenter-425367-unsplash.jpg', 'florian-van-duyn-387776-unsplash.jpg', 'adam-jang-314067-unsplash.jpg', 'lily-banse-460652-unsplash.jpg', 'joanne-o-keefe-270785-unsplash.jpg', 'vincent-botta-276146-unsplash.jpg', 'annie-spratt-587023-unsplash.jpg', 'carlee-dittemore-1226247-unsplash.jpg', 'waqas-arshad-1

In [12]:
# sequence and label images of hearing horses (0)
i = 0
for filename in tqdm(os.listdir(false_path)):
  os.rename(os.path.join(false_path, filename), os.path.join(false_path, str(i) + '.jpg'))
  i += 1
print('done.')

100%|██████████| 31/31 [00:00<00:00, 19464.58it/s]

done.





In [13]:
# sequence and label images of hearing horses (1)
i = 0
for filename in tqdm(os.listdir(true_path)):
  os.rename(os.path.join(true_path, filename), os.path.join(true_path, str(i) + '.jpg'))
  i += 1
print('done.')

100%|██████████| 46/46 [00:00<00:00, 17496.87it/s]

done.





###Load Images

In [22]:
files1 = glob(true_path + '/*.jpg')
for myFile in files1:
    print(myFile)
    image = cv2.imread (myFile, 1)
    small = cv2.resize(image, (50, 50))
    x.append(image)
    y.append(ohl(1))
print('Y-data shape:', np.array(y).shape)

/content/eqgen_batch/1/2.jpg
/content/eqgen_batch/1/41.jpg
/content/eqgen_batch/1/9.jpg
/content/eqgen_batch/1/23.jpg
/content/eqgen_batch/1/29.jpg
/content/eqgen_batch/1/44.jpg
/content/eqgen_batch/1/40.jpg
/content/eqgen_batch/1/31.jpg
/content/eqgen_batch/1/32.jpg
/content/eqgen_batch/1/3.jpg
/content/eqgen_batch/1/20.jpg
/content/eqgen_batch/1/43.jpg
/content/eqgen_batch/1/27.jpg
/content/eqgen_batch/1/30.jpg
/content/eqgen_batch/1/7.jpg
/content/eqgen_batch/1/35.jpg
/content/eqgen_batch/1/19.jpg
/content/eqgen_batch/1/8.jpg
/content/eqgen_batch/1/4.jpg
/content/eqgen_batch/1/21.jpg
/content/eqgen_batch/1/1.jpg
/content/eqgen_batch/1/12.jpg
/content/eqgen_batch/1/26.jpg
/content/eqgen_batch/1/5.jpg
/content/eqgen_batch/1/13.jpg
/content/eqgen_batch/1/37.jpg
/content/eqgen_batch/1/18.jpg
/content/eqgen_batch/1/11.jpg
/content/eqgen_batch/1/36.jpg
/content/eqgen_batch/1/25.jpg
/content/eqgen_batch/1/10.jpg
/content/eqgen_batch/1/45.jpg
/content/eqgen_batch/1/33.jpg
/content/eqgen_bat

In [23]:
files0 = glob(false_path + '/*.jpg')
for myFile in files0:
    print(myFile)
    image = cv2.imread (myFile, 1)
    small = cv2.resize(image, (50, 50))
    x.append(image)
    y.append(ohl(0))
print('X-data shape:', np.array(x).shape)
print('Y-data shape:', np.array(y).shape)

/content/eqgen_batch/0/2.jpg
/content/eqgen_batch/0/9.jpg
/content/eqgen_batch/0/23.jpg
/content/eqgen_batch/0/29.jpg
/content/eqgen_batch/0/3.jpg
/content/eqgen_batch/0/20.jpg
/content/eqgen_batch/0/27.jpg
/content/eqgen_batch/0/30.jpg
/content/eqgen_batch/0/7.jpg
/content/eqgen_batch/0/19.jpg
/content/eqgen_batch/0/8.jpg
/content/eqgen_batch/0/4.jpg
/content/eqgen_batch/0/21.jpg
/content/eqgen_batch/0/1.jpg
/content/eqgen_batch/0/12.jpg
/content/eqgen_batch/0/26.jpg
/content/eqgen_batch/0/5.jpg
/content/eqgen_batch/0/13.jpg
/content/eqgen_batch/0/18.jpg
/content/eqgen_batch/0/11.jpg
/content/eqgen_batch/0/25.jpg
/content/eqgen_batch/0/10.jpg
/content/eqgen_batch/0/15.jpg
/content/eqgen_batch/0/0.jpg
/content/eqgen_batch/0/14.jpg
/content/eqgen_batch/0/17.jpg
/content/eqgen_batch/0/28.jpg
/content/eqgen_batch/0/6.jpg
/content/eqgen_batch/0/22.jpg
/content/eqgen_batch/0/16.jpg
/content/eqgen_batch/0/24.jpg
X-data shape: (307,)
