In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import keras, os, time
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Notebook configuration: tab-separated feature table to read and the
# column that is used as the classification label.
filename = './data/feature-data-dummy/0P5X_euk.txt'
target = 'genus'
print('Target column is ', target, sep=' ')


Target column is  genus


In [4]:
# Load data
# The file is tab-separated; low_memory=False makes pandas read the file in
# one pass instead of in chunks.
X = pd.read_csv(filename, sep='\t', low_memory=False)

n_rows, n_cols = X.shape
print('Shape of the dataset:', (n_rows, n_cols))
print('Number of cols', n_cols)
print('Number of rows', n_rows)

Shape of the dataset: (4172, 218)
Number of cols 218
Number of rows 4172


In [5]:
# Remove rows without target
# A sample whose label is missing cannot be used for supervised training,
# so it is dropped (in place) before labels and features are separated.
X.dropna(subset=[target], axis='index', inplace=True)
print('Shape after removing rows with no target:', X.shape)

Shape after removing rows with no target: (3586, 218)


In [6]:
# Separate data from labels
# pop() returns the label column and removes it from the feature table in
# the same step.
y = X.pop(target)


In [7]:
# Remove empty columns
# count() gives the number of non-null entries per column; a column with a
# count of zero carries no information.
non_null_counts = X.count()
empty_cols = non_null_counts[non_null_counts == 0].index.tolist()
if empty_cols:
    X.drop(empty_cols, axis='columns', inplace=True)
print("Shape after dropping empty columns", X.shape)

Shape after dropping empty columns (3586, 103)


In [8]:
# Remove other columns that shouldn't be used
#
# Every column is assigned to an explicit bucket below. Columns that end up
# in no bucket are reported, so that a column appearing in a new export of
# the data does not go unnoticed (note: they are reported, not removed).

# Columns that are good for the dataset
size_cols=['maj_axis_len','min_axis_len','area','aspect_ratio','eccentricity','estimated_volume','file_size','image_height','image_width','orientation','solidity',]
color_cols=sorted(X.columns[X.columns.str.contains('intensity')])

# Columns that should absolutely be removed
leakage_cols=['class','species','family','empire','kingdom','order','phylum']

# Other columns
time_cols=['acquisition_time']
useless_cols=sorted(X.columns[X.columns.str.contains('modif')])+['_id','extension','filename','group_id','tags','upload_id']
boh_cols=['multiple_species','partially_cropped'] # These columns are mostly nan, but I suspect that these nan should be False


keep_cols=size_cols+color_cols
remove_cols=leakage_cols+useless_cols+time_cols+boh_cols


# Build the lookup once instead of concatenating the two lists for every column
accounted_cols=set(remove_cols)|set(keep_cols)
remaining_cols=[col for col in X.columns if col not in accounted_cols]
if len(remaining_cols)!=0:
    print("There are still some columns that you didn't take into account!")
    print(remaining_cols)

# errors='ignore': a column listed above may already be gone (for instance
# because it was entirely empty and was removed with the empty columns);
# that must not abort the notebook with a KeyError.
X.drop(remove_cols,axis=1,inplace=True,errors='ignore')

print('Shape after removing the bad columns:',X.shape)

Shape after removing the bad columns: (3586, 67)


In [9]:
# Columns with categorical values
#
# Columns are grouped by dtype family with the pandas type predicates.
# Comparing against the strings 'int' / 'float' only matches one specific
# width (and which integer width 'int' means depends on the platform), so
# valid numeric columns could be reported as an unknown type.
categoric_cols = [col for col in X.columns if pd.api.types.is_object_dtype(X[col])]
int_cols = [col for col in X.columns if pd.api.types.is_integer_dtype(X[col])]
float_cols = [col for col in X.columns if pd.api.types.is_float_dtype(X[col])]

# Anything that is neither object, integer nor float (e.g. bool, datetime)
# is reported so that it can be handled explicitly.
if len(int_cols+float_cols+categoric_cols)!=len(X.columns):
    print('There are still some types that you didn\'t take into account!')
    print(set(X.dtypes))

# Object columns cannot be fed to the network as they are, so they are dropped.
# NOTE(review): the remaining numeric columns may still contain NaN values,
# which are not handled anywhere in the visible cells — confirm.
X.drop(categoric_cols,axis=1,inplace=True)

In [10]:
# Split train and test
# Plain numpy arrays are used from here on, so that the code of the other
# models can be reused verbatim.
X = X.values
y = y.values

# 80/20 split with a fixed seed, so that the split is reproducible
trainX, testX, trainY, testY = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

train_size = trainX.shape[0]
test_size = testX.shape[0]


In [11]:
# Transform labels into vectors
#
# The binarizer is fitted on the complete label array `y` rather than on the
# training labels only. With a random split, a rare class can end up only in
# the test set; a binarizer fitted on the training labels encodes such a
# label as an all-zero row, which argmax() later reads as class 0.
lb = LabelBinarizer()
lb.fit(y)
trainY = lb.transform(trainY)
testY = lb.transform(testY)


In [12]:
# Define MLP
# Two sigmoid hidden layers followed by a softmax over the label classes.
n_features = trainX.shape[1]
n_classes = len(lb.classes_)

model = Sequential([
    Dense(1024, input_shape=(n_features,), activation="sigmoid"),
    Dense(512, activation="sigmoid"),
    Dense(n_classes, activation="softmax"),
])

# NOTE(review): nesterov=True presumably has no effect while momentum is left
# at its default value — confirm whether a momentum term was intended.
opt = keras.optimizers.SGD(lr=0.1, nesterov=True)
model.compile(
    optimizer=opt,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)



Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [None]:
# Train the model
# The held-out split is passed as validation data, so its loss and accuracy
# are reported after every epoch. Wall-clock training time is measured too.
fit_options = dict(
    batch_size=32,
    epochs=20,
    validation_data=(testX, testY),
)

start = time.time()
history = model.fit(trainX, trainY, **fit_options)
trainingTime = time.time() - start

print('Training took', trainingTime/60, 'minutes')


In [None]:
### evaluate the network
predictions = model.predict(testX, batch_size=32)

# One-hot rows / softmax rows -> integer class ids
true_ids = testY.argmax(axis=1)
predicted_ids = predictions.argmax(axis=1)

# Pass the full list of class ids explicitly. Without `labels`, the report
# only covers the classes that occur in the test labels or the predictions,
# which no longer matches `target_names` as soon as one class is absent
# from both (likely with many classes and a random split).
class_ids = list(range(len(lb.classes_)))
clrep = classification_report(
    true_ids,
    predicted_ids,
    labels=class_ids,
    target_names=lb.classes_,
)
print(clrep)
