## Import useful libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder

## Part1: Data Preparation

### Load Data Files

In [3]:
# Read the training and testing CSV files (in that order) from the
# notebook's working directory into the `train` and `test` dataframes.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

### Describe Train Data

In [4]:
# Preview the top of the training dataframe (first five rows).
train.head(n=5)

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [5]:
# Preview the bottom of the training dataframe (last five rows).
train.tail(n=5)

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
985,1575,Magnolia_Salicifolia,0.060547,0.11914,0.007812,0.003906,0.0,0.14844,0.017578,0.0,...,0.24219,0.0,0.03418,0.0,0.010742,0.0,0.0,0.0,0.0,0.018555
986,1578,Acer_Pictum,0.001953,0.003906,0.021484,0.10742,0.001953,0.0,0.0,0.0,...,0.1709,0.0,0.018555,0.0,0.011719,0.0,0.0,0.000977,0.0,0.021484
987,1581,Alnus_Maximowiczii,0.001953,0.003906,0.0,0.021484,0.078125,0.003906,0.007812,0.0,...,0.004883,0.000977,0.004883,0.027344,0.016602,0.007812,0.0,0.027344,0.0,0.001953
988,1582,Quercus_Rubra,0.0,0.0,0.046875,0.056641,0.009766,0.0,0.0,0.0,...,0.083008,0.030273,0.000977,0.00293,0.014648,0.0,0.041992,0.0,0.001953,0.00293
989,1584,Quercus_Afares,0.023438,0.019531,0.03125,0.015625,0.005859,0.019531,0.035156,0.0,...,0.0,0.0,0.00293,0.0,0.012695,0.0,0.0,0.023438,0.025391,0.022461


In [6]:
# Summary statistics of the training data: count, mean, std, min,
# quartiles and max. The default 25/50/75 percentiles are spelled out.
train.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
count,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,...,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0
mean,799.59596,0.017412,0.028539,0.031988,0.02328,0.014264,0.038579,0.019202,0.001083,0.007167,...,0.036501,0.005024,0.015944,0.011586,0.016108,0.014017,0.002688,0.020291,0.008989,0.01942
std,452.477568,0.019739,0.038855,0.025847,0.028411,0.01839,0.05203,0.017511,0.002743,0.008933,...,0.063403,0.019321,0.023214,0.02504,0.015335,0.060151,0.011415,0.03904,0.013791,0.022768
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,415.25,0.001953,0.001953,0.013672,0.005859,0.001953,0.0,0.005859,0.0,0.001953,...,0.0,0.0,0.000977,0.0,0.004883,0.0,0.0,0.0,0.0,0.000977
50%,802.5,0.009766,0.011719,0.025391,0.013672,0.007812,0.015625,0.015625,0.0,0.005859,...,0.004883,0.0,0.005859,0.000977,0.012695,0.0,0.0,0.003906,0.00293,0.011719
75%,1195.5,0.025391,0.041016,0.044922,0.029297,0.017578,0.056153,0.029297,0.0,0.007812,...,0.043701,0.0,0.022217,0.009766,0.021484,0.0,0.0,0.023438,0.012695,0.029297
max,1584.0,0.087891,0.20508,0.15625,0.16992,0.11133,0.31055,0.091797,0.03125,0.076172,...,0.42969,0.20215,0.17285,0.2002,0.10645,0.57813,0.15137,0.37598,0.086914,0.1416


In [17]:
# List the column labels of the training dataframe. The result is a pandas
# Index, which is abbreviated with "..." when displayed for a wide frame.
train.columns

Index(['id', 'species', 'margin1', 'margin2', 'margin3', 'margin4', 'margin5',
       'margin6', 'margin7', 'margin8',
       ...
       'texture55', 'texture56', 'texture57', 'texture58', 'texture59',
       'texture60', 'texture61', 'texture62', 'texture63', 'texture64'],
      dtype='object', length=194)

In [21]:
# Print a concise summary of the training dataframe: index range, number of
# columns, dtype counts and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Columns: 194 entries, id to texture64
dtypes: float64(192), int64(1), object(1)
memory usage: 1.5+ MB


In [8]:
# Dimensions of the training data as a (rows, columns) tuple.
len(train.index), len(train.columns)

(990, 194)

### Checking the Train data for missing or duplicate values

In [34]:
# Count the NaN entries per column once, then report the per-column counts
# followed by the grand total across the whole dataframe.
missing_per_column = train.isna().sum()
print("Number of missing values in each column in the training data is:\n")
print(missing_per_column)
print("\nThe sum of all missing values in all column in the training dataframe is:", missing_per_column.sum())

Number of missing values in each column in the training data is:

id           0
species      0
margin1      0
margin2      0
margin3      0
            ..
texture60    0
texture61    0
texture62    0
texture63    0
texture64    0
Length: 194, dtype: int64

The sum of all missing values in all column in the training dataframe is: 0


So, there are no missing values in any column of the _**Train Data**_.

In [38]:
# check duplicate values
