## Import useful libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
import random

## Part1: Data Preparation

### 1. Load Data Files

In [3]:
# load training data
train = pd.read_csv('train.csv')
# load testing data
test = pd.read_csv('test.csv')

### 1.1 Describe Data

In [4]:
# view first 5 rows of the train dataframe 
train.head()

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [5]:
# view last 5 rows of the train dataframe
train.tail()

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
985,1575,Magnolia_Salicifolia,0.060547,0.11914,0.007812,0.003906,0.0,0.14844,0.017578,0.0,...,0.24219,0.0,0.03418,0.0,0.010742,0.0,0.0,0.0,0.0,0.018555
986,1578,Acer_Pictum,0.001953,0.003906,0.021484,0.10742,0.001953,0.0,0.0,0.0,...,0.1709,0.0,0.018555,0.0,0.011719,0.0,0.0,0.000977,0.0,0.021484
987,1581,Alnus_Maximowiczii,0.001953,0.003906,0.0,0.021484,0.078125,0.003906,0.007812,0.0,...,0.004883,0.000977,0.004883,0.027344,0.016602,0.007812,0.0,0.027344,0.0,0.001953
988,1582,Quercus_Rubra,0.0,0.0,0.046875,0.056641,0.009766,0.0,0.0,0.0,...,0.083008,0.030273,0.000977,0.00293,0.014648,0.0,0.041992,0.0,0.001953,0.00293
989,1584,Quercus_Afares,0.023438,0.019531,0.03125,0.015625,0.005859,0.019531,0.035156,0.0,...,0.0,0.0,0.00293,0.0,0.012695,0.0,0.0,0.023438,0.025391,0.022461


In [6]:
# statistical analysis of the train data
train.describe()

Unnamed: 0,id,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
count,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,...,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0
mean,799.59596,0.017412,0.028539,0.031988,0.02328,0.014264,0.038579,0.019202,0.001083,0.007167,...,0.036501,0.005024,0.015944,0.011586,0.016108,0.014017,0.002688,0.020291,0.008989,0.01942
std,452.477568,0.019739,0.038855,0.025847,0.028411,0.01839,0.05203,0.017511,0.002743,0.008933,...,0.063403,0.019321,0.023214,0.02504,0.015335,0.060151,0.011415,0.03904,0.013791,0.022768
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,415.25,0.001953,0.001953,0.013672,0.005859,0.001953,0.0,0.005859,0.0,0.001953,...,0.0,0.0,0.000977,0.0,0.004883,0.0,0.0,0.0,0.0,0.000977
50%,802.5,0.009766,0.011719,0.025391,0.013672,0.007812,0.015625,0.015625,0.0,0.005859,...,0.004883,0.0,0.005859,0.000977,0.012695,0.0,0.0,0.003906,0.00293,0.011719
75%,1195.5,0.025391,0.041016,0.044922,0.029297,0.017578,0.056153,0.029297,0.0,0.007812,...,0.043701,0.0,0.022217,0.009766,0.021484,0.0,0.0,0.023438,0.012695,0.029297
max,1584.0,0.087891,0.20508,0.15625,0.16992,0.11133,0.31055,0.091797,0.03125,0.076172,...,0.42969,0.20215,0.17285,0.2002,0.10645,0.57813,0.15137,0.37598,0.086914,0.1416


In [7]:
# columns of the train data
train.columns

Index(['id', 'species', 'margin1', 'margin2', 'margin3', 'margin4', 'margin5',
       'margin6', 'margin7', 'margin8',
       ...
       'texture55', 'texture56', 'texture57', 'texture58', 'texture59',
       'texture60', 'texture61', 'texture62', 'texture63', 'texture64'],
      dtype='object', length=194)

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Columns: 194 entries, id to texture64
dtypes: float64(192), int64(1), object(1)
memory usage: 1.5+ MB


In [9]:
# shape of the data
train.shape

(990, 194)

In [10]:
# view first 5 rows of the test dataframe 
test.head()

Unnamed: 0,id,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,4,0.019531,0.009766,0.078125,0.011719,0.003906,0.015625,0.005859,0.0,0.005859,...,0.006836,0.0,0.015625,0.000977,0.015625,0.0,0.0,0.0,0.003906,0.053711
1,7,0.007812,0.005859,0.064453,0.009766,0.003906,0.013672,0.007812,0.0,0.033203,...,0.0,0.0,0.006836,0.001953,0.013672,0.0,0.0,0.000977,0.037109,0.044922
2,9,0.0,0.0,0.001953,0.021484,0.041016,0.0,0.023438,0.0,0.011719,...,0.12891,0.0,0.000977,0.0,0.0,0.0,0.0,0.015625,0.0,0.0
3,12,0.0,0.0,0.009766,0.011719,0.017578,0.0,0.003906,0.0,0.003906,...,0.012695,0.015625,0.00293,0.036133,0.013672,0.0,0.0,0.089844,0.0,0.008789
4,13,0.001953,0.0,0.015625,0.009766,0.039062,0.0,0.009766,0.0,0.005859,...,0.0,0.042969,0.016602,0.010742,0.041016,0.0,0.0,0.007812,0.009766,0.007812


In [11]:
# view last 5 rows of the test dataframe
test.tail()

Unnamed: 0,id,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
589,1576,0.0,0.0,0.003906,0.015625,0.041016,0.0,0.017578,0.0,0.005859,...,0.098633,0.0,0.004883,0.0,0.003906,0.0,0.0,0.018555,0.0,0.000977
590,1577,0.0,0.003906,0.003906,0.005859,0.017578,0.0,0.017578,0.005859,0.0,...,0.012695,0.004883,0.004883,0.00293,0.009766,0.0,0.0,0.09082,0.0,0.016602
591,1579,0.017578,0.029297,0.015625,0.013672,0.003906,0.015625,0.025391,0.0,0.0,...,0.073242,0.0,0.02832,0.0,0.001953,0.0,0.0,0.0,0.042969,0.006836
592,1580,0.013672,0.009766,0.060547,0.025391,0.035156,0.025391,0.039062,0.0,0.003906,...,0.003906,0.0,0.000977,0.0,0.011719,0.0,0.0,0.0,0.011719,0.018555
593,1583,0.0,0.11719,0.0,0.019531,0.0,0.13672,0.001953,0.005859,0.0,...,0.10742,0.012695,0.016602,0.000977,0.004883,0.0,0.0,0.015625,0.0,0.017578


In [12]:
# statistical analysis of the test data
test.describe()

Unnamed: 0,id,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
count,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,...,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0
mean,780.673401,0.017562,0.028425,0.031858,0.022556,0.014527,0.037497,0.019222,0.001085,0.007092,...,0.035291,0.005923,0.015033,0.011762,0.015881,0.011217,0.002617,0.019975,0.009389,0.02097
std,465.646977,0.019585,0.038351,0.025719,0.028797,0.018029,0.051372,0.017122,0.002697,0.009515,...,0.064482,0.026934,0.022318,0.024771,0.014898,0.05253,0.011204,0.034704,0.013457,0.023407
min,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,368.5,0.001953,0.001953,0.013672,0.005859,0.001953,0.0,0.005859,0.0,0.001953,...,0.0,0.0,0.000977,0.0,0.004883,0.0,0.0,0.0,0.0,0.000977
50%,774.0,0.009766,0.010743,0.023438,0.013672,0.007812,0.013672,0.015625,0.0,0.005859,...,0.003906,0.0,0.005859,0.001953,0.012695,0.0,0.0,0.003418,0.00293,0.013184
75%,1184.5,0.028809,0.041016,0.042969,0.027344,0.019531,0.056641,0.029297,0.0,0.007812,...,0.038086,0.0,0.019531,0.010498,0.022461,0.0,0.0,0.022461,0.014648,0.032227
max,1583.0,0.085938,0.18945,0.16797,0.16406,0.09375,0.27148,0.087891,0.021484,0.083984,...,0.35352,0.44141,0.15332,0.17773,0.083984,0.60645,0.12305,0.24707,0.086914,0.14941


In [13]:
# columns of the test data
test.columns

Index(['id', 'margin1', 'margin2', 'margin3', 'margin4', 'margin5', 'margin6',
       'margin7', 'margin8', 'margin9',
       ...
       'texture55', 'texture56', 'texture57', 'texture58', 'texture59',
       'texture60', 'texture61', 'texture62', 'texture63', 'texture64'],
      dtype='object', length=193)

In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594 entries, 0 to 593
Columns: 193 entries, id to texture64
dtypes: float64(192), int64(1)
memory usage: 895.8 KB


In [15]:
# shape of the data
test.shape

(594, 193)

### 1.2 Clean Data
Train data is cleaned and don't need to apply any cleaning approaches to it

Test data also is cleaned and don't need to apply any cleaning approaches to it

### 1.3 Checking the Data for missing or duplicate values

In [18]:
# check missing values for train data
print("Number of missing values in each column in the training data is:\n")
print(train.isna().sum())
print("\nThe sum of all missing values in all column in the training dataframe is:",train.isna().sum().sum())

Number of missing values in each column in the training data is:

id           0
species      0
margin1      0
margin2      0
margin3      0
            ..
texture60    0
texture61    0
texture62    0
texture63    0
texture64    0
Length: 194, dtype: int64

The sum of all missing values in all column in the training dataframe is: 0


So, there is no missing values in any column in the _**Train Data**_

In [19]:
# check duplicate values for train data
print('Number of duplicated rows in the training data is:',train.duplicated().sum())

Number of duplicated rows in the training data is: 0


So, there is no duplicated values in any column in the _**Train Data**_

In [16]:
# check missing values for train data
print("Number of missing values in each column in the testing data is:\n")
print(train.isna().sum())
print("\nThe sum of all missing values in all column in the testing dataframe is:",test.isna().sum().sum())

Number of missing values in each column in the testing data is:

id           0
species      0
margin1      0
margin2      0
margin3      0
            ..
texture60    0
texture61    0
texture62    0
texture63    0
texture64    0
Length: 194, dtype: int64

The sum of all missing values in all column in the testing dataframe is: 0


So, there is no missing values in any column in the _**Test Data**_

In [17]:
# check duplicate values for train data
print('Number of duplicated rows in the testing data is:',test.duplicated().sum())

Number of duplicated rows in the testing data is: 0


So, there is no duplicated values in any column in the _**Test Data**_

### 1.4 Visualize the  Train data using proper visualization methods.

### 1.5 Drawing of some  Images

In [48]:
random_ids = [random.randint(0,(train.shape[0] + test.shape[0] + 1)) for i in range(8)]


### 1.6 Carry out required correlation analysis

### 2. Divide the data into a training and test set using approximately 80% for training
Data is divided already in kaggle into a **Train** and **Test** data so we don't need to apply any divide process here 

### 3. Standardize the data, by computing the mean and standard deviation for each feature dimension using the training set only, then subtracting the mean and dividing by the stdev for each feature and each sample.

### 4. Encode the labels

## Part2: Training a neural network