# Explanation for CNN 

In [55]:
'''
Author: Ding Shuya
Description: This notebook will give an explanation about CNN network data preprocessing way. 
'''

'\nAuthor: Ding Shuya\nDescription: This notebook will give an explanation about CNN network data preprocessing way. \n'

In [56]:
def feature_engineering_CNN(df, out_data_path, category, sample=60000):
    '''
    function:
    - aggregates 2 user defined functions that prepares dataframe for CNN modeling.
    - it also prints out how long it takes to run.

    input:
    - df = dataframe that was converted from raw_data json file
    - category = used to name output pickle file
    - sample = number of datapoints included in the final dataframe. 


    output:
    - pickled dataframe that will be used for CNN modeling (1176 features)
    - each row represents 42 by 28 pixel image
    file name: "./data/{}.pkl".format(category)
    '''

    start_time = time.time()
    #runs CNN feature engineering functions
    df_1 = CNN_feat_eng_pt1(df)
    df_2 = CNN_feat_eng_pt2(df_1)

    df_2.index = range(len(df_2))
    random_ind = np.random.choice(list(df_2.index), sample, replace=False)
    df_2 = df_2.loc[list(random_ind)]

    df_2.index = df_2['key_id']
    df_2.to_pickle(out_data_path + "{}.pkl".format(category))
    print("--- %s seconds ---" % (time.time() - start_time))
    return df_2
def _array_normalizer(array1,Xmin,Xmax,array_min):
    '''
    function:
        - normalize X,Y array by range of X
        - used in feature_eng_pt2
    input:
        array1 = array that you want to normalize (1D array or list)
        Xmin = minimum value of your X array (int)
        Xmax = maximum value of your X array (int)
        array_min = minimum value of array1

    output:
        normalized array of array1
    '''
    return (np.array(array1)-np.array([array_min]*len(array1)))/float(Xmax-Xmin)

## CNN_feat_eng_pt1

In [57]:
import os
root_path = '/export/home/di0002ya/quickdraw/cnn_base'
data_path = '/export/home/di0002ya/quickdraw/data/sy_data/quick_draw/'
out_data_path = '/export/home/di0002ya/quickdraw/data/sy_data/quick_draw_output/'
os.chdir(root_path)
from feature_engineering_func_ntu import *
import pandas as pd
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import numpy as np
np.random.seed(32113)
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.layers.convolutional import ZeroPadding2D
from keras.models import load_model
class_lists = ['cat', 'dog', 'lion', 'tiger']
sample = 6
# for classes in class_lists:
#     filepath = data_path + classes + '.ndjson'
#     df = pd.read_json(filepath, lines=True)
#     feature_engineering_CNN(df,out_data_path, classes, sample)

In [59]:
filepath = filepath = data_path + 'cat' + '.ndjson'
df = pd.read_json(filepath, lines=True)


In [63]:
df.columns

Index(['countrycode', 'drawing', 'key_id', 'recognized', 'timestamp', 'word'], dtype='object')

In [64]:
#Generation Stroke Number 
#Convert True and False 
#Filter Stoke < = 15 & 'Recognized'
df['stroke_number']=df['drawing'].str.len()
b_loon = {True: 1, False:0}
df['recognized'] = df['recognized'].map(b_loon)
df_cf = df[(df['recognized']==1) & (df['stroke_number'] <= 15)]

In [65]:
# Select ith 'drawing': timeinfo last timestamps of last strokes 
df_cf['final_time'] = [df_cf.loc[i,'drawing'][df_cf.stroke_number[i]-1][2][-1] for i in df_cf.index]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [67]:
X = {}
Y = {}
Ymax ={}
time = {}
ttnum_dp = {}
sumtimeps = {}

In [68]:
# Before diving into loop, let's see only one drawings. 
i = 0 
num = df_cf.loc[i,'stroke_number']
#store X,Y,time of the stroke in a temp list
Xt = [df_cf.loc[i,'drawing'][stroke][0] for stroke in range(num)]
Yt = [df_cf.loc[i,'drawing'][stroke][1] for stroke in range(num)]
tt = [df_cf.loc[i,'drawing'][stroke][2] for stroke in range(num)]

We could see that above functions convert drawings into 3 temp lists. 
If ith drawing has N strokes, then Xt, Yt, tt length will be N lists of list. 
Xt = [X1,...XN]
X1 = [x11,....x1T] if first stroke has T discrete points. 

In [73]:
# calculate the difference between final and initial time of a stroke
Tdifftemp = [(df_cf.loc[i,'drawing'][stroke][2][-1] - df_cf.loc[i,'drawing'][stroke][2][0])\
             for stroke in range(num)]
#Tdifftemp[t] = -tt[t][0]+tt[t][-1]

In [77]:
print(-tt[0][0]+tt[0][-1])

3095


In [78]:
print(Tdifftemp[0])

3095


In [79]:
# Xtemp/Ytemp = [item for stroke in Xt for item in stroke] Flatten all the X/Y info into one list
Xtemp = [item for stroke in Xt for item in stroke]
Ytemp = [item for stroke in Yt for item in stroke]

In [80]:
len(Xtemp)

190

In [81]:
# time = [item for stroke in time for item in stroke] Flatten all the time info of ith drawing into one list 
time[i] = [item for stroke in tt for item in stroke]

In [38]:
# Normalize Xtemp&Ytemp for ith drawing
Xmintemp = np.min(Xtemp)-10
Xmaxtemp = np.max(Xtemp)+10
Ymintemp = np.min(Ytemp)-10
Xnorm = _array_normalizer(Xtemp, Xmintemp,Xmaxtemp,Xmintemp)
Ynorm = _array_normalizer(Ytemp, Xmintemp,Xmaxtemp,Ymintemp)

In [40]:
# Store norm value into X/Y which stores all norm value for all drawings. 
Ymax[i] = np.max(Ynorm)
X[i] = Xnorm
Y[i] = Ynorm


In [82]:
ttnum_dp[i] = len(Ynorm)
sumtimeps[i] = sum(Tdifftemp)

Let us see the overall functions 

In [83]:
df_1 = CNN_feat_eng_pt1(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_cf['final_time'] = [df_cf.loc[i,'drawing'][df_cf.stroke_number[i]-1][2][-1] for i in df_cf.index]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_cf['total_number_of_datapoints'] = pd.Series(ttnum_dp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_cf['Ymax'] = pd.Series(Ymax)
A value is trying to be set on a copy of a slice from a DataFrame

## CNN_feat_eng_pt2

In [85]:
df_cf = df_1

In [86]:
orig_index = df_cf.index
df_cf.index = range(len(df_cf))
image_pile = np.zeros((len(df_cf),1176))

Let us see the 1st index of df_cf. 

In [88]:
ind = df_cf.index[0]

In [89]:
ind

0

In [92]:
xarray = np.around(np.array(df_cf.loc[ind,'X'])*28)

In [93]:
xarray

array([14., 14., 14., 13., 13., 13., 12., 12., 12., 12., 11., 11., 11.,
       11., 11., 11., 12., 12., 12., 12., 12., 11., 11., 10., 10.,  9.,
        9.,  8.,  7.,  7.,  6.,  6.,  6.,  5.,  5.,  6.,  6.,  7.,  8.,
        9., 10., 11., 12., 14., 16., 17., 19., 20., 22., 22., 23., 23.,
       23., 24., 23., 23., 23., 23., 22., 21., 21., 21., 20., 20., 20.,
       20., 20., 20., 20., 20., 20., 20., 20., 20., 19., 19., 19., 19.,
       19., 19., 19., 18., 18., 18., 18., 17., 17., 17., 17., 17., 17.,
       17., 17., 16., 16., 16., 15., 15., 15., 15., 14.,  8.,  8.,  7.,
        5.,  4.,  3.,  3.,  2.,  1.,  1.,  1.,  8.,  8.,  7.,  6.,  5.,
        4.,  3.,  2.,  1.,  1.,  0., 10.,  9.,  8.,  7.,  6.,  5.,  5.,
        4.,  4., 19., 20., 21., 22., 23., 24., 25., 26., 27., 27., 27.,
       27., 19., 20., 21., 23., 24., 26., 27., 28., 28., 19., 19., 20.,
       23., 23., 23., 23., 23., 23., 14., 13., 13., 12., 12., 12., 12.,
       12., 13., 13., 14., 15., 15., 15., 15., 15., 15., 15., 14

In [94]:
yarray = np.around(np.array(df_cf.loc[ind,'Y'])*42/float(df_cf.loc[ind,'Ymax']))

In [95]:
        xarray[xarray>=28.] = 27
        yarray[yarray>=42.] = 41

In [97]:
yarray

array([15., 15., 13., 12., 11., 10.,  9.,  8.,  7.,  7.,  7.,  6.,  7.,
        9., 10., 12., 14., 15., 16., 16., 16., 16., 16., 16., 17., 17.,
       17., 18., 18., 19., 21., 23., 24., 26., 27., 31., 32., 33., 35.,
       36., 37., 37., 38., 38., 38., 37., 36., 34., 32., 31., 29., 27.,
       25., 23., 20., 20., 19., 18., 17., 17., 16., 16., 16., 16., 15.,
       14., 14., 13., 12., 11., 10.,  9.,  8.,  6.,  5.,  3.,  3.,  1.,
        1.,  1.,  2.,  3.,  4.,  5.,  7.,  9., 11., 11., 12., 12., 13.,
       13., 14., 14., 14., 14., 15., 15., 15., 15., 15., 27., 27., 27.,
       26., 26., 26., 26., 26., 26., 26., 26., 32., 32., 32., 32., 32.,
       33., 33., 34., 34., 35., 35., 35., 36., 37., 38., 38., 40., 40.,
       41., 41., 27., 27., 27., 27., 27., 27., 27., 27., 27., 28., 28.,
       28., 30., 30., 31., 32., 32., 33., 34., 34., 34., 33., 34., 36.,
       40., 40., 41., 41., 41., 41., 26., 27., 27., 27., 28., 28., 29.,
       29., 30., 30., 30., 30., 30., 29., 29., 28., 27., 27., 27

In [104]:
len(df_cf.loc[ind,'X'])

190

In [None]:
df_final = CNN_feat_eng_pt2(df_cf)