In [72]:
import pandas as pd
import os
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

## Load data

In [5]:
data_path = 'raw_data'
file_list = os.listdir(data_path)
file_list = [file for file in file_list if '.csv' in file]
file_list

['open.csv',
 'close.csv',
 'one.csv',
 'hip_hop.csv',
 'four.csv',
 'two.csv',
 'stop.csv',
 'good.csv',
 'promise.csv',
 'three.csv',
 'ok.csv']

In [6]:
target_path = 'gestures/csv'
target_list = os.listdir(target_path)
target_list = [file for file in target_list if '.csv' in file]
target_list

['open.csv',
 'close.csv',
 'one.csv',
 'hip_hop.csv',
 'four.csv',
 'two.csv',
 'stop.csv',
 'good.csv',
 'promise.csv',
 'three.csv',
 'ok.csv']

In [7]:
assert len(file_list) == len(target_list)

In [8]:
# raw data
df_list = []
for file in file_list:
    df = pd.read_csv(os.path.join(data_path, file))
    # get label according to file name
    label = file.split('.')[0]
    df['label'] = label
    df_list.append(df)
raw_df = pd.concat(df_list, ignore_index=True)
raw_df = raw_df.drop(columns=['Unnamed: 0'])
raw_df.head()

Unnamed: 0,width_0_1,angle_0_1,width_1_2,angle_1_2,width_2_3,angle_2_3,width_0_4,angle_0_4,width_4_5,angle_4_5,...,angle_11_12,width_13_14,angle_13_14,width_14_15,angle_14_15,width_1_4,angle_1_4,width_4_7,angle_4_7,label
0,0.78513,-0.984254,0.0,-0.748731,0.0,-0.89258,0.993145,-0.263319,0.532146,-0.240884,...,0.332903,0.407166,0.464137,0.310414,0.372555,0.656727,0.646102,0.0,1.413172,open
1,0.779328,-0.972377,0.0,-0.75383,0.0,-0.901753,0.995146,-0.262813,0.517027,-0.238114,...,0.337878,0.397044,0.458153,0.300901,0.347386,0.648799,0.635982,0.0,1.420984,open
2,0.787007,-0.972377,0.0,-0.75383,0.0,-0.916226,0.99892,-0.259332,0.52578,-0.226799,...,0.349856,0.400956,0.458153,0.311944,0.371709,0.65419,0.646513,0.0,1.436985,open
3,0.776338,-0.974037,0.0,-0.75383,0.0,-0.927295,0.994023,-0.259332,0.522122,-0.217651,...,0.337878,0.398991,0.458153,0.308668,0.356912,0.651979,0.635982,0.0,1.418147,open
4,0.762687,-0.98725,0.0,-0.774371,0.0,-0.908637,0.991506,-0.259578,0.517312,-0.2295,...,0.332352,0.396828,0.437271,0.308653,0.347108,0.65986,0.617354,0.0,1.408343,open


In [9]:
# target data
target_df_list = []
for file in target_list:
    df = pd.read_csv(os.path.join(target_path, file))
    # get label according to file name
    label = file.split('.')[0]
    df['label'] = label
    target_df_list.append(df)
target_df = pd.concat(target_df_list, ignore_index=True)
target_df = target_df.drop(columns=['Unnamed: 0'])
target_df.head()

Unnamed: 0,width_0_1,angle_0_1,width_1_2,angle_1_2,width_2_3,angle_2_3,width_0_4,angle_0_4,width_4_5,angle_4_5,...,angle_11_12,width_13_14,angle_13_14,width_14_15,angle_14_15,width_1_4,angle_1_4,width_4_7,angle_4_7,label
0,0.82163,-0.909301,0.0,-0.648029,0.0,-0.769057,1.008399,-0.266002,0.559157,-0.223376,...,0.340993,0.430316,0.579263,0.299872,0.522321,0.609065,0.679544,0.0,1.466426,open
1,0.953761,-0.556436,0.0,0.508739,0.0,1.730683,1.09609,-0.246469,0.244587,0.39171,...,-2.721255,0.159099,-0.225796,0.256453,-2.692501,0.362849,0.705664,0.0,1.799885,close
2,0.771801,-0.88973,0.0,-0.428346,0.0,0.629149,1.015601,-0.179194,0.350393,-1.750452,...,2.419276,0.420891,-1.873194,0.231416,2.458795,0.660952,0.68509,0.0,1.551014,close
3,0.941237,-0.770269,0.0,0.349303,0.0,1.455474,1.073424,-0.284578,0.169356,-0.251125,...,-2.925351,0.160825,-0.942896,0.223347,-2.88998,0.502295,0.782517,0.0,1.686843,close
4,0.761992,-0.948033,0.0,0.753021,0.0,1.785298,1.033844,-0.287652,0.185841,-2.530864,...,-2.942204,0.261864,-2.509768,0.127047,-2.532791,0.636079,0.536967,0.0,1.542793,close


## Data Cleansing

In [10]:
raw_df.describe()

Unnamed: 0,width_0_1,angle_0_1,width_1_2,angle_1_2,width_2_3,angle_2_3,width_0_4,angle_0_4,width_4_5,angle_4_5,...,width_11_12,angle_11_12,width_13_14,angle_13_14,width_14_15,angle_14_15,width_1_4,angle_1_4,width_4_7,angle_4_7
count,5927.0,5927.0,5927.0,5927.0,5927.0,5927.0,5927.0,5927.0,5927.0,5927.0,...,5927.0,5927.0,5927.0,5927.0,5927.0,5927.0,5927.0,5927.0,5927.0,5927.0
mean,0.804824,-0.697155,0.0,-0.011591,0.0,0.402939,1.046458,-0.246301,0.488992,-0.111813,...,0.331023,-1.310043,0.333086,-0.196744,0.277862,-0.817372,0.51591,0.496215,0.0,1.587364
std,0.117905,0.26462,0.0,0.653644,0.0,1.01634,0.050366,0.059127,0.187976,0.716408,...,0.089888,1.738285,0.175863,1.233361,0.071068,1.640673,0.120783,0.35415,0.0,0.367385
min,0.380807,-1.13816,0.0,-1.163657,0.0,-1.48783,0.913379,-0.44671,0.025853,-3.127661,...,0.008131,-3.135235,0.005244,-3.141593,0.024064,-3.139602,0.062346,-1.310696,0.0,-3.029166
25%,0.734036,-0.881594,0.0,-0.675149,0.0,-0.767017,1.013414,-0.279163,0.371095,-0.275033,...,0.258266,-2.740179,0.158563,-0.931987,0.227442,-2.495441,0.434822,0.354376,0.0,1.490824
50%,0.789463,-0.75867,0.0,0.061968,0.0,0.722461,1.046522,-0.252247,0.522618,-0.166583,...,0.347581,-2.541385,0.391298,0.192701,0.291116,0.001365,0.535984,0.612602,0.0,1.595929
75%,0.865265,-0.581683,0.0,0.527515,0.0,1.218602,1.076898,-0.222427,0.584651,0.129245,...,0.393712,0.165149,0.451376,0.45946,0.326383,0.378891,0.607685,0.70006,0.0,1.736937
max,1.46623,0.900753,0.0,2.029885,0.0,2.100169,1.300875,0.110011,1.202917,3.113697,...,0.645405,3.140569,0.858733,3.136743,0.548021,3.139988,0.765481,2.051654,0.0,3.013749


In [11]:
# columns to remove
elim_cols = ["width_1_2", "width_2_3", "width_4_7"]

In [12]:
# remove columns with zero values
raw_df = raw_df.drop(columns=elim_cols)
raw_df

Unnamed: 0,width_0_1,angle_0_1,angle_1_2,angle_2_3,width_0_4,angle_0_4,width_4_5,angle_4_5,width_5_6,angle_5_6,...,width_11_12,angle_11_12,width_13_14,angle_13_14,width_14_15,angle_14_15,width_1_4,angle_1_4,angle_4_7,label
0,0.785130,-0.984254,-0.748731,-0.892580,0.993145,-0.263319,0.532146,-0.240884,0.346437,-0.222086,...,0.390217,0.332903,0.407166,0.464137,0.310414,0.372555,0.656727,0.646102,1.413172,open
1,0.779328,-0.972377,-0.753830,-0.901753,0.995146,-0.262813,0.517027,-0.238114,0.338664,-0.203045,...,0.382608,0.337878,0.397044,0.458153,0.300901,0.347386,0.648799,0.635982,1.420984,open
2,0.787007,-0.972377,-0.753830,-0.916226,0.998920,-0.259332,0.525780,-0.226799,0.342001,-0.203045,...,0.388039,0.349856,0.400956,0.458153,0.311944,0.371709,0.654190,0.646513,1.436985,open
3,0.776338,-0.974037,-0.753830,-0.927295,0.994023,-0.259332,0.522122,-0.217651,0.344186,-0.186223,...,0.384484,0.337878,0.398991,0.458153,0.308668,0.356912,0.651979,0.635982,1.418147,open
4,0.762687,-0.987250,-0.774371,-0.908637,0.991506,-0.259578,0.517312,-0.229500,0.339354,-0.198701,...,0.379844,0.332352,0.396828,0.437271,0.308653,0.347108,0.659860,0.617354,1.408343,open
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5922,0.735670,-0.837633,-0.687362,0.374721,0.960145,-0.211021,0.331888,-0.810695,0.289361,-2.620933,...,0.389653,0.150422,0.395994,0.368010,0.307903,0.279633,0.564602,0.658589,1.275603,ok
5923,0.744471,-0.846433,-0.692030,0.431264,0.960813,-0.217521,0.325304,-0.795732,0.286148,-2.618338,...,0.391834,0.118363,0.399696,0.376370,0.305340,0.295072,0.566147,0.666912,1.281010,ok
5924,0.740446,-0.841737,-0.656389,0.415061,0.963129,-0.217384,0.328174,-0.772148,0.280851,-2.600173,...,0.391315,0.129009,0.396567,0.360874,0.304936,0.305718,0.564508,0.656400,1.291656,ok
5925,0.735670,-0.837633,-0.675024,0.326077,0.961908,-0.216261,0.315890,-0.794129,0.279552,-2.606151,...,0.389506,0.123031,0.401173,0.364784,0.297367,0.285230,0.561898,0.650422,1.285679,ok


In [13]:
target_df = target_df.drop(columns=elim_cols)
target_df

Unnamed: 0,width_0_1,angle_0_1,angle_1_2,angle_2_3,width_0_4,angle_0_4,width_4_5,angle_4_5,width_5_6,angle_5_6,...,width_11_12,angle_11_12,width_13_14,angle_13_14,width_14_15,angle_14_15,width_1_4,angle_1_4,angle_4_7,label
0,0.821630,-0.909301,-0.648029,-0.769057,1.008399,-0.266002,0.559157,-0.223376,0.345909,-0.183198,...,0.374982,0.340993,0.430316,0.579263,0.299872,0.522321,0.609065,0.679544,1.466426,open
1,0.953761,-0.556436,0.508739,1.730683,1.096090,-0.246469,0.244587,0.391710,0.282748,-2.920200,...,0.314066,-2.721255,0.159099,-0.225796,0.256453,-2.692501,0.362849,0.705664,1.799885,close
2,0.771801,-0.889730,-0.428346,0.629149,1.015601,-0.179194,0.350393,-1.750452,0.252760,2.504965,...,0.254004,2.419276,0.420891,-1.873194,0.231416,2.458795,0.660952,0.685090,1.551014,close
3,0.941237,-0.770269,0.349303,1.455474,1.073424,-0.284578,0.169356,-0.251125,0.294492,3.083759,...,0.276544,-2.925351,0.160825,-0.942896,0.223347,-2.889980,0.502295,0.782517,1.686843,close
4,0.761992,-0.948033,0.753021,1.785298,1.033844,-0.287652,0.185841,-2.530864,0.169233,2.825019,...,0.119560,-2.942204,0.261864,-2.509768,0.127047,-2.532791,0.636079,0.536967,1.542793,close
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.795185,-0.831560,-0.384150,-0.569072,1.038140,-0.240734,0.527748,-0.278002,0.342055,-0.243396,...,0.283586,-3.122481,0.390891,-1.824266,0.209992,3.038452,0.582135,0.623897,1.603910,three
68,0.761955,0.516647,0.619945,0.649658,1.078433,0.040246,0.607884,0.182988,0.392545,0.068925,...,0.244001,-2.590497,0.510832,1.605524,0.218724,-2.330275,0.533734,-0.675068,-2.632175,three
69,0.753022,0.213506,0.829681,1.045382,1.114678,-0.073629,0.642161,0.096474,0.390611,-0.068675,...,0.223589,-2.597280,0.378402,1.326529,0.201894,-2.266641,0.446681,-0.571359,2.508167,three
70,0.737993,-0.820210,-0.755411,0.111254,0.958879,-0.189058,0.346865,-0.950777,0.308685,-2.516728,...,0.393493,0.117797,0.425249,0.412876,0.316598,0.345272,0.568110,0.686663,1.212471,ok


#### Combine data
Connect each row of the raw data with the target data based on their labels

In [14]:
labels = list(target_df['label'].unique())
labels

['open',
 'close',
 'one',
 'hip_hop',
 'four',
 'two',
 'stop',
 'good',
 'promise',
 'three',
 'ok']

In [15]:
centroids_columns = []
for col in target_df.columns:
    centroids_columns.append(f"c_{col}")
centroids_columns.pop() # remove the label column
centroids_columns

['c_width_0_1',
 'c_angle_0_1',
 'c_angle_1_2',
 'c_angle_2_3',
 'c_width_0_4',
 'c_angle_0_4',
 'c_width_4_5',
 'c_angle_4_5',
 'c_width_5_6',
 'c_angle_5_6',
 'c_width_7_8',
 'c_angle_7_8',
 'c_width_8_9',
 'c_angle_8_9',
 'c_width_10_11',
 'c_angle_10_11',
 'c_width_11_12',
 'c_angle_11_12',
 'c_width_13_14',
 'c_angle_13_14',
 'c_width_14_15',
 'c_angle_14_15',
 'c_width_1_4',
 'c_angle_1_4',
 'c_angle_4_7']

In [16]:
def compute_distance(feature1, feature2):
    return np.linalg.norm(feature1 - feature2, ord=2)

In [17]:
negative_df = target_df[target_df['label'] != 'ok']
negative_df = negative_df.sample(500, replace=True).reset_index(drop=True)
negative_df = negative_df.drop(columns='label')
negative_df.columns = centroids_columns
negative_df

Unnamed: 0,c_width_0_1,c_angle_0_1,c_angle_1_2,c_angle_2_3,c_width_0_4,c_angle_0_4,c_width_4_5,c_angle_4_5,c_width_5_6,c_angle_5_6,...,c_angle_10_11,c_width_11_12,c_angle_11_12,c_width_13_14,c_angle_13_14,c_width_14_15,c_angle_14_15,c_width_1_4,c_angle_1_4,c_angle_4_7
0,0.771801,-0.889730,-0.428346,0.629149,1.015601,-0.179194,0.350393,-1.750452,0.252760,2.504965,...,-1.927351,0.254004,2.419276,0.420891,-1.873194,0.231416,2.458795,0.660952,0.685090,1.551014
1,0.899508,-0.913656,-0.666600,-0.943812,1.063166,-0.300892,0.650828,-0.202650,0.384197,-0.171942,...,-0.301053,0.322134,-2.735372,0.141743,-0.835715,0.277213,-2.672248,0.613581,0.703100,1.624772
2,0.728466,-0.161615,0.698113,1.293707,1.098044,-0.208361,0.446705,1.359811,0.266158,-2.689194,...,1.323169,0.306565,-2.699857,0.574152,0.615532,0.354007,0.598786,0.417147,-0.231129,1.861842
3,0.969048,-0.853309,-0.755191,-1.079250,1.078677,-0.239127,0.205258,-0.723566,0.334824,2.978908,...,-1.306131,0.273717,3.058560,0.227241,-1.270898,0.231126,-3.090464,0.628371,0.856309,1.752990
4,0.753022,0.213506,0.829681,1.045382,1.114678,-0.073629,0.642161,0.096474,0.390611,-0.068675,...,1.351675,0.223589,-2.597280,0.378402,1.326529,0.201894,-2.266641,0.446681,-0.571359,2.508167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.984834,-0.951774,0.609513,1.193277,1.131225,-0.387017,0.067138,1.060480,0.288240,-3.046496,...,-2.612462,0.190312,-2.681680,0.109394,-1.941287,0.168554,-2.575912,0.606898,0.668434,1.681796
496,0.753022,0.213506,0.829681,1.045382,1.114678,-0.073629,0.642161,0.096474,0.390611,-0.068675,...,1.351675,0.223589,-2.597280,0.378402,1.326529,0.201894,-2.266641,0.446681,-0.571359,2.508167
497,0.795185,-0.831560,-0.384150,-0.569072,1.038140,-0.240734,0.527748,-0.278002,0.342055,-0.243396,...,-1.759643,0.283586,-3.122481,0.390891,-1.824266,0.209992,3.038452,0.582135,0.623897,1.603910
498,0.728466,-0.161615,0.698113,1.293707,1.098044,-0.208361,0.446705,1.359811,0.266158,-2.689194,...,1.323169,0.306565,-2.699857,0.574152,0.615532,0.354007,0.598786,0.417147,-0.231129,1.861842


In [18]:
combined_df_list = []
for label in labels:
    each_raw_df = raw_df[raw_df['label'] == label]
    each_raw_df = each_raw_df.drop(columns='label')

    each_target_df = target_df[target_df['label'] == label]
    each_target_df = each_target_df.drop(columns='label')
    # each_target_df.columns = centroids_columns

    # find true data
    result_target_list = []
    for _, row in each_raw_df.iterrows():
        f1 = np.array(row)

        # find the target
        min_distance = 100
        index = 0
        count = 0
        for _, row in each_target_df.iterrows():
            feat = np.array(row)
            distance = compute_distance(f1, feat)
            if distance < min_distance:
                min_distance = distance
                index = count
            count += 1
        result_target_list.append(np.array(each_target_df.iloc[index]))

    result_target_df = pd.DataFrame(result_target_list, columns=centroids_columns)

    # combine true data
    combined_true_df = result_target_df.join(each_raw_df.reset_index(drop=True), how='left')
    combined_true_df['target'] = 1

    # find false data - false centroid
    negative_df = target_df[target_df['label'] != label]
    negative_df = negative_df.sample(len(each_raw_df), replace=True).reset_index(drop=True)
    negative_df = negative_df.drop(columns='label')
    negative_df.columns = centroids_columns
    combined_false_df = negative_df.join(each_raw_df.reset_index(drop=True), how='left')
    combined_false_df['target'] = 0

    # combine true and false data
    combined_df = pd.concat([combined_true_df, combined_false_df])
    combined_df_list.append(combined_df)

main_df = pd.concat(combined_df_list)
main_df

Unnamed: 0,c_width_0_1,c_angle_0_1,c_angle_1_2,c_angle_2_3,c_width_0_4,c_angle_0_4,c_width_4_5,c_angle_4_5,c_width_5_6,c_angle_5_6,...,width_11_12,angle_11_12,width_13_14,angle_13_14,width_14_15,angle_14_15,width_1_4,angle_1_4,angle_4_7,target
0,0.821630,-0.909301,-0.648029,-0.769057,1.008399,-0.266002,0.559157,-0.223376,0.345909,-0.183198,...,0.390217,0.332903,0.407166,0.464137,0.310414,0.372555,0.656727,0.646102,1.413172,1
1,0.821630,-0.909301,-0.648029,-0.769057,1.008399,-0.266002,0.559157,-0.223376,0.345909,-0.183198,...,0.382608,0.337878,0.397044,0.458153,0.300901,0.347386,0.648799,0.635982,1.420984,1
2,0.821630,-0.909301,-0.648029,-0.769057,1.008399,-0.266002,0.559157,-0.223376,0.345909,-0.183198,...,0.388039,0.349856,0.400956,0.458153,0.311944,0.371709,0.654190,0.646513,1.436985,1
3,0.821630,-0.909301,-0.648029,-0.769057,1.008399,-0.266002,0.559157,-0.223376,0.345909,-0.183198,...,0.384484,0.337878,0.398991,0.458153,0.308668,0.356912,0.651979,0.635982,1.418147,1
4,0.821630,-0.909301,-0.648029,-0.769057,1.008399,-0.266002,0.559157,-0.223376,0.345909,-0.183198,...,0.379844,0.332352,0.396828,0.437271,0.308653,0.347108,0.659860,0.617354,1.408343,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,0.728466,-0.161615,0.698113,1.293707,1.098044,-0.208361,0.446705,1.359811,0.266158,-2.689194,...,0.389653,0.150422,0.395994,0.368010,0.307903,0.279633,0.564602,0.658589,1.275603,0
530,0.617407,-0.150124,0.583819,0.702035,1.000498,-0.159534,0.147218,1.334231,0.179728,3.056245,...,0.391834,0.118363,0.399696,0.376370,0.305340,0.295072,0.566147,0.666912,1.281010,0
531,0.571323,-0.274745,0.982801,1.324302,1.063060,-0.261711,0.375435,1.277392,0.207166,2.895977,...,0.391315,0.129009,0.396567,0.360874,0.304936,0.305718,0.564508,0.656400,1.291656,0
532,0.761992,-0.948033,0.753021,1.785298,1.033844,-0.287652,0.185841,-2.530864,0.169233,2.825019,...,0.389506,0.123031,0.401173,0.364784,0.297367,0.285230,0.561898,0.650422,1.285679,0


## Transform data
convert base and target feature to their differences. After experiment, this gave better result

In [19]:
raw_columns = list(raw_df.columns)
raw_columns.pop()
raw_columns

['width_0_1',
 'angle_0_1',
 'angle_1_2',
 'angle_2_3',
 'width_0_4',
 'angle_0_4',
 'width_4_5',
 'angle_4_5',
 'width_5_6',
 'angle_5_6',
 'width_7_8',
 'angle_7_8',
 'width_8_9',
 'angle_8_9',
 'width_10_11',
 'angle_10_11',
 'width_11_12',
 'angle_11_12',
 'width_13_14',
 'angle_13_14',
 'width_14_15',
 'angle_14_15',
 'width_1_4',
 'angle_1_4',
 'angle_4_7']

In [20]:
np.array(np.abs(main_df['width_0_1'] - main_df['c_width_0_1'])) * 10

array([0.36499643, 0.42301722, 0.3462361 , ..., 1.69123187, 0.26322035,
       0.55995084])

In [21]:
trans_dict = {}
for raw_col, cen_col in zip(raw_columns, centroids_columns):
    trans_dict[raw_col] = np.array(np.abs(main_df[raw_col] - main_df[cen_col]))
trans_dict['target'] = np.array(main_df['target'])
trans_df = pd.DataFrame(trans_dict)
trans_df

Unnamed: 0,width_0_1,angle_0_1,angle_1_2,angle_2_3,width_0_4,angle_0_4,width_4_5,angle_4_5,width_5_6,angle_5_6,...,width_11_12,angle_11_12,width_13_14,angle_13_14,width_14_15,angle_14_15,width_1_4,angle_1_4,angle_4_7,target
0,0.036500,0.074953,0.100702,0.123523,0.015253,0.002683,0.027011,0.017507,0.000529,0.038889,...,0.015235,0.008090,0.023150,0.115126,0.010541,0.149766,0.047662,0.033442,0.053254,1
1,0.042302,0.063077,0.105801,0.132697,0.013253,0.003189,0.042130,0.014737,0.007244,0.019847,...,0.007626,0.003115,0.033271,0.121110,0.001029,0.174935,0.039734,0.043561,0.045442,1
2,0.034624,0.063077,0.105801,0.147169,0.009479,0.006670,0.033377,0.003422,0.003908,0.019847,...,0.013056,0.008863,0.029360,0.121110,0.012072,0.150612,0.045125,0.033031,0.029441,1
3,0.045292,0.064737,0.105801,0.158239,0.014376,0.006670,0.037035,0.005726,0.001723,0.003025,...,0.009501,0.003115,0.031325,0.121110,0.008796,0.165409,0.042915,0.043561,0.048279,1
4,0.058943,0.077949,0.126343,0.139580,0.016892,0.006423,0.041845,0.006124,0.006555,0.015503,...,0.004862,0.008641,0.033488,0.141992,0.008781,0.175213,0.050796,0.062189,0.058083,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11849,0.007203,0.676018,1.385475,0.918986,0.137899,0.002659,0.114817,2.170506,0.023202,0.068261,...,0.083087,2.850279,0.178158,0.247522,0.046104,0.319152,0.147455,0.889718,0.586238,0
11850,0.127064,0.696309,1.275849,0.270771,0.039685,0.057987,0.178086,2.129963,0.106420,5.674583,...,0.120897,3.044046,0.144166,0.031188,0.024984,0.232025,0.182984,0.841610,0.213134,0
11851,0.169123,0.566992,1.639190,0.909242,0.099931,0.044327,0.047261,2.049540,0.073685,5.496150,...,0.126716,2.756402,0.196361,0.872890,0.146601,2.801049,0.065489,0.892971,0.378004,0
11852,0.026322,0.110400,1.428046,1.459221,0.071936,0.071391,0.130049,1.736735,0.110319,5.431170,...,0.269947,3.065236,0.139309,2.874553,0.170320,2.818022,0.074181,0.113455,0.257115,0


#### Split data

In [22]:
# x = main_df.values[:, :-1]
# y = main_df.values[:, -1]
x = trans_df.values[:, :-1]
y = trans_df.values[:, -1]

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=50)

## Train the model

In [41]:
pca = PCA(n_components=25)
pca.fit(x_train)

PCA(n_components=25)

In [42]:
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.fit_transform(x_test)

In [55]:
linear_model = LinearRegression()

In [56]:
linear_model.fit(x_train, y_train)

LinearRegression()

In [81]:
# l1_ratio : ratio of l1 in the elsticnet penalty (l2 = 1 - l1_ratio)
logistic_model = LogisticRegression(penalty='elasticnet', solver='saga', C=0.5, l1_ratio=0.5, max_iter=400, random_state=50)
# logistic_model = LogisticRegression(penalty='l2', C=0.5, max_iter=1000)

In [82]:
logistic_model.fit(x_train, y_train)

LogisticRegression(C=0.5, l1_ratio=0.5, max_iter=400, penalty='elasticnet',
                   random_state=50, solver='saga')

In [160]:
# random forest 
rf_model = RandomForestRegressor(max_depth=3, n_estimators=5, random_state=50)

In [161]:
rf_model.fit(x_train_pca, y_train)

RandomForestRegressor(max_depth=3, n_estimators=5, random_state=50)

In [176]:
# neural network
nn = Sequential([
    Dense(units=1, activation='sigmoid', input_shape=(25, 1)),
    Flatten(),
#     Dense(units=10, activation='sigmoid'),
    Dense(units=2, activation='softmax')
])

In [177]:
nn.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_23 (Dense)             (None, 25, 1)             2         
_________________________________________________________________
flatten_8 (Flatten)          (None, 25)                0         
_________________________________________________________________
dense_24 (Dense)             (None, 2)                 52        
Total params: 54
Trainable params: 54
Non-trainable params: 0
_________________________________________________________________


In [178]:
nn.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [179]:
x_nn_train = np.expand_dims(x_train, -1)
y_nn_train = np.array(list(map(lambda x: [1, 0] if x == 0 else [0, 1], y_train)))
y_nn_train

array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [1, 0],
       [1, 0],
       [0, 1]])

In [180]:
nn.fit(x=x_nn_train, y=y_nn_train, validation_split=0.2, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f98486d7520>

## Test the model

In [58]:
linear_model.score(x_test, y_test)

0.7990315766540133

In [59]:
linear_model.predict(x_test)

array([-0.20958001,  0.65252517,  0.92066486, ...,  0.86452008,
        0.95303408,  0.93408421])

In [83]:
logistic_model.score(x_test, y_test)

0.9966263705369693

In [125]:
logistic_model.predict(x_test)

array([0., 0., 1., ..., 1., 1., 1.])

In [162]:
rf_model.score(x_test_pca, y_test)

0.9363772785570305

In [158]:
rf_model.predict(x_test_pca)

array([4.17066686e-04, 4.17066686e-04, 9.88075229e-01, ...,
       9.88075229e-01, 9.88075229e-01, 9.88075229e-01])

In [181]:
x_nn_test = np.expand_dims(x_test, -1)
y_nn_test = np.array(list(map(lambda x: [1, 0] if x == 0 else [0, 1], y_test)))
nn.evaluate(x_nn_test, y_nn_test, verbose=1)



[0.04660976305603981, 0.9910036325454712]

## Save model

In [132]:
# linear model
with open('models/linear_comp.pickle', 'wb') as f:
    pickle.dump(linear_model, f)

In [117]:
# logistic model
with open('models/logistic_comp.pickle', 'wb') as f:
    pickle.dump(logistic_model, f)

In [163]:
# random forest
with open('models/random_forest_comp.pickle', 'wb') as f:
    pickle.dump(rf_model, f)
# save the pca transformer
with open('models/rf_pca.pickle', 'wb') as f:
    pickle.dump(pca, f)

In [182]:
# neural network
nn.save('models/nn_comp.h5')