In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler

In [2]:
pd.set_option('display.max_columns', 200)

In [3]:
path = 'C://Users//LENOVO//Desktop//Help_Me!//neobe_code_challenge.csv'

In [4]:
data = pd.read_csv(path, index_col = 'index')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 676 entries, 0 to 675
Data columns (total 4 columns):
user_id            676 non-null int64
expected_result    676 non-null float64
timestamp          676 non-null object
matrix             676 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 26.4+ KB


In [6]:
data.head()

Unnamed: 0_level_0,user_id,expected_result,timestamp,matrix
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,85,-1.0,2018-05-17 11:00:16,"[[0.4005973, 0.45082605, 0.49999896, 0.5503350..."
1,85,-1.0,2018-05-17 11:00:16,"[[3.9006763, 3.9500546, 4.0018487, 4.0503683, ..."
2,85,-1.0,2018-05-17 11:00:16,"[[5.9508233, 6.0001464, 6.050508, 6.100889, 6...."
3,85,-1.0,2018-05-17 11:00:01,"[[0.24993609, 0.30028215, 0.35154766, 0.400782..."
4,85,-1.0,2018-05-17 11:00:01,"[[2.4998322, 2.5501323, 2.6006367, 2.649971, 2..."


In [7]:
data.describe()

Unnamed: 0,user_id,expected_result
count,676.0,676.0
mean,77.177515,0.20858
std,12.016575,0.976456
min,56.0,-1.0
25%,83.0,-1.0
50%,83.0,1.0
75%,84.25,1.0
max,85.0,1.0


In [8]:
data.user_id.value_counts()

83    189
85    169
56    164
84    154
Name: user_id, dtype: int64

In [9]:
data.expected_result.value_counts()

 1.0    407
-1.0    266
 0.0      3
Name: expected_result, dtype: int64

In [10]:
data = data.drop(['timestamp'], axis=1)
data = data[data['expected_result']!=0]
data.reset_index(inplace=True)

In [11]:
val_group = data.groupby(by=['user_id','expected_result']).expected_result.agg({'group_sum':'sum'})
print (val_group)

                         group_sum
user_id expected_result           
56      -1.0                 -77.0
         1.0                  87.0
83      -1.0                 -55.0
         1.0                 131.0
84      -1.0                 -66.0
         1.0                  88.0
85      -1.0                 -68.0
         1.0                 101.0


is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


In [12]:
data_copy = data.copy()

In [13]:
"""
    Function to get the respective statistics{mean, max, min, median}
of each element(list={each list of column matrix}) of the matrix
column(input_attribute)
"""


def matrix_conv(input_attribute, apply_func):
    input_attribute = input_attribute.matrix
    input_split = input_attribute.split('],')
    result = []
    for element in input_split:
        input_replace = element.replace('[', '')
        input_change = input_replace.replace(']', '')
        input_update = input_change.split(',')
        input_modified = np.array([float(x) for x in input_update])
        result.append(apply_func(input_modified))
    return result


In [14]:
"""
    Create a single list containing mean of each dimension (total 27)
and then segregate them into individuals columns
"""


def add_stats_col(apply_func, col_initials,
                  matrix_conv=matrix_conv, dataframe=data_copy):

    dataframe[col_initials] = dataframe.apply(matrix_conv, axis=1,
                                              apply_func=apply_func)

    col_names = [col_initials + str(idx) for idx in range(1, 28)]

    return pd.DataFrame(dataframe[col_initials].values.tolist(),
                        columns=col_names)


In [15]:
data_mean = add_stats_col(np.mean, 'Array_mean')
data_min = add_stats_col(np.min, 'Array_min')
data_max = add_stats_col(np.max, 'Array_max')
data_median = add_stats_col(np.median, 'Array_median')


In [16]:
"""
    Create a temporary dataframe to capture the variance of each calculated
columns (27 * 4 {4 statistics})
"""


index_value = [x for x in range(1, 28)]
temp_mean = pd.DataFrame(data_mean.apply(np.var, axis=0), columns=['mean'])
temp_min = pd.DataFrame(data_min.apply(np.var, axis=0), columns=['min'])
temp_max = pd.DataFrame(data_max.apply(np.var, axis=0), columns=['max'])
temp_median = pd.DataFrame(data_median.apply(np.var, axis=0),
                           columns=['median'])
temp_mean.index = temp_median.index = temp_min.index \
                = temp_max.index = index_value


In [17]:
template = pd.concat([temp_mean['mean'],temp_median['median'],
                      temp_min['min'],temp_max['max']],axis=1, ignore_index=False)

In [18]:
drop_list_var = template[template.var(axis=1) < 0.1].index.tolist()
column_name = ['Array_min','Array_max','Array_mean','Array_median']

In [19]:
drop_list_var

[1, 2, 3, 4, 5, 6, 7, 8, 11, 17, 24, 25, 26, 27]

In [20]:
corr_data = data_mean.corr()

In [21]:
# Trying to check the correlation here to drop highly correlated columns.
corr_data[corr_data[corr_data.columns.values]>  0.8]
corr_data[corr_data[corr_data.columns.values]< -0.8]
# Inference after analyzing the correlation matrix (for both mean and median) we can conclude
# Array_mean6-Array_mean7
# Array_mean8-Array_mean12
# Array_mean9-Array_mean17-Array_mean24
# Array_mean22-Array_mean23
# Array_mean26-Array_mean27
# Which are already included above 

Unnamed: 0,Array_mean1,Array_mean2,Array_mean3,Array_mean4,Array_mean5,Array_mean6,Array_mean7,Array_mean8,Array_mean9,Array_mean10,Array_mean11,Array_mean12,Array_mean13,Array_mean14,Array_mean15,Array_mean16,Array_mean17,Array_mean18,Array_mean19,Array_mean20,Array_mean21,Array_mean22,Array_mean23,Array_mean24,Array_mean25,Array_mean26,Array_mean27
Array_mean1,,,,,,,,,,,,,,,,,,,,,,,,,,,
Array_mean2,,,,,,,,,,,,,,,,,,,,,,,,,,,
Array_mean3,,,,,,,,,,,,,,,,,,,,,,,,,,,
Array_mean4,,,,,,,,,,,,,,,,,,,,,,,,,,,
Array_mean5,,,,,,,,,,,,,,,,,,,,,,,,,,,
Array_mean6,,,,,,,,,,,,,,,,,,,,,,,,,,,
Array_mean7,,,,,,,,,,,,,,,,,,,,,,,,,,,
Array_mean8,,,,,,,,,,,,,,,,,,,,,,,,,,,
Array_mean9,,,,,,,,,,,,,,,,,-0.937715,,,,,,,-0.834215,,,
Array_mean10,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [22]:
"""
   Function to drop all the derived column having very low variance
(~0.1) between its mean, median, max and min.
"""


def get_drop_column(column_name, drop_list_var):
    drop_col = []
    for name in column_name:
        for indx in drop_list_var:
            drop_col.append(name+str(indx))
    return (drop_col)

In [23]:
final_drop_col = get_drop_column(column_name, drop_list_var)

In [24]:
df_club = pd.concat([data_mean, data_median, data_min, data_max],axis=1, ignore_index=False)
df_club = df_club.drop(final_drop_col, axis=1)
df_club = pd.concat([df_club,data[['expected_result', 'user_id']]] , axis=1)

In [25]:
"""
   Function to split the dataframe based on the target class(positive or
negative and return the feature and target for each class.
"""


def split_user(dataframe, user_id):
    df_user = dataframe[dataframe['user_id'] == user_id]
    user_pos = df_user[df_user['expected_result'] == 1]
    user_neg = df_user[df_user['expected_result'] == -1]
    x_train, y_train = user_pos.iloc[:, :-2], user_pos['expected_result']
    x_test, y_test = user_neg.iloc[:, :-2], user_neg['expected_result']
    return x_train, x_test, y_train, y_test

In [26]:
"""
   Function to apply scaling operation on the input features and return
the transformed scaled features.
"""


def scaler_func(x_train, x_test):
    col = [x_train.columns]
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(x_train)
    return (pd.DataFrame(scaler.transform(x_train), columns=col),
            pd.DataFrame(scaler.transform(x_test), columns=col))


In [27]:
"""
   Function to fit OneClassSVM on the positive class training set and
evaluate the results on negative class testing set and return the
calculated metrics.
"""


col_name = ['user_id', 'Train size', 'Train Error', 'Test size',
            'Test Error', 'False Acceptance %', 'False Rejection %']


def svm_class(user_id, x_train, x_test, y_train, y_test, gamma=0.71):
    clf = svm.OneClassSVM(nu=0.15, kernel="rbf", gamma=gamma)
    clf.fit(x_train)
    y_pred_pos = clf.predict(x_train)
    y_pred_neg = clf.predict(x_test)
    n_error_train = y_pred_pos[y_pred_pos == -1].size
    n_error_test = y_pred_neg[y_pred_neg == 1].size
    fap = float(n_error_test*100/y_test.size)
    frp = float(n_error_train*100/y_train.size)
    return ([user_id, y_train.size, n_error_train,
            y_test.size, n_error_test, fap, frp])


In [28]:
user_ids = np.array(np.unique(data.user_id)).astype(int)

In [29]:
"""
   Function to call scaler function and pass on the scaled feature
to fit into OneClassSVM and fetch the metric for each user.
"""


def get_metric(data_frame, user_ids):
    data_metric = pd.DataFrame()
    metric_list = []
    for user_id in user_ids:
        x_train, x_test, y_train, y_test = split_user(data_frame, user_id)
        scaled_train, scaled_test = scaler_func(x_train, x_test)
        metric_list.append(svm_class(user_id, scaled_train, scaled_test,
                                     y_train, y_test))

    return pd.DataFrame(metric_list, columns=col_name)


In [30]:
get_metric(df_club, user_ids)

Unnamed: 0,user_id,Train size,Train Error,Test size,Test Error,False Acceptance %,False Rejection %
0,56,87,23,77,0,0.0,26.0
1,83,131,28,55,0,0.0,21.0
2,84,88,19,66,1,1.0,21.0
3,85,101,14,68,0,0.0,13.0
