In [1]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import preprocessing
import numpy as np
import os
from random import randint
import pickle



In [2]:
test = pd.read_csv('/Users/matthewschultz/Big_Data_Lab/identity_review/csv_output/test_data.csv')

def create_equal_representation(df):
    malicious = df.loc[df['output'] == 1]

    non_mal = df.loc[df['output'] == 0]
    non_mal_sample = non_mal.sample(len(malicious))

    return pd.concat([malicious, non_mal_sample], ignore_index= True)

equalized_df = create_equal_representation(test)



In [15]:
# add in the pickle functions here 
def normalize(df):
    scalar = preprocessing.Normalizer(norm="l2").fit(df[['num_of_mal_trans', 'ratio']])
    pickle.dump(scalar, open("/Users/matthewschultz/Big_Data_Lab/identity_review/mlp_normalizer/scalar_mlp.save", "wb"))
    scaled = scalar.transform(df[['num_of_mal_trans', 'ratio']])
    temp = df[['value_out', 'value_out', 'avg_trans', 'num_of_zero', 'output']].to_numpy()

    combined = np.concatenate((scaled, temp), axis = 1)

    normalized = pd.DataFrame(combined)
    normalized = normalized.rename(columns = {
        0: 'num_of_mal_trans',
        1: 'ratio',
        2: 'value_out',
        3: 'value_in',
        4: 'avg_trans',
        5: 'num_of_zero',
        6: 'output'
    })
    return normalized

normalzed_df = normalize(equalized_df)

I am looking to run a test that compares the normalized data to the non-normaized data to see which data creates a better model. Then we go into pruning and other validation test. This is esstentially which model will have better results when running the test first.

In [4]:
X_non_normalized = equalized_df[['num_of_mal_trans', 'ratio', 'value_out', 'value_in', 'avg_trans', 'num_of_zero']].to_numpy()
y_non_normalized = equalized_df['output'].to_numpy()

X_normalized = normalzed_df[['num_of_mal_trans', 'ratio', 'value_out', 'value_in', 'avg_trans', 'num_of_zero']].to_numpy()
y_normalized = normalzed_df['output'].to_numpy()


def ensure_equal(x_df, y_df):
    X_train, X_test, y_train, y_test = train_test_split(x_df, y_df, train_size=0.65, random_state = randint(1, 300))
    if len(np.where(y_train == 1)[0]) / len(y_train) < 0.5:
        return ensure_equal(x_df, y_df)
    else:
        return X_train, X_test, y_train, y_test



X_non_train, X_non_test, y_non_train, y_non_test = ensure_equal(X_non_normalized, y_non_normalized)
X_norm_train, X_norm_test, y_norm_train, y_norm_test = ensure_equal(X_normalized, y_normalized)

Test for the non-normalized data. Set activation to logitic because this is a binary classification model

In [5]:
clf_non = MLPClassifier(hidden_layer_sizes = (10,), 
                        activation = 'logistic',
                        solver = 'sgd',
                        learning_rate = 'adaptive',
                        max_iter = 2000,
                        random_state=1)
cros_val_non = cross_val_score(clf_non, X_non_normalized, y_non_normalized, cv=4)

fitted_non = clf_non.fit(X_non_train, y_non_train)

predict_non = fitted_non.predict(X_non_test).tolist()

conf_matrix_non = confusion_matrix(y_non_test, predict_non)

print(cros_val_non)
print(conf_matrix_non)

[0.5        0.71428571 0.77777778 0.55555556]
[[ 1 19]
 [ 1 18]]


This will be for the normalized data

In [13]:
clf_norm = MLPClassifier(hidden_layer_sizes = (13,), 
                        activation = 'logistic',
                        solver = 'sgd',
                        learning_rate = 'adaptive',
                        max_iter = 2000,
                        random_state=1)
cros_val_norm = cross_val_score(clf_norm, X_normalized, y_normalized, cv=10)

fitted_norm = clf_norm.fit(X_norm_train, y_norm_train)

predict_norm = fitted_norm.predict(X_norm_test).tolist()

conf_matrix_norm = confusion_matrix(y_norm_test, predict_norm)

print(cros_val_norm)
print(conf_matrix_norm)

[0.81818182 0.81818182 0.90909091 1.         0.81818182 0.90909091
 1.         0.81818182 0.81818182 0.81818182]
[[17  4]
 [ 1 17]]


After running the test, I have concluded that the normalized version of the model is better optimized for the data set. It is important to note that the unnormalized model was performing better until I added the value in and out features. However, I think that it is important to include these features because it helps visualize the disparity between non-malicious and malicious accounts. For example, most non-malicious accounts will have a very similar input and output amounts making it easier to identify.

I want to find the optimal number of nodes in the hidden layer.

In [12]:
def find_optimal_model(X, y):
    number_of_nodes = range(5,16)
    list_of_df = []
    for i in number_of_nodes:
        print(i)
        clf_norm = MLPClassifier(hidden_layer_sizes = (i,), 
            activation = 'logistic',
            solver = 'sgd',
            learning_rate = 'adaptive',
            max_iter = 2500,
            random_state=1)
        average_cross = np.mean(cross_val_score(clf_norm, X, y, cv=10)) 
        list_of_false_positives = []
        list_of_false_negatives = [] 
        for k in range(0, 29):
            X_norm_train, X_norm_test, y_norm_train, y_norm_test = ensure_equal(X, y)
            fitted_norm = clf_norm.fit(X_norm_train, y_norm_train)
            predict_norm = fitted_norm.predict(X_norm_test).tolist()
            conf_matrix_norm = confusion_matrix(y_norm_test, predict_norm)
            list_of_false_positives.append(conf_matrix_norm[0, 1])
            list_of_false_negatives.append(conf_matrix_norm[1, 0])
        list_of_df.append(pd.DataFrame({'num_hidden': [i],
                                        'average_cross': [average_cross],
                                        'average_pos': [sum(list_of_false_positives)/len(list_of_false_positives)],
                                        'average_neg': [sum(list_of_false_negatives)/len(list_of_false_negatives)]}))
    final = pd.concat(list_of_df)
    return final


            

temp = find_optimal_model(X_normalized, y_normalized)
print(temp)

    


5
6
7
8
9
10
11
12
13
14
15
   num_hidden  average_cross  average_pos  average_neg
0           5       0.809091     9.965517     1.172414
0           6       0.809091     7.655172     0.310345
0           7       0.872727     3.034483     3.068966
0           8       0.790909     6.103448     1.482759
0           9       0.818182     7.896552     0.310345
0          10       0.781818    11.206897     0.275862
0          11       0.800000     8.517241     0.482759
0          12       0.763636     7.310345     0.448276
0          13       0.872727     5.068966     0.862069
0          14       0.809091     8.275862     0.241379
0          15       0.863636     5.862069     0.344828


In [16]:
def save_trained_model(model, path):
    list_files = os.listdir(path)
    files = [file for file in list_files if file != '.DS_Store']
    if len(files) == 0:
        file_name = path + "/mlp_1.sav"
        pickle.dump(model, open(file_name, 'wb'))
    else:
        file_number = int(files[-1].split("_")[2].split(".")[0]) + 1
        file_name = path + "/mlp_{}.sav".format(file_number)
        pickle.dump(model, open(file_name, 'wb'))
        

path_name = '/Users/matthewschultz/Big_Data_Lab/identity_review/mlp_iteration'
save_trained_model(clf_norm, path_name)

