In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier

#from tensorflow.keras.layers import *
#from tensorflow.keras.models import Model
#from tensorflow.keras.callbacks import EarlyStopping


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

from tqdm import tqdm

import os
import datetime

pd.options.mode.chained_assignment = None

In [None]:
# ---------------------------------------------------------------------------
# Run configuration. Every tunable value used by the cells below lives here.
# ---------------------------------------------------------------------------

# Date stamp (dd-mm-YYYY) embedded in the statistics file name.
today = datetime.date.today()
d1 = today.strftime("%d-%m-%Y")

# --- File locations --------------------------------------------------------
RAW_DATA_PATH = 'FraudedRawData'                 # directory listed by load_dataset
PARTIAL_LABELS_PATH = 'challengeToFill.csv'      # labels file read by predict_segments
submission_predictions_filepath = '207632118_205644941.csv'   # predictions CSV written at the end
LABELS_OUTPUT_PATH = 'submission_{}_statistics.csv'.format(d1)  # per-user evaluation metrics

# --- Data layout -----------------------------------------------------------
num_of_segments = 150            # segments read per user file
num_of_cmds_per_segment = 100    # lines (commands) read per segment
num_train_segments = 50          # segments with segmentID below this are used for training
total_num_users_to_train = 40    # number of distinct users predict_segments expects to find
last_idx_usr_train = 9           # users with index <= this are scored against the labels file

# --- Prediction post-processing -------------------------------------------
MAX_NUM_OF_MALICIOUS = 10        # upper bound on segments flagged as malicious per user

# --- Feature extraction ----------------------------------------------------
max_num_of_ngrams = 10           # upper end of the n-gram range given to the vectorizer
svd_flag = True                  # reduce the vectorized matrix with TruncatedSVD
num_of_features_svd = 100        # number of SVD components
use_avg_statistics = True        # append command-length statistics to the features

# --- Classifier hyper-parameters ------------------------------------------
hidden_layer_sizes = (512, 256, 128, 64, 32, 16, 8, 4, 2)
max_num_of_iterations = 5000
alpha = 1e-4
random_state = 21
loss_tolerance = 1e-9

# Names of the classifiers combined by the voting ensemble.
chosen_classifiers = ['MLPClassifier', 'GradientBoostingClassifier', 'DecisionTreeClassifier']

In [None]:
def results_predict(user_id, predicts_data, labels, file_path):
    '''
    Score one user's binary predictions and append the metrics to a CSV file.
    @param user_id: identifier stored in the 'userID' column of the output row
    @param predicts_data: predicted labels (0/1), one per segment
    @param labels: ground-truth labels (0/1), same length as predicts_data
    @param file_path: CSV file to append the row to; created when it does not exist
    @return: None (the result is written to file_path and a status line is printed)
    '''
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(labels, predicts_data)

    # precision: tp / (tp + fp)
    precision = precision_score(labels, predicts_data)

    # recall: tp / (tp + fn)
    recall = recall_score(labels, predicts_data)

    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(labels, predicts_data)

    # roc_auc_score can raise ValueError (e.g. when `labels` holds a single class);
    # record NaN for that user instead of aborting the whole run.
    try:
        auc = roc_auc_score(labels, predicts_data)
    except ValueError:
        auc = float('nan')

    # Pin the class order so the matrix is always 2x2, even when only one class
    # appears in both labels and predictions.
    TN, FP, FN, TP = confusion_matrix(labels, predicts_data, labels=[0, 1]).ravel()

    dict_results_predict = {'userID': user_id,
                            'date': datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                            'accuracy': accuracy,
                            'TP': TP,
                            'TN': TN,
                            'FP': FP,
                            'FN': FN,
                            'precision': precision,
                            'recall': recall,
                            'f1': f1,
                            'AUC': auc
                            }

    current_results_predict = pd.DataFrame([dict_results_predict])

    try:
        all_results = pd.read_csv(file_path, index_col=False)
    except FileNotFoundError:
        current_results_predict.to_csv(file_path, index=False)
        print('new .csv has been created')
    else:
        # pd.concat replaces DataFrame.append, which newer pandas no longer provides.
        all_results = pd.concat([all_results, current_results_predict], ignore_index=True)
        all_results.to_csv(file_path, index=False)
        print('added results to .csv, file shape is {}'.format(all_results.shape))

In [None]:
def DataScaler(data):
    '''
    Fit a new StandardScaler on the data and return the scaled result.
    The scaler is created with with_mean=False, so no mean-centering is requested.
    @param data: feature matrix to scale
    @return: whatever StandardScaler.fit_transform returns for the data
    '''
    return StandardScaler(with_mean=False).fit_transform(data)


def tfidf_vectorization_with_optional_svd(list_of_strings, index, ngram_range=(1, 1), svd_flag=True,
                                          n_features=num_of_features_svd):
    '''
    Vectorize the strings with TfidfVectorizer (use_idf=False, norm=None), scale the
    result with DataScaler and optionally reduce it with TruncatedSVD.
    @param list_of_strings: iterable of strings to vectorize (one per row)
    @param index: iterable of row labels to attach to the returned DataFrame
    @param ngram_range: range of ngrams to use
    @param svd_flag: if to apply svd or not
    @param n_features: number of SVD components (used only when svd_flag is True)
    @return df: vectorized pandas DataFrame; dense with integer column labels when
                svd_flag is True, sparse-backed with one column per n-gram otherwise
    '''
    # Materialise the inputs: callers pass dict views, and explicit lists are
    # accepted unambiguously by both scikit-learn and pandas.
    documents = list(list_of_strings)
    row_labels = list(index)

    tfidf = TfidfVectorizer(use_idf=False, norm=None, ngram_range=ngram_range)
    scaled_matrix = DataScaler(tfidf.fit_transform(documents))

    if svd_flag:
        svd = TruncatedSVD(n_components=n_features, n_iter=10, random_state=356)
        transformed = svd.fit_transform(scaled_matrix)
        return pd.DataFrame(transformed, index=row_labels)

    # get_feature_names_out is the current scikit-learn name; fall back to the
    # older get_feature_names only when the new one is not available.
    get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names

    # The scaled matrix is scipy-sparse and cannot be handed to the plain
    # DataFrame constructor; build a sparse-backed frame instead of densifying.
    return pd.DataFrame.sparse.from_spmatrix(scaled_matrix, index=row_labels, columns=get_names())


def calc_statistics(transformed_df, segmented_data_dict, mode='train', calc_avg_of_statistic=False):
    '''
    Add command-length statistics for every segment row of transformed_df.
    Columns added (in this order):
    # -> 'avg_segment'    = mean length of the commands in the segment
    # -> 'median_segment' = median length of the commands in the segment
    # -> 'std_segment'    = standard deviation of the command lengths in the segment
    When calc_avg_of_statistic is truthy, each of them is followed by its per-user mean
    ('total_avg', 'total_median_avg', 'total_std_avg'), repeated on every row of that user.
    A segment without any command gets 0 for all three statistics.
    @param transformed_df: DataFrame with 'userID' and 'segmentID' columns; modified in place
    @param segmented_data_dict: maps '<userID>_<segmentID>' to a space separated command string
    @param mode: kept for backward compatibility; the per-user means are now derived from
                 the rows themselves, so the value is no longer needed
    @param calc_avg_of_statistic: also add the per-user mean of every statistic
    @return: transformed_df with the new columns
    '''

    def segment_cmd_lengths(user_id, segment_id):
        # Length of every command in the segment identified by the two ids.
        key = str(int(user_id)) + "_" + str(int(segment_id))
        return [len(cmd) for cmd in segmented_data_dict[key].split()]

    # Split every segment once and reuse the lengths for all three statistics.
    lengths_per_row = [segment_cmd_lengths(user_id, segment_id)
                       for user_id, segment_id in zip(transformed_df['userID'], transformed_df['segmentID'])]

    statistics = (
        ('avg_segment', 'total_avg', lambda lengths: sum(lengths) / len(lengths)),
        ('median_segment', 'total_median_avg', np.median),
        ('std_segment', 'total_std_avg', np.std),
    )

    for segment_column, user_mean_column, statistic in statistics:
        transformed_df.loc[:, segment_column] = [statistic(lengths) if lengths else 0
                                                 for lengths in lengths_per_row]
        if calc_avg_of_statistic:
            # transform('mean') broadcasts each user's mean back onto that user's rows,
            # whatever the row order and however many segments each user has.
            per_user_mean = transformed_df.groupby('userID')[segment_column].transform('mean')
            transformed_df.loc[:, user_mean_column] = per_user_mean.to_numpy()

    return transformed_df


def _build_classifier(model_name):
    '''Create an unfitted classifier for one name taken from chosen_classifiers.'''
    if model_name == 'MLPClassifier':
        return MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, max_iter=max_num_of_iterations,
                             alpha=alpha,
                             solver='adam', verbose=False, random_state=random_state, tol=loss_tolerance)
    if model_name == 'GradientBoostingClassifier':
        return GradientBoostingClassifier(verbose=False, random_state=random_state, tol=loss_tolerance)
    if model_name == 'DecisionTreeClassifier':
        return DecisionTreeClassifier(criterion='gini', random_state=random_state)
    raise ValueError("Unknown classifier name: " + str(model_name))


def _feature_matrix(segments_df):
    '''Drop the id columns and give every remaining feature column a string label.'''
    features = segments_df.drop(['userID', 'segmentID'], axis=1)
    # The SVD columns are labelled with ints while the statistics columns are strings;
    # recent scikit-learn releases reject frames that mix the two kinds of label.
    features.columns = features.columns.astype(str)
    return features


def _limit_malicious_predictions(classifier, features, predictions):
    '''Keep at most MAX_NUM_OF_MALICIOUS positive predictions (the least benign-looking ones).'''
    if sum(predictions) <= MAX_NUM_OF_MALICIOUS:
        return np.asarray(predictions).astype(int)

    result_probs = classifier.predict_proba(features)
    # Column 0 is the probability of class 0 (benign): the lowest values are the
    # segments the ensemble is most convinced are malicious.
    top_indexes = result_probs[:, 0].argsort()[:MAX_NUM_OF_MALICIOUS]
    limited = np.zeros(len(predictions), dtype=int)
    limited[top_indexes] = 1
    return limited


def predict_segments(segmented_data_dict, use_svd_flag=True, ngrams=10):
    '''
    Train a one-versus-all voting ensemble per user and predict that user's
    non-training segments.
    Users with index <= last_idx_usr_train are scored against the labels file and their
    metrics are appended to LABELS_OUTPUT_PATH; predictions for the remaining users are
    written into the labels frame, which is finally saved to submission_predictions_filepath.
    @param segmented_data_dict: maps '<userID>_<segmentID>' to a space separated command string
    @param use_svd_flag: forwarded to tfidf_vectorization_with_optional_svd
    @param ngrams: upper end of the n-gram range (the range used is (1, ngrams))
    @return: None
    @raise FileNotFoundError: PARTIAL_LABELS_PATH does not exist
    @raise ValueError: the number of users found differs from total_num_users_to_train
    '''
    # The labels frame is needed both for evaluation and for the submission, so fail
    # before any model is trained instead of hitting an unbound name later.
    if not os.path.isfile(PARTIAL_LABELS_PATH):
        raise FileNotFoundError("partial labels file not found: " + PARTIAL_LABELS_PATH)
    print("partial labels file found.. Loading...")
    partial_labels_df = pd.read_csv(PARTIAL_LABELS_PATH)
    user_column = partial_labels_df.columns[0]  # first column holds the 'User<N>' identifiers

    transformed_df = tfidf_vectorization_with_optional_svd(list(segmented_data_dict.values()),
                                                           list(segmented_data_dict.keys()),
                                                           ngram_range=(1, ngrams), svd_flag=use_svd_flag)

    # The row labels look like '<userID>_<segmentID>'; split them into two numeric columns.
    transformed_df.loc[:, 'userID'] = [int(key.split('_')[0]) for key in transformed_df.index]
    transformed_df.loc[:, 'segmentID'] = [int(key.split('_')[1]) for key in transformed_df.index]
    transformed_df.sort_values(by=['userID', 'segmentID'], inplace=True)

    tfidf_until_seg_train = transformed_df[transformed_df['segmentID'] < num_train_segments].copy()

    if use_avg_statistics:
        tfidf_until_seg_train = calc_statistics(tfidf_until_seg_train, segmented_data_dict, 'train')

    num_users_to_train = tfidf_until_seg_train['userID'].nunique()

    if total_num_users_to_train != num_users_to_train:
        # Raising (rather than exit()) reports the problem without tearing down the kernel.
        raise ValueError("Not Equal num of users to train! something not right with the input... Exiting!")

    train_user_ids = tfidf_until_seg_train['userID'].to_numpy()
    tfidf_train_matrix = _feature_matrix(tfidf_until_seg_train)
    print(tfidf_train_matrix)

    # One versus All - [0](benign) just for current idx, the rest are [1](malicious)
    for index in tqdm(range(num_users_to_train)):

        # Label every training row from its own userID instead of assuming a fixed
        # number of consecutive rows per user.
        train_labels = (train_user_ids != index).astype(int)

        user_idx_segs_val_test = transformed_df[
            (transformed_df['userID'] == index) & (transformed_df['segmentID'] >= num_train_segments)].copy()

        if use_avg_statistics:
            user_idx_segs_val_test = calc_statistics(user_idx_segs_val_test, segmented_data_dict, 'test')

        segs_val_test = _feature_matrix(user_idx_segs_val_test)

        # train: the ensemble is given unfitted estimators and fitted once, so the
        # members are not trained a second time beforehand.
        eclf = VotingClassifier(estimators=[(model_name, _build_classifier(model_name))
                                            for model_name in chosen_classifiers],
                                voting='soft')
        eclf.fit(tfidf_train_matrix, train_labels)

        # predict
        print("****************")
        print("User idx:" + str(index))

        pred_for_user = eclf.predict(segs_val_test)
        prediction_for_user = _limit_malicious_predictions(eclf, segs_val_test, pred_for_user)

        print(prediction_for_user)

        user_id = 'User' + str(index)
        user_rows = partial_labels_df[user_column] == user_id

        if index > last_idx_usr_train:  # test user: store the predictions for the submission
            cmd_id = num_train_segments * num_of_cmds_per_segment  # starting from 5000 (50 segments * 100 cmds per segment)
            for pred in prediction_for_user:
                # NOTE(review): assumes the label columns are named '<first cmd>-<last cmd>'
                # (e.g. '5000-5100') - confirm against the labels file.
                clmn_idx = str(cmd_id) + "-" + str(cmd_id + num_of_cmds_per_segment)
                partial_labels_df.loc[user_rows, clmn_idx] = pred
                cmd_id += num_of_cmds_per_segment

        else:  # training evaluation!
            desired_row = partial_labels_df.loc[user_rows].values
            # +1 skips the leading 'User<N>' cell; num_train_segments skips the training segments.
            labels_preds = desired_row[0][num_train_segments + 1:]
            labels_preds = np.array(labels_preds).astype(int)

            results_predict(user_id, prediction_for_user, labels_preds, LABELS_OUTPUT_PATH)

    partial_labels_df.to_csv(submission_predictions_filepath, index=False)

In [None]:
def make_average_statistics(path, num_users_to_count):
    '''
    Append one row holding the mean of every metric over the last rows of a statistics CSV.
    The existing rows of the file are kept; the new row carries userID 999999.
    @param path: statistics CSV written by results_predict
    @param num_users_to_count: how many trailing rows to average
    @return: None (prints a message when the file or one of its columns is missing)
    '''
    metric_columns = ['accuracy', 'TP', 'TN', 'FP', 'FN', 'precision', 'recall', 'f1', 'AUC']

    try:
        all_results = pd.read_csv(path, index_col=False)
        # Average only the trailing rows, but keep the whole history for the rewrite below.
        recent_results = all_results[-num_users_to_count:]
        avg_results_predict_dict = {'date': datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")}
        for column in metric_columns:
            avg_results_predict_dict[column] = recent_results[column].mean()
        avg_results_predict_dict['userID'] = 999999
    except (FileNotFoundError, KeyError, pd.errors.EmptyDataError):
        # Only "nothing to average" situations are reported here; any other failure propagates.
        print("Not found necassery information to make average... Exiting!")
        return

    avg_results_predict = pd.DataFrame([avg_results_predict_dict])
    # pd.concat replaces DataFrame.append, which newer pandas no longer provides.
    all_results = pd.concat([all_results, avg_results_predict], ignore_index=True)
    all_results.to_csv(path, index=False)
    print('added average results to statistics.csv')

In [None]:
def load_dataset(RAW_DATA_PATH, num_of_segments, num_of_cmds_per_segment):
    '''
    Read every 'User<ID>' file of a directory and cut it into fixed-size segments.
    @param RAW_DATA_PATH: directory holding one file per user, one command per line
    @param num_of_segments: number of segments to read from each file
    @param num_of_cmds_per_segment: number of lines that make up one segment
    @return: dict {'<ID>_<segment index>': commands of the segment, each followed by a space}
    '''
    # segmented_data_dict -> {'User-ID_Segment-ID':[str composed of commands which are seperated by spaces]}
    segmented_data_dict = {}
    # Sorted so the dict order does not depend on the file-system listing order.
    for filename in sorted(os.listdir(RAW_DATA_PATH)):
        filePath = os.path.join(RAW_DATA_PATH, filename)
        # Skip anything that is not a user file (sub-directories, hidden or stray files).
        if not filename.startswith('User') or not os.path.isfile(filePath):
            continue
        user_id = filename[len('User'):]
        with open(filePath) as fp:
            for i in range(num_of_segments):
                commands = []
                for j in range(num_of_cmds_per_segment):  # 100 commands each - 1 segment
                    line = fp.readline()
                    if line:
                        # Strip only the newline: the last line of a file may not have one,
                        # and slicing off the final character would then eat a real letter.
                        commands.append(line.rstrip('\n'))
                    else:
                        print("on file:" + filename + ", there is a bug on segment: " + str(
                            i + 1) + ", missing command:" + str(j) + "!")
                segmented_data_dict[user_id + "_" + str(i)] = ''.join(cmd + ' ' for cmd in commands)
    return segmented_data_dict

In [None]:
# Read every user file and cut it into fixed-size command segments.
segmented_data_dict = load_dataset(RAW_DATA_PATH=RAW_DATA_PATH,
                                   num_of_segments=num_of_segments,
                                   num_of_cmds_per_segment=num_of_cmds_per_segment)

# Train the per-user ensembles and produce the predictions / evaluation rows.
predict_segments(segmented_data_dict, use_svd_flag=svd_flag, ngrams=max_num_of_ngrams)

# User indices 0..last_idx_usr_train are the ones scored against the labels file,
# so that many trailing rows of the statistics file are averaged.
num_users_to_calc_avg = last_idx_usr_train + 1

make_average_statistics(path=LABELS_OUTPUT_PATH, num_users_to_count=num_users_to_calc_avg)