In [8]:
# Source files
# All three CSV files live in one data directory. The directory defaults to the
# original location but can be redirected with the CHALLENGE_DATA_DIR
# environment variable, so the notebook is runnable on other machines without
# editing this cell.
import os

DATA_DIR = os.environ.get("CHALLENGE_DATA_DIR", "/home/nakaolab/Desktop/challenge/data")

training_data_a = os.path.join(DATA_DIR, "training-data_a.csv")
training_data_c = os.path.join(DATA_DIR, "training-data_c.csv")
test_data_c = os.path.join(DATA_DIR, "test-data_c.csv")

In [9]:
# 0. Required packages
import pandas as pd
from sklearn import preprocessing

## 1. Read source data

In [9]:
# 1. Read source data -- functions
def training_data_from_csv_file(training_data_from_a_param, training_data_from_c_param, proportion_param):
    """
    Load the two training CSV files and merge them into a single training set.

    Every row of training_data_a is kept; from training_data_c only a fraction
    of the rows (controlled by proportion_param) is appended after them.
    :param training_data_from_a_param: Path of the training_data_a CSV file
    :param training_data_from_c_param: Path of the training_data_c CSV file
    :param proportion_param: Fraction of training_data_c rows to include
      (handed to DataFrame.sample as ``frac``)
    :return: Three dataframes
      train_data_concatenated_dataframe: All columns of the merged training rows
      train_data_data_dataframe: The merged rows restricted to the columns from position 3 onward
      train_data_text_label_dataframe: Single-column dataframe holding the "y_true(fc)" text label
    """

    # Load both sources
    frame_a = pd.read_csv(filepath_or_buffer=training_data_from_a_param)
    frame_c = pd.read_csv(filepath_or_buffer=training_data_from_c_param)

    # Three successive full shuffles with fixed seeds; the final row order
    # depends on all of them, so the sequence is kept exactly
    for shuffle_seed in (10, 20, 30):
        frame_a = frame_a.sample(frac=1, random_state=shuffle_seed)
        frame_c = frame_c.sample(frac=1, random_state=shuffle_seed)

    # Draw the requested share of rows from training_data_c
    sampled_c = frame_c.sample(frac=proportion_param, random_state=42)

    # Reports the shapes of the full A and full C frames (C before sampling)
    print("Training A and C shape:", frame_a.shape, frame_c.shape)

    # A rows first, sampled C rows after; original row indices are retained
    merged = pd.concat([frame_a, sampled_c])

    # Data columns
    metrics = merged.iloc[:, 3:]

    # Label column in text format
    text_labels = pd.DataFrame(data=merged["y_true(fc)"])

    return merged, metrics, text_labels


def test_data_from_csv_file(test_data_csv_file_path_param):
    """
    Read data from CSV file containing test data into dataframe
    :param test_data_csv_file_path_param: The absolute path of CSV file containing test data
    :return: Three dataframes
      test_data_csv_file_contents_dataframe: Dataframe containing the entire CSV contents
      test_data_training_data_dataframe: Dataframe containing metrics (data)
      test_data_text_label_dataframe: Dataframe containing label in text format
    """

    # Read test data from CSV file
    test_data_csv_file_contents_dataframe = pd.read_csv(filepath_or_buffer=test_data_csv_file_path_param)

    # Shuffle the data
    test_data_csv_file_contents_dataframe = test_data_csv_file_contents_dataframe.sample(frac=1, random_state=10)
    test_data_csv_file_contents_dataframe = test_data_csv_file_contents_dataframe.sample(frac=1, random_state=10)

    # Data columns
    test_data_data_dataframe = test_data_csv_file_contents_dataframe.iloc[:, 3:]

    # Label column in text format
    test_data_text_label_dataframe = pd.DataFrame(data=test_data_csv_file_contents_dataframe["y_true(fc)"])

    print("Test C shape:",test_data_csv_file_contents_dataframe.shape)

    return test_data_csv_file_contents_dataframe, test_data_data_dataframe, test_data_text_label_dataframe

In [10]:
# 1. Read source data -- execution
# Merged training set: every row of training-a plus all rows of training-c
# (proportion_param=1 keeps 100% of training-c).
(
    return_train_data_concatenated_dataframe,
    return_train_data_data_dataframe,
    return_train_data_text_label_dataframe,
) = training_data_from_csv_file(
    training_data_from_a_param=training_data_a,
    training_data_from_c_param=training_data_c,
    proportion_param=1,
)

# Test set read from test-c
(
    return_test_data_csv_file_contents_dataframe,
    return_test_data_data_dataframe,
    return_test_data_text_label_dataframe,
) = test_data_from_csv_file(test_data_csv_file_path_param=test_data_c)

# Shape summary, one line per dataset: (all columns) (data columns) (label column)
print(
    return_train_data_concatenated_dataframe.shape,
    return_train_data_data_dataframe.shape,
    return_train_data_text_label_dataframe.shape,
)
print(
    return_test_data_csv_file_contents_dataframe.shape,
    return_test_data_data_dataframe.shape,
    return_test_data_text_label_dataframe.shape,
)

Training A and C shape: (3645, 4121) (460, 4121)
Test C shape: (873, 4121)
(4105, 4121) (4105, 4118) (4105, 1)
(873, 4121) (873, 4118) (873, 1)


## 2. Generate datasets

In [12]:
# 2. Generate dataset  --functions
def encode_label(text_label_series_param):
    """
    Encode text class labels as integer codes using a freshly fitted LabelEncoder.

    A new encoder is fitted on every call, so the text-to-code mapping is
    derived only from the labels passed in.
    :param text_label_series_param: Text labels as a pandas Series (a
      single-column DataFrame is accepted as well); must expose ``.shape``
    :return: The value returned by LabelEncoder.fit_transform: a 1-D array of
      integer codes, one per input label, in input order. (The local name says
      "dataframe" for historical reasons; it is not a DataFrame.)
    """

    # Instantiate label encoder
    label_encoder = preprocessing.LabelEncoder()

    print("Label shape:", text_label_series_param.shape)

    # LabelEncoder works on a 1-D sequence of labels. Handing it an (n, 1)
    # DataFrame triggers the "y = column_or_1d(y, warn=True)" conversion
    # warning, so the labels are reduced to their single column first.
    labels_1d = pd.DataFrame(data=text_label_series_param).iloc[:, 0]

    # Encode the text label. The argument is passed positionally: the
    # documented parameter name of LabelEncoder.fit_transform is `y`, so the
    # former `X=` keyword is not portable across scikit-learn versions.
    encoded_label_dataframe = label_encoder.fit_transform(labels_1d)

    return encoded_label_dataframe


def create_dataset(entire_dataset_param):
    """
    Build a single frame with the integer-encoded label followed by the data columns.

    :param entire_dataset_param: DataFrame whose column at position 2 holds the
      text label and whose columns from position 3 onward hold the data
    :return: DataFrame with "encoded_label" as the first column, then the data
      columns, on a fresh 0..n-1 row index
    """
    # NOTE(review): encode_label fits a new encoder on each call, so the integer
    # codes of two datasets built by separate calls only line up if both contain
    # the same set of labels -- confirm this holds for the train/test pair.

    # Encode label (text label is taken by position, not by column name)
    text_labels = entire_dataset_param.iloc[:, 2]
    label_codes = encode_label(text_label_series_param=text_labels)
    label_frame = pd.DataFrame(data=label_codes, columns=["encoded_label"])
    label_frame = label_frame.reset_index(drop=True)

    # Data columns, re-indexed so they align row-by-row with the label frame
    feature_frame = entire_dataset_param.iloc[:, 3:]
    feature_frame = feature_frame.reset_index(drop=True)

    # Side-by-side concatenation: encoded label first, data columns after
    combined_frame = pd.concat([label_frame, feature_frame], axis=1)

    print("Encode label and data shape:", combined_frame.shape)

    return combined_frame

In [15]:
# 2. Generate dataset -- execution
# The source data is read again here so this cell does not rely on variables
# left behind by an earlier cell.

# Training-a and training-c
(return_train_data_concatenated_dataframe, return_train_data_data_dataframe,
    return_train_data_text_label_dataframe) = training_data_from_csv_file(
    training_data_from_a_param=training_data_a, training_data_from_c_param=training_data_c,
    proportion_param=1)

# Keep the encoded training dataset: previously the result of create_dataset
# was discarded, so it was neither displayed nor available to later cells.
return_train_encoded_dataset_dataframe = create_dataset(
    entire_dataset_param=return_train_data_concatenated_dataframe)

# Test-c
(return_test_data_csv_file_contents_dataframe, return_test_data_data_dataframe,
    return_test_data_text_label_dataframe) = test_data_from_csv_file(
    test_data_csv_file_path_param=test_data_c)

# Keep the encoded test dataset as well
return_test_encoded_dataset_dataframe = create_dataset(
    entire_dataset_param=return_test_data_csv_file_contents_dataframe)

# Last expression of the cell: display the encoded test dataset, as before
return_test_encoded_dataset_dataframe

Training A and C shape: (3645, 4121) (460, 4121)
Label shape: (4105,)
Encode label and data shape: (4105, 4119)


  y = column_or_1d(y, warn=True)


Test C shape: (873, 4121)
Label shape: (873,)
Encode label and data shape: (873, 4119)


  y = column_or_1d(y, warn=True)


Unnamed: 0,encoded_label,amfx1_ens3_oper-status,amfx1_ens3_phys-address,amfx1_ens3_speed,amfx1_ens3_statistics.in-octets,amfx1_ens3_statistics.in-unicast-pkts,amfx1_ens3_statistics.out-octets,amfx1_ens3_statistics.out-unicast-pkts,amfx1_ens3_oper-status_value,amfx1_ens3_phys-address_value,...,upfx7_memory-stats.free-percent_value,upfx7_memory-stats.memory-status_value,upfx7_memory-stats.used-number_value,upfx7_memory-stats.used-percent_value,upfx7_per-core-stats.per-core-stat.idle_value,upfx7_per-core-stats.per-core-stat.io-wait_value,upfx7_per-core-stats.per-core-stat.nice_value,upfx7_per-core-stats.per-core-stat.sirq_value,upfx7_per-core-stats.per-core-stat.system_value,upfx7_per-core-stats.per-core-stat.user_value
0,10,0,0,0,1,1,1,1,0,0,...,1,0,-26452,-1,0.015,-0.160,0,0.0,0.015,4.500000e-02
1,15,0,0,0,1,1,1,1,0,0,...,1,0,-15804,-1,-0.815,1.360,0,0.0,-0.005,1.350000e-01
2,10,0,0,0,1,1,1,1,0,0,...,0,0,1248,0,-0.060,0.015,0,0.0,-0.035,8.500000e-02
3,10,0,0,0,1,1,1,1,0,0,...,2,0,-32624,-2,0.045,0.000,0,0.0,0.090,-1.800000e-01
4,10,0,0,0,1,1,1,1,0,0,...,1,0,-15240,-1,-0.250,0.120,0,0.0,0.105,9.000000e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,10,0,0,0,1,1,1,1,0,0,...,0,0,116,0,-0.050,0.100,0,0.0,0.090,-4.000000e-02
869,12,0,0,0,1,1,1,1,0,0,...,-1,0,10584,1,0.115,0.000,0,-0.1,0.000,-6.000000e-02
870,10,0,0,0,1,1,1,1,0,0,...,-1,0,10448,1,0.505,-0.010,0,0.0,-0.070,-4.300000e-01
871,10,0,0,0,1,1,1,1,0,0,...,1,0,-18256,-1,-0.135,0.085,0,0.0,0.080,-2.780000e-17
