In [1]:
import lightgbm as lgb

from tsfresh import extract_features

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pickle
import numpy as np
import pandas as pd
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import extract_features, EfficientFCParameters
import collections
import math
from collections import Counter

def _extract_window(df, start, stop, n_jobs):
    '''
    Extract tsfresh features and the PID-count label for one time window.

    df: trace DataFrame with the columns "time", "pid", "address", "delta_time" and "id"
    start: inclusive lower bound of the window on the "time" column
    stop: exclusive upper bound of the window on the "time" column
    n_jobs: number of worker processes handed to tsfresh
    returns: (features, k) where features is the DataFrame returned by extract_features
             and k is the number of distinct PIDs seen inside the window
    Exceptions raised by extract_features are not caught here; the caller decides
    which ones mean "skip this window".
    '''
    window = df[(df["time"] >= start) & (df["time"] < stop)]
    features = extract_features(window[["time", "address", "delta_time", "id"]],
                                column_id="id", column_sort="time",
                                impute_function=impute, n_jobs=n_jobs, show_warnings=False,
                                default_fc_parameters=EfficientFCParameters())
    return features, len(set(window["pid"]))


def fea_extraction_new(T_num_list, name_list, addr1, addr2, addr3, addr4, num, n_jobs=100):
    '''
    Cut block traces into time windows and extract tsfresh features and PID-count labels.

    For every entry of T_num_list the files addr1 + "1.blkparse" ... addr1 + str(num - 1) + ".blkparse"
    are read, every file is cut into time windows, and one feature row plus one label
    (number of distinct PIDs in the window) is produced per window. The results of each
    entry are written to disk and also returned.

    T_num_list: dict {T_num: t_num}. The time span of the first file divided by T_num gives the
                window length T. The window stride is T when t_num is 0, otherwise T * t_num
    name_list: dict whose i-th (key, value) pair is used to name the output files of the
               i-th T_num_list entry ("<key>_<value>")
    addr1: The address (prefix) of original trace files
    addr2: The address (prefix) of output features files (.csv)
    addr3: The address (prefix) of output labels files (.out)
    addr4: The address (prefix) of output file indexes (.out)
    num: Exclusive upper bound of the file numbers; files 1 .. num - 1 are read
    n_jobs: Number of worker processes handed to tsfresh (default 100, the previous fixed value)
    returns: (x_train_all, y_train_all, index_all), each holding one item per T_num_list entry:
             the feature DataFrame, the list of labels, and the list with the number of
             windows extracted from each file
    raises: ValueError when the window length derived from the first file is not positive,
            or when not a single window could be extracted for an entry
    '''
    index_all = []
    x_train_all = []
    y_train_all = []
    names = list(name_list.items())
    for count, (T_num, t_num) in enumerate(T_num_list.items()):
        index = []
        frames = []   # one feature frame per window, concatenated once at the end
        y_train = []
        n_rows = 0
        for j in range(1, num):
            file = addr1 + str(j) + '.blkparse'
            small_trace = np.loadtxt(file, usecols=(0, 1, 3, 0))  # reading the original data files
            df = pd.DataFrame(small_trace)
            df.columns = ["time", "pid", "address", "delta_time"]
            df["id"] = 1
            # delta-time series: time of the next record minus time of this one
            # (the last row has no successor, so its delta_time is NaN)
            df["delta_time"] = df["delta_time"].shift(-1) - df["time"]
            print(df.tail(2))
            max_time = df["time"].max()
            min_time = df["time"].min()
            if j == 1:
                # The window length and stride are fixed by the first file and reused for all others
                T = (max_time - min_time) / T_num
                t = T if t_num == 0 else T * t_num
                if not t > 0:
                    raise ValueError("window stride derived from " + file + " is not positive: " + str(t))

            n = math.ceil((max_time - min_time) / t)
            print("n is:", n)
            extracted = 0
            # The window starting at min_time is always attempted, even when n is 0
            for i in range(max(n, 1)):
                try:
                    features, pid_count = _extract_window(df, min_time + i * t,
                                                          min_time + T + i * t, n_jobs)
                except (ValueError, ZeroDivisionError) as err:
                    # Best effort: a window tsfresh cannot process is left out of the data set
                    # (presumably empty windows or NaN values - not verified here)
                    print("skipping window", i, "of file", j, "-", err)
                    continue
                frames.append(features)
                y_train.append(pid_count)
                extracted += 1
                n_rows += len(features)
                print((n_rows, features.shape[1]))
                print(len(y_train))
                print(j)

            # Number of windows this file contributed, so rows can be mapped back to files
            index.append(extracted)

        if not frames:
            raise ValueError("no feature window could be extracted for T_num=" + str(T_num)
                             + " (num=" + str(num) + ")")
        # DataFrame.append no longer exists in pandas 2.x; a single concat also avoids
        # re-copying the growing frame for every window
        X_train = pd.concat(frames)
        print(len(y_train))
        print(X_train.shape)
        print(sum(index))
        index_all.append(index)
        x_train_all.append(X_train)
        y_train_all.append(y_train)
        print(len(x_train_all))
        name_key, name_value = names[count]
        suffix = str(name_key) + '_' + str(name_value)
        X_train.to_csv(path_or_buf=addr2 + suffix + ".csv", index=False)
        np.savetxt(addr3 + suffix + '.out', y_train, delimiter=',')
        np.savetxt(addr4 + suffix + '.out', index, delimiter=',')
    return x_train_all, y_train_all, index_all



This means that when installing LightGBM from PyPI via the ``pip install lightgbm`` command, you no longer need to install the gcc compiler.
Instead, you need to install the OpenMP library, which is required to run LightGBM on systems that use the Apple Clang compiler.
You can install the OpenMP library with the following command: ``brew install libomp``.


In [None]:
# Use case: segment the FIU webmail traces into time windows of 30 and 360 minutes.
# The first file's time span is split into 48 and 4 windows respectively; with
# num=21 the trace files numbered 1 through 20 are read.
T_num_list = {48: 0, 4: 0}
name_list = {30: 0, 360: 0}
x_all, y_all, index_all = fea_extraction_new(
    T_num_list,
    name_list,
    addr1='webmail/webmail.cs.fiu.edu-110108-113008.',
    addr2='new_webmail_x_',
    addr3='new_webmail_y_',
    addr4='new_webmail_index_',
    num=21,
)