In [2]:
import pandas as pd
import csv
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, mutual_info_regression

import sys
workDir = "path/to/datasets/directory"
sys.path.append(workDir + "/src/")

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [5]:
# feature selection on iperf dataset with sklearn
filenameDl = workDir + "/datasets/20190426_iperf_samsung.csv"
data_iperf_raw = pd.read_csv(filepath_or_buffer=filenameDl, sep=',', decimal='.', parse_dates=['timestamp'])
print(data_iperf_raw.shape)

(102861, 105)


  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# drop columns with NaN values exclusively
maxNaCount = len(data_iperf_raw) - int(0.9 * len(data_iperf_raw))
data_iperf_raw = data_iperf_raw.dropna(axis="columns", how="all", inplace=False, thresh=maxNaCount)
print(data_iperf_raw.shape)

(102861, 94)


In [7]:
# izbris drop constant object columns
objColumns = data_iperf_raw.select_dtypes(include=['object']).columns
dropColnames = data_iperf_raw.loc[:, [col for col in data_iperf_raw[objColumns].columns if data_iperf_raw[col].nunique() <= 1]].columns
data_iperf_raw.drop(dropColnames, axis="columns", inplace=True)

print(data_iperf_raw.shape)

(102861, 89)


In [8]:
_stopColDrop = [col for col in data_iperf_raw.columns if "_stop" in col]
data_iperf_raw.drop(_stopColDrop, axis="columns", inplace=True)
print(data_iperf_raw.shape)

(102861, 65)


In [10]:
data_iperf_raw[data_iperf_raw.select_dtypes(include=['object']).columns] = data_iperf_raw.\
    select_dtypes(include=[object]).apply(lambda x: x.astype("category"))

print("Number of different categories per column:")
data_iperf_raw.select_dtypes(['category']).apply(lambda x: print(x.name, ":   ", x.cat.codes.max() + 1))

# select only columns with more than 1 unique value
# data_iperf_raw = data_iperf_raw.loc[:, [col for col in data_iperf_raw.columns if data_iperf_raw[col].nunique() > 1]]

catColumns = data_iperf_raw.select_dtypes(include=["category"]).columns
dropColnames = data_iperf_raw.loc[:, [col for col in data_iperf_raw[catColumns].columns if data_iperf_raw[col].cat.codes.max() < 1]].columns
data_iperf_raw.drop(dropColnames, axis="columns", inplace=True)

# categories - replace categorical strings with ids
dataAttributeSelection = data_iperf_raw
catColumns = data_iperf_raw.select_dtypes(include=["category"]).columns
dataAttributeSelection[catColumns] = dataAttributeSelection[catColumns].apply(lambda x: x.cat.codes)

outputToCsv = False
if outputToCsv:
    outCsv = workDir + "/datasets/20190518_feature_selection_iperf.csv"
    data_iperf_raw.to_csv(path_or_buf=outCsv, sep=',', decimal='.', quoting=csv.QUOTE_NONNUMERIC,
                          encoding='utf-8', index=True)


print(data_iperf_raw.shape)

Number of different categories per column:
(102861, 65)
 1    44328
 0    32151
 2    23228
-1     3154
Name: radio_lte_ca_state_start, dtype: int64
(102861, 65)


In [12]:
# DL and UL iperf tests are significantly different: some attributes are NaN for DL
mode = "DL"
protocol = "TCP"

directionDlCode = 0
tcpProtocolCode = 0

if mode == "DL":
    dataDirectional = dataAttributeSelection.loc[dataAttributeSelection.direction == directionDlCode]
else:
    dataDirectional = dataAttributeSelection.loc[dataAttributeSelection.direction != directionDlCode]

if protocol == "TCP":
    dataDirectional = dataDirectional.loc[dataDirectional.protocol == tcpProtocolCode]

print(dataDirectional.shape)

# keep columns with less that 10% of NaNs
maxNaCount = int(dataDirectional.shape[0] * 0.10)
naColDrop = dataDirectional.loc[:, dataDirectional.isna().sum() > maxNaCount].columns
dataDirectional.drop(naColDrop, axis="columns", inplace=True)

# finally, all the rows containing NaN values can be dropped
dataDirectional = dataDirectional.dropna(axis="rows", how="any")
print(dataDirectional.shape)

(32518, 65)
(31688, 47)


In [16]:
# select a column to act as a label
dataDirectional = dataDirectional[["kbps", "radio_rsrq_db_start", "radio_sinr_db_start", "timestamp", "radio_lte_ca_state_start",
                                   "radio_rssi_dbm_start", "transfer_bytes", "radio_rsrp_dbm_start", "protocol", "hash"]].dropna(axis="rows", how="any")
labelColname = "kbps"
X = dataDirectional.drop(labelColname, axis="columns")
X = X.drop("timestamp", axis="columns")
Y = dataDirectional[labelColname]

print(X.shape)
print(Y.shape)

selector = SelectKBest(f_regression, k='all')

fit = selector.fit(X, Y)
res = pd.DataFrame(index=[0], columns=X.columns)
# print(fit.scores_)
res.iloc[0] = fit.scores_

for col in res.columns:
    print("\n", res[col])


(31688, 8)
(31688,)

 0    4142.22
Name: radio_rsrq_db_start, dtype: object

 0    10006.1
Name: radio_sinr_db_start, dtype: object

 0    462.037
Name: radio_lte_ca_state_start, dtype: object

 0    1367.11
Name: radio_rssi_dbm_start, dtype: object

 0    62342.8
Name: transfer_bytes, dtype: object

 0    3.26069
Name: radio_rsrp_dbm_start, dtype: object

 0    NaN
Name: protocol, dtype: object

 0    101.447
Name: hash, dtype: object
