In [99]:
import csv, math
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [93]:
# df: dataframe containing features to be encoded
# columns: list of columns to be encoded
def one_hot_encode(df, columns):
    ohe = OneHotEncoder()
    ohe_features = pd.DataFrame(ohe.fit_transform(df[columns]).toarray())
    ohe_features.columns = ohe.get_feature_names()
    df = pd.concat([df, ohe_features], axis=1)
    df = df.drop(columns = columns)
    return df

# df: dataframe containing features to be encoded
# columns: list of columns to be encoded
def label_encode(df, columns):
    le = LabelEncoder()
    df[columns] = df[columns].apply(le.fit_transform)
    return df

def extract_time_features(df):
    df['month'] = pd.DatetimeIndex(df['timestamp']).month
    df['day'] = pd.DatetimeIndex(df['timestamp']).day
    df['hour'] = pd.DatetimeIndex(df['timestamp']).hour
    df['minute'] = pd.DatetimeIndex(df['timestamp']).minute
    df['second'] = pd.DatetimeIndex(df['timestamp']).second
    df = df.drop(['timestamp'], axis=1)
    return df

In [109]:
f = '../data/Scorecard-109789.csv'

# column headers
cols = ['account','category','provider','device','fragments','geo','predictorScore','hazardScore','averagePIDX',
        'averageBufferEvents','averageBitrateShifts','asn','timestamp','hrtime']

# all features
features = ['hazardScore', 'averagePIDX', 'averageBufferEvents', 'averageBitrateShifts',
        'timestamp', 'provider', 'device', 'asn', 'fragments', 'geo']

# read all data with column names
df = pd.read_csv(f, header=None, names=cols)

# get dataframe with just features
df_features = df[features]

df_features.head(100)

Unnamed: 0,hazardScore,averagePIDX,averageBufferEvents,averageBitrateShifts,timestamp,provider,device,asn,fragments,geo
0,0.0,0.000000,0.000000,0.000000,1591314692617,akamai,windows_br5_be5,1,209,6254928
1,0.0,0.000000,0.000000,0.000000,1591314692617,akamai,windows_br5_be5,2,27231,6254928
2,0.0,71.287185,0.036364,0.000000,1591314692617,akamai,macos_core,55,701,6254928
3,0.0,0.000000,0.000000,0.000000,1591314692617,akamai,ipados;apple_core_media,5,7018,-1
4,0.0,98.421852,0.030303,0.030303,1591314692617,comcast,windows_br5_be5,33,11426,4597040
...,...,...,...,...,...,...,...,...,...,...
95,0.0,76.273083,0.000000,0.000000,1591314692617,level3,windows;edge_(chromium)_for_windows,28,11232,-1
96,0.0,122.510173,0.000000,0.000000,1591314692617,akamai,macos_chrome,12,209,5417618
97,0.0,273.505144,0.009346,0.009346,1591314692617,akamai,windows_br5_be5,107,20115,5279468
98,0.0,123.919297,0.017241,0.000000,1591314692617,level3,ios;apple_core_media,58,20115,-1


In [102]:
df.shape[0]

28856

In [105]:
# convert timestamp to datetime
df_features['timestamp'] = pd.to_datetime(df_features['timestamp'], unit='ms')
df_features = extract_time_features(df_features)

# one hot encode categorical features
cat_features = ['provider', 'device', 'geo']
df_encoded = one_hot_encode(df_features, cat_features)

df_encoded.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

Unnamed: 0,hazardScore,averagePIDX,averageBufferEvents,averageBitrateShifts,asn,fragments,month,day,hour,minute,...,x2_5744337,x2_5769223,x2_5815135,x2_5843591,x2_5855797,x2_5879092,x2_6254925,x2_6254926,x2_6254927,x2_6254928
0,0.0,0.0,0.0,0.0,1,209,6,4,23,51,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,2,27231,6,4,23,51,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,71.287185,0.036364,0.0,55,701,6,4,23,51,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,5,7018,6,4,23,51,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,98.421852,0.030303,0.030303,33,11426,6,4,23,51,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [106]:
num_features = len(df_encoded.columns)
num_features

105

In [128]:
stride = 1
sequence_len = 10
num_samples = math.ceil((df.shape[0]-sequence_len)/stride + 1) - 1

# construct input layer data
data = np.zeros((num_samples, sequence_len, num_features))
labels = df.hazardScore[sequence_len::stride]

In [37]:
# TODO: fix logic

sample_i = 0
for i, row in df_encoded.iterrows():
    if i > 0 and i%sequence_len == 0:
        sample_i += 1
    timestep_i = i%sequence_len
    data[sample_i,timestep_i] = row
    print(i, sample_i, timestep_i)
    print(row)

0 0 0
averagePIDX                                           3.615138e+02
averageBufferEvents                                   0.000000e+00
averageBitrateShifts                                  0.000000e+00
timestamp                                             1.584378e+12
asn                                                   4.050300e+04
fragments                                             5.000000e+00
provider_akamai                                       1.000000e+00
provider_cloudfront                                   0.000000e+00
device_android;chrome_mobile                          0.000000e+00
device_android;cm_browser                             0.000000e+00
device_android;ecosia_for_android                     0.000000e+00
device_android;exoplayer_for_android                  0.000000e+00
device_android;firefox_for_mobile                     0.000000e+00
device_android;kindle-silk                            0.000000e+00
device_android;opera_mobile                           0.

Name: 133, dtype: float64
134 13 4
averagePIDX                                           7.413678e+02
averageBufferEvents                                   4.990000e-03
averageBitrateShifts                                  4.990000e-03
timestamp                                             1.584378e+12
asn                                                   2.277300e+04
fragments                                             1.002000e+03
provider_akamai                                       1.000000e+00
provider_cloudfront                                   0.000000e+00
device_android;chrome_mobile                          0.000000e+00
device_android;cm_browser                             0.000000e+00
device_android;ecosia_for_android                     0.000000e+00
device_android;exoplayer_for_android                  0.000000e+00
device_android;firefox_for_mobile                     0.000000e+00
device_android;kindle-silk                            0.000000e+00
device_android;opera_mobile

Name: 244, dtype: float64
245 24 5
averagePIDX                                           1.407241e+02
averageBufferEvents                                   0.000000e+00
averageBitrateShifts                                  0.000000e+00
timestamp                                             1.584378e+12
asn                                                   1.083500e+04
fragments                                             1.000000e+01
provider_akamai                                       1.000000e+00
provider_cloudfront                                   0.000000e+00
device_android;chrome_mobile                          0.000000e+00
device_android;cm_browser                             0.000000e+00
device_android;ecosia_for_android                     0.000000e+00
device_android;exoplayer_for_android                  0.000000e+00
device_android;firefox_for_mobile                     0.000000e+00
device_android;kindle-silk                            0.000000e+00
device_android;opera_mobile

Name: 348, dtype: float64
349 34 9
averagePIDX                                           4.064654e+02
averageBufferEvents                                   2.439020e-02
averageBitrateShifts                                  2.439020e-02
timestamp                                             1.584378e+12
asn                                                   1.079600e+04
fragments                                             4.100000e+01
provider_akamai                                       0.000000e+00
provider_cloudfront                                   1.000000e+00
device_android;chrome_mobile                          0.000000e+00
device_android;cm_browser                             0.000000e+00
device_android;ecosia_for_android                     0.000000e+00
device_android;exoplayer_for_android                  0.000000e+00
device_android;firefox_for_mobile                     0.000000e+00
device_android;kindle-silk                            0.000000e+00
device_android;opera_mobile

424 42 4
averagePIDX                                           8.932117e+01
averageBufferEvents                                   0.000000e+00
averageBitrateShifts                                  0.000000e+00
timestamp                                             1.584378e+12
asn                                                   4.642600e+04
fragments                                             2.400000e+01
provider_akamai                                       1.000000e+00
provider_cloudfront                                   0.000000e+00
device_android;chrome_mobile                          0.000000e+00
device_android;cm_browser                             0.000000e+00
device_android;ecosia_for_android                     0.000000e+00
device_android;exoplayer_for_android                  0.000000e+00
device_android;firefox_for_mobile                     0.000000e+00
device_android;kindle-silk                            0.000000e+00
device_android;opera_mobile                          

Name: 498, dtype: float64
499 49 9
averagePIDX                                           6.586718e+02
averageBufferEvents                                   0.000000e+00
averageBitrateShifts                                  0.000000e+00
timestamp                                             1.584378e+12
asn                                                   1.187800e+04
fragments                                             7.000000e+00
provider_akamai                                       1.000000e+00
provider_cloudfront                                   0.000000e+00
device_android;chrome_mobile                          0.000000e+00
device_android;cm_browser                             0.000000e+00
device_android;ecosia_for_android                     0.000000e+00
device_android;exoplayer_for_android                  1.000000e+00
device_android;firefox_for_mobile                     0.000000e+00
device_android;kindle-silk                            0.000000e+00
device_android;opera_mobile