In [77]:
import sqlite3
from pathlib import Path

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

# CONFIG
DB_PATH = '1_after_cleaning_databases/1000_devices_cleaned.db'
TABLE_NAME = 'css_attribute'
OUTPUT_NAME = 'feature_matrix'

## PANDAS
pd.set_option('future.no_silent_downcasting', True)
# Set pandas options to display the full DataFrame
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Disable line wrapping
pd.set_option('display.max_colwidth', None)  # Show full content in each column
pd.set_option('display.max_seq_item', 100000)  # Show all items in a sequence (e.g., lists, dictionaries)

# Optionally, increase the display buffer size
pd.set_option('display.max_info_columns', 100000)  # Show all columns in df.info()

# If you have very large dataframes, increasing this might help
pd.set_option('display.max_rows', 1000)  # You can change 1000 to a higher value if needed

### Functions

#### Loading data from sqlite to pandas dataframe

In [78]:
def load_sqlite_to_dataframe(db_path, table_name):
    """Read every row of one SQLite table into a pandas DataFrame.

    Args:
        db_path: Path of the SQLite database file.
        table_name: Name of the table to read. It is interpolated into the
            SQL text as-is (identifiers cannot be bound as parameters), so it
            must come from trusted configuration, never from user input.

    Returns:
        A DataFrame with one row per table row and one column per table column.

    Raises:
        Whatever ``pd.read_sql`` raises when the query fails (for example when
        the table does not exist). The connection is closed in that case too.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql(f"SELECT * FROM {table_name}", conn)
    finally:
        # Always release the database handle, even when the query fails
        conn.close()

#### Changing boolean values to 1 and 0

In [79]:
def convert_booleans(value):
    """Translate the textual flags 'enabled' / 'disabled' into 1 / 0.

    The comparison ignores letter case. Any other string, and any value that
    is not a string at all, is handed back unchanged.
    """
    # Non-strings can never be one of the flag labels
    if not isinstance(value, str):
        return value

    flag_by_label = {'enabled': 1, 'disabled': 0}
    return flag_by_label.get(value.lower(), value)

#### Constructing feature matrix

In [80]:
def create_feature_matrix(df):
    """Pivot long-format attribute rows into one feature row per session.

    Args:
        df: Long-format frame with the columns 'session_id', 'source',
            'attribute' and 'value' (one row per recorded attribute).

    Returns:
        A DataFrame with a 'session_id' column, one column per non-font
        attribute named '<source>:<attribute>', and a 'fonts' column holding
        the session's fonts as a sorted, de-duplicated, ';'-separated string
        (spaces replaced by underscores). Sessions that have no usable font
        rows get NaN in 'fonts'. Only sessions that have at least one
        non-font row appear, because fonts are left-merged onto the pivot.
    """
    # Handle fonts: collapse all font rows of a session into a single string.
    # Rows with a missing font value are dropped first; a None/NaN inside the
    # set would make sorted() / ';'.join() raise TypeError.
    fonts = (
        df[df['attribute'] == 'font']
        .dropna(subset=['value'])
        .groupby('session_id')['value']
        .apply(lambda x: ';'.join(sorted(set(x))))
        .reset_index()
    )

    # replace spaces in font names with underscores
    fonts['value'] = fonts['value'].str.replace(' ', '_', regex=False)

    # Rest of features: every non-font attribute becomes its own column
    non_fonts = df[df['attribute'] != 'font'].copy()
    non_fonts['feature_name'] = non_fonts['source'] + ':' + non_fonts['attribute']

    # Apply boolean conversion ('enabled' / 'disabled' -> 1 / 0)
    non_fonts['value'] = non_fonts['value'].apply(convert_booleans)

    # Pivot table; when a session reports the same feature more than once,
    # aggfunc='first' keeps the first non-null value
    pivot = non_fonts.pivot_table(index='session_id',
                                  columns='feature_name',
                                  values='value',
                                  aggfunc='first').reset_index()

    # Merge fonts back; the merged-in 'value' column holds the font string
    full_df = pivot.merge(fonts, on='session_id', how='left')

    return full_df.rename(columns={'value': 'fonts'})

#### Changing font values to one-hot encoding

In [81]:
def vectorize_fonts(df, font_column='fonts'):
    """Replace the ';'-separated font column with one indicator column per font.

    Args:
        df: Frame containing ``font_column``; missing values are treated as
            "no fonts".
        font_column: Name of the column holding the ';'-separated font names.

    Returns:
        A new DataFrame without ``font_column`` and with one 0/1 column
        'css:font_<name>' per font in the fitted vocabulary, plus
        'css:font_vector': the row's indicator values concatenated into a
        single string, in vocabulary column order.
    """
    vectorizer = CountVectorizer(
        tokenizer=lambda x: [token.strip() for token in x.split(';') if token.strip()],
        # token_pattern is ignored when a tokenizer is given; None avoids the
        # "will not be used" warning scikit-learn emits otherwise
        token_pattern=None,
        # Presence/absence only: names differing only in case collapse to one
        # token (lowercasing is on by default), and a count >= 10 would break
        # the one-character-per-font layout of 'css:font_vector'
        binary=True,
    )
    font_features = vectorizer.fit_transform(df[font_column].fillna(''))

    font_df = pd.DataFrame(
        font_features.toarray(),
        columns=[f'css:font_{f}' for f in vectorizer.get_feature_names_out()],
        index=df.index
    )

    # Compact fingerprint: all indicator digits of the row glued together
    font_df['css:font_vector'] = font_df.astype(str).agg(''.join, axis=1)

    df = df.drop(columns=[font_column]).join(font_df)
    return df

#### Cleaning feature types

In [82]:
def clean_feature_types(df):
    """Coerce measurement columns to numbers and fill missing values.

    Columns whose name starts with 'css:env' or 'js:env', or contains 'width'
    or 'height', are converted with ``pd.to_numeric(errors='coerce')``, so
    unparseable entries become NaN. Afterwards NaN in numeric columns is
    replaced by -1 and missing values in object columns by ''.

    Args:
        df: Feature matrix to clean. It is not modified.

    Returns:
        A cleaned copy of ``df``.
    """
    # Work on a copy so the caller's frame (and anything already displayed
    # from it) is left untouched
    df = df.copy()

    for col in df.columns:
        # TODO: revisit this when more attributes are added; matching on
        # column names like this is not very universal
        if col.startswith(('css:env', 'js:env')) or 'width' in col or 'height' in col:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Fill missing numeric with -1
    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].fillna(-1)

    # Fill missing non-numeric with empty string
    object_cols = df.select_dtypes(include='object').columns
    df[object_cols] = df[object_cols].fillna('')

    df = df.infer_objects()

    return df

#### Save dataframe to parquet and CSV

In [83]:
def save_features(df, output_dir='2_after_feature_extraction'):
    """Write the feature matrix to disk as both Parquet and CSV.

    The files are named '<OUTPUT_NAME>.parquet' and '<OUTPUT_NAME>.csv'.

    Args:
        df: Feature matrix to persist.
        output_dir: Directory that receives both files. It is created
            (including parents) when it does not exist yet.
    """
    out_dir = Path(output_dir)
    # Writing into a missing directory would fail, so create it up front
    out_dir.mkdir(parents=True, exist_ok=True)

    df.to_parquet(out_dir / f'{OUTPUT_NAME}.parquet')
    # The CSV copy omits the index
    df.to_csv(out_dir / f'{OUTPUT_NAME}.csv', index=False)

### Execution

In [84]:
# Load data: read all rows of TABLE_NAME from the SQLite database at DB_PATH
df_raw = load_sqlite_to_dataframe(DB_PATH, TABLE_NAME)

In [85]:
# Create feature matrix: one row per session, non-font attributes as
# '<source>:<attribute>' columns and the session's fonts in a single 'fonts' column
df_features = create_feature_matrix(df_raw)

In [86]:
# Preview only the first rows instead of rendering the whole feature matrix
df_features.head(10)

Unnamed: 0,session_id,browserstack:browser,browserstack:browser_version,browserstack:os,browserstack:os_version,browserstack:real_height,browserstack:real_width,css:User-Agent,css:browser,css:env-1-height,css:env-1-width,css:env-10-height,css:env-10-width,css:env-11-height,css:env-11-width,css:env-12-height,css:env-12-width,css:env-13-height,css:env-13-width,css:env-14-height,css:env-14-width,css:env-2-height,css:env-2-width,css:env-3-height,css:env-3-width,css:env-5-height,css:env-5-width,css:env-6-height,css:env-6-width,css:env-7-height,css:env-7-width,css:env-8-height,css:env-8-width,css:env-9-height,css:env-9-width,css:image-set-heif,css:javascript,css:px_per_px,css:system,css:viewport_height,css:viewport_width,js:env-1-container-height,js:env-1-container-width,js:env-10-container-height,js:env-10-container-width,js:env-11-container-height,js:env-11-container-width,js:env-12-container-height,js:env-12-container-width,js:env-13-container-height,js:env-13-container-width,js:env-14-container-height,js:env-14-container-width,js:env-2-container-height,js:env-2-container-width,js:env-3-container-height,js:env-3-container-width,js:env-4-container-height,js:env-4-container-width,js:env-5-container-height,js:env-5-container-width,js:env-6-container-height,js:env-6-container-width,js:env-7-container-height,js:env-7-container-width,js:env-8-container-height,js:env-8-container-width,js:env-9-container-height,js:env-9-container-width,fonts
0,0041b5e2-06c3-43a8-a837-cc0f23e9d1e9,chrome,135.0 beta,OS X,Sonoma,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",chrome,136,195,166,712,175.0,1100.0,123,6762,125.0,88.0,130.0,78.0,188,403,110,133,100,600,164,183,123,554,78.0,587.0,115,681,0.0,1,1,linux,8,12,136,196,166,712,176,1202,123,6762,125,88,130,78,188.0,403.0,110,134,150,300,100,600,164,183,123.0,554.0,78,588,115.0,681.0,Menlo
1,006ec0f9-9ad5-457d-8d6a-df7958ec7975,chrome,134.0,Windows,11,1012,945,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",chrome,136,190,166,732,175.0,1100.0,126,6970,130.0,86.0,130.0,78.0,189,403,110,133,100,600,164,170,126,550,78.0,585.0,115,650,0.0,1,1,windows,9,9,136,191,166,732,176,1323,126,6971,130,87,130,78,189.0,403.0,110,134,150,300,100,600,164,170,126.0,551.0,78,585,115.0,650.0,Calibri;Franklin_Gothic;MS_UI_Gothic;Marlett;Segoe_UI_Light
2,007746e4-32b0-4f39-b768-200351c74882,edge,124.0,OS X,Monterey,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",chrome,136,195,166,712,175.0,1100.0,123,6762,125.0,84.0,130.0,78.0,188,403,110,133,100,600,164,183,123,554,78.0,587.0,115,681,0.0,1,1,linux,8,11,136,196,166,712,176,1176,123,6762,125,84,130,78,188.0,403.0,110,134,150,300,100,600,164,183,123.0,554.0,78,588,115.0,681.0,Menlo
3,008d5537-1e12-48a5-a32f-ce831b21c74e,chromium_iphone,unknown,ios,16.2,669,390,"Mozilla/5.0 (iPhone; CPU iPhone OS 16_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/125.0.6422.80 Mobile/15E148 Safari/604.1",chrome,120,195,166,654,147.0,1100.0,116,6656,116.0,84.0,130.0,74.0,150,386,100,133,100,374,166,153,27,446,51.0,,114,460,,1,3,linux,6,3,120,196,167,655,147,1188,116,6656,116,84,130,74,150.0,387.0,100,134,150,300,100,374,166,153,27.0,446.0,51,544,114.0,460.0,Menlo
4,00b748c3-5c91-43d2-b603-ae43d9fd4be2,chrome,118.0,OS X,Sonoma,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",chrome,136,195,166,712,175.0,1100.0,123,6762,125.0,84.0,130.0,78.0,188,403,110,133,100,600,164,183,123,554,78.0,587.0,115,681,0.0,1,1,linux,8,12,136,196,166,712,176,1184,123,6762,125,84,130,78,188.0,403.0,110,134,150,300,100,600,164,183,123.0,554.0,78,588,115.0,681.0,Menlo
5,00deeea6-4055-44c5-9f50-e974686dc419,chrome_android,unknown,android,15.0,820,427,"Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Mobile Safari/537.36",chrome,131,224,168,784,168.0,1100.0,122,6866,125.0,92.0,131.0,76.0,164,454,106,150,100,411,163,170,54,487,69.0,600.0,114,672,0.0,1,3,linux,8,4,132,225,169,785,169,1428,122,6867,126,93,131,77,165.0,455.0,107,150,150,300,100,411,163,170,55.0,487.0,70,619,115.0,672.0,sans-serif-thin
6,00e79661-0a6d-411c-b229-c7a71ca4da3d,firefox,121.0,OS X,Monterey,949,1280,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0,firefox,124,196,309,470,164.0,1100.0,116,6973,124.0,74.0,130.0,74.0,180,400,104,133,100,600,156,164,126,531,56.0,600.0,111,737,0.0,1,1,linux,8,12,125,196,309,470,164,1264,116,6974,124,74,130,74,181.0,400.0,104,134,150,300,100,600,156,165,126.0,532.0,57,602,111.0,738.0,Menlo
7,0144b7d8-0cdd-4841-8438-9c21d4de3426,chrome,114.0,OS X,Big Sur,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",,136,195,166,711,175.0,1100.0,123,6762,125.0,84.0,130.0,78.0,188,403,110,133,100,600,164,182,123,554,78.0,587.0,115,678,0.0,1,1,,8,12,136,196,166,711,176,1184,123,6762,125,84,130,78,188.0,403.0,110,134,150,300,100,600,164,182,123.0,554.0,78,588,115.0,678.0,Menlo
8,019ef33d-bdb3-48b8-ab32-b2a42918802c,chrome,121.0,OS X,Catalina,1013,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",chrome,136,195,166,712,175.0,1100.0,123,6755,125.0,84.0,130.0,78.0,188,403,110,133,100,600,164,183,123,554,78.0,587.0,115,681,0.0,1,1,linux,8,12,136,196,166,712,176,1184,123,6755,125,84,130,78,188.0,403.0,110,134,150,300,100,600,164,183,123.0,554.0,78,588,115.0,681.0,Menlo
9,01eda466-aa28-41a1-a024-555bc9783c77,edge,124.0,OS X,Catalina,1013,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",chrome,136,195,166,712,175.0,1100.0,123,6755,125.0,84.0,130.0,78.0,188,403,110,133,100,600,164,183,123,554,78.0,587.0,115,681,0.0,1,1,linux,8,11,136,196,166,712,176,1176,123,6755,125,84,130,78,188.0,403.0,110,134,150,300,100,600,164,183,123.0,554.0,78,588,115.0,681.0,Menlo


In [87]:
# Vectorize fonts: replace the 'fonts' column with per-font 'css:font_*' columns.
# NOTE: this rebinds df_features and vectorize_fonts drops the 'fonts' column, so
# running this cell a second time fails unless the feature matrix is rebuilt first.
df_features = vectorize_fonts(df_features)



In [88]:
# Clean feature types: coerce env/width/height columns to numeric and fill
# missing values (-1 in numeric columns, '' in object columns)
df_ready = clean_feature_types(df_features)

In [89]:
# Sanity check of the output: print the matrix shape and show its first rows
print(f"Shape of feature matrix: {df_ready.shape}")
display(df_ready.head())

Shape of feature matrix: (932, 119)


Unnamed: 0,session_id,browserstack:browser,browserstack:browser_version,browserstack:os,browserstack:os_version,browserstack:real_height,browserstack:real_width,css:User-Agent,css:browser,css:env-1-height,css:env-1-width,css:env-10-height,css:env-10-width,css:env-11-height,css:env-11-width,css:env-12-height,css:env-12-width,css:env-13-height,css:env-13-width,css:env-14-height,css:env-14-width,css:env-2-height,css:env-2-width,css:env-3-height,css:env-3-width,css:env-5-height,css:env-5-width,css:env-6-height,css:env-6-width,css:env-7-height,css:env-7-width,css:env-8-height,css:env-8-width,css:env-9-height,css:env-9-width,css:image-set-heif,css:javascript,css:px_per_px,css:system,css:viewport_height,css:viewport_width,js:env-1-container-height,js:env-1-container-width,js:env-10-container-height,js:env-10-container-width,js:env-11-container-height,js:env-11-container-width,js:env-12-container-height,js:env-12-container-width,js:env-13-container-height,js:env-13-container-width,js:env-14-container-height,js:env-14-container-width,js:env-2-container-height,js:env-2-container-width,js:env-3-container-height,js:env-3-container-width,js:env-4-container-height,js:env-4-container-width,js:env-5-container-height,js:env-5-container-width,js:env-6-container-height,js:env-6-container-width,js:env-7-container-height,js:env-7-container-width,js:env-8-container-height,js:env-8-container-width,js:env-9-container-height,js:env-9-container-width,css:font_agency_fb,css:font_arabic_typesetting,css:font_arial_unicode_ms,css:font_arno_pro,css:font_avantgarde_bk_bt,css:font_bankgothic_md_bt,css:font_batang,css:font_bitstream_vera_sans_mono,css:font_calibri,css:font_century,css:font_century_gothic,css:font_clarendon,css:font_eurostile,css:font_franklin_gothic,css:font_futura_bk_bt,css:font_futura_md_bt,css:font_gotham,css:font_haettenschweiler,css:font_helv,css:font_helvetica_neue,css:font_humanst521_bt,css:font_letter_gothic,css:font_levenim_mt,css:font_lucida_bright,css:font_lucida_sans,css
:font_marlett,css:font_meiryo_ui,css:font_menlo,css:font_microsoft_uighur,css:font_minion_pro,css:font_monotype_corsiva,css:font_ms_mincho,css:font_ms_outlook,css:font_ms_reference_specialty,css:font_ms_ui_gothic,css:font_mt_extra,css:font_myriad_pro,css:font_pristina,css:font_sans-serif-thin,css:font_scriptina,css:font_segoe_ui_light,css:font_serifa,css:font_simhei,css:font_small_fonts,css:font_staccato222_bt,css:font_trajan_pro,css:font_univers_ce_55_medium,css:font_vrinda,css:font_zwadobef,css:font_vector
0,0041b5e2-06c3-43a8-a837-cc0f23e9d1e9,chrome,135.0 beta,OS X,Sonoma,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",chrome,136,195,166,712,175.0,1100.0,123,6762,125.0,88.0,130.0,78.0,188,403,110,133,100,600,164,183,123,554,78.0,587.0,115,681,0.0,1,1,linux,8,12,136,196,166,712,176,1202,123,6762,125,88,130,78,188.0,403.0,110,134,150,300,100,600,164,183,123.0,554.0,78,588,115.0,681.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0000000000000000000000000001000000000000000000000
1,006ec0f9-9ad5-457d-8d6a-df7958ec7975,chrome,134.0,Windows,11,1012,945,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",chrome,136,190,166,732,175.0,1100.0,126,6970,130.0,86.0,130.0,78.0,189,403,110,133,100,600,164,170,126,550,78.0,585.0,115,650,0.0,1,1,windows,9,9,136,191,166,732,176,1323,126,6971,130,87,130,78,189.0,403.0,110,134,150,300,100,600,164,170,126.0,551.0,78,585,115.0,650.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0000000010000100000000000100000000100000100000000
2,007746e4-32b0-4f39-b768-200351c74882,edge,124.0,OS X,Monterey,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",chrome,136,195,166,712,175.0,1100.0,123,6762,125.0,84.0,130.0,78.0,188,403,110,133,100,600,164,183,123,554,78.0,587.0,115,681,0.0,1,1,linux,8,11,136,196,166,712,176,1176,123,6762,125,84,130,78,188.0,403.0,110,134,150,300,100,600,164,183,123.0,554.0,78,588,115.0,681.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0000000000000000000000000001000000000000000000000
3,008d5537-1e12-48a5-a32f-ce831b21c74e,chromium_iphone,unknown,ios,16.2,669,390,"Mozilla/5.0 (iPhone; CPU iPhone OS 16_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/125.0.6422.80 Mobile/15E148 Safari/604.1",chrome,120,195,166,654,147.0,1100.0,116,6656,116.0,84.0,130.0,74.0,150,386,100,133,100,374,166,153,27,446,51.0,-1.0,114,460,,1,3,linux,6,3,120,196,167,655,147,1188,116,6656,116,84,130,74,150.0,387.0,100,134,150,300,100,374,166,153,27.0,446.0,51,544,114.0,460.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0000000000000000000000000001000000000000000000000
4,00b748c3-5c91-43d2-b603-ae43d9fd4be2,chrome,118.0,OS X,Sonoma,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",chrome,136,195,166,712,175.0,1100.0,123,6762,125.0,84.0,130.0,78.0,188,403,110,133,100,600,164,183,123,554,78.0,587.0,115,681,0.0,1,1,linux,8,12,136,196,166,712,176,1184,123,6762,125,84,130,78,188.0,403.0,110,134,150,300,100,600,164,183,123.0,554.0,78,588,115.0,681.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0000000000000000000000000001000000000000000000000


In [90]:
# Persist the cleaned feature matrix (written as both Parquet and CSV)
save_features(df_ready)

In [91]:
# List all column names of the final feature matrix
print(df_ready.columns)

Index(['session_id', 'browserstack:browser', 'browserstack:browser_version',
       'browserstack:os', 'browserstack:os_version',
       'browserstack:real_height', 'browserstack:real_width', 'css:User-Agent',
       'css:browser', 'css:env-1-height', 'css:env-1-width',
       'css:env-10-height', 'css:env-10-width', 'css:env-11-height',
       'css:env-11-width', 'css:env-12-height', 'css:env-12-width',
       'css:env-13-height', 'css:env-13-width', 'css:env-14-height',
       'css:env-14-width', 'css:env-2-height', 'css:env-2-width',
       'css:env-3-height', 'css:env-3-width', 'css:env-5-height',
       'css:env-5-width', 'css:env-6-height', 'css:env-6-width',
       'css:env-7-height', 'css:env-7-width', 'css:env-8-height',
       'css:env-8-width', 'css:env-9-height', 'css:env-9-width',
       'css:image-set-heif', 'css:javascript', 'css:px_per_px', 'css:system',
       'css:viewport_height', 'css:viewport_width',
       'js:env-1-container-height', 'js:env-1-container-width',
 