In [46]:
import sqlite3

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

# CONFIG
DB_PATH = '1_after_cleaning_databases/1000_devices_cleaned.db'
TABLE_NAME = 'css_attribute'
OUTPUT_NAME = 'feature_matrix'

## PANDAS
pd.set_option('future.no_silent_downcasting', True)
# Set pandas options to display the full DataFrame
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Disable line wrapping
pd.set_option('display.max_colwidth', None)  # Show full content in each column
pd.set_option('display.max_seq_item', 100000)  # Show all items in a sequence (e.g., lists, dictionaries)

# Optionally, increase the display buffer size
pd.set_option('display.max_info_columns', 100000)  # Show all columns in df.info()

# If you have very large dataframes, increasing this might help
pd.set_option('display.max_rows', 1000)  # You can change 1000 to a higher value if needed

### Functions

#### Loading data from sqlite to pandas dataframe

In [47]:
def load_sqlite_to_dataframe(db_path, table_name):
    conn = sqlite3.connect(db_path)
    df = pd.read_sql(f"SELECT * FROM {table_name}", conn)
    conn.close()
    return df

#### Changing boolean values to 1 and 0

In [48]:
def convert_booleans(value):
    true_values = {'enabled'}
    false_values = {'disabled'}

    if isinstance(value, str):
        val_lower = value.lower()
        if val_lower in true_values:
            return 1
        elif val_lower in false_values:
            return 0
    return value

#### Constructing feature matrix

In [49]:
def create_feature_matrix(df):
    # Handle fonts
    fonts = (
        df[df['attribute'] == 'font']
        .groupby('session_id')['value']
        .apply(lambda x: ' '.join(sorted(set(x))))
        .reset_index()
    )

    # Rest of features
    non_fonts = df[df['attribute'] != 'font'].copy()
    non_fonts['feature_name'] = non_fonts['source'] + ':' + non_fonts['attribute']

    # Apply boolean conversion
    non_fonts['value'] = non_fonts['value'].apply(convert_booleans)

    # Pivot table
    pivot = non_fonts.pivot_table(index='session_id',
                                  columns='feature_name',
                                  values='value',
                                  aggfunc='first').reset_index()

    # Merge fonts back
    full_df = pivot.merge(fonts, on='session_id', how='left')

    return full_df

#### Changing font values to one-hot encoding

In [50]:
def vectorize_fonts(df, font_column='value'):
    vectorizer = CountVectorizer(token_pattern=r'[^ ]+')
    font_features = vectorizer.fit_transform(df[font_column].fillna(''))

    font_df = pd.DataFrame(
        font_features.toarray(),
        columns=[f'css:font_{f}' for f in vectorizer.get_feature_names_out()],
        index=df.index
    )

    df = df.drop(columns=[font_column]).join(font_df)
    return df

#### Cleaning feature types

In [51]:
def clean_feature_types(df):
    for col in df.columns:
        # todo bedzie trzeba na to spojrzec gdy dojda kolejne atrybuty bo jest to takie niezbyt uniwersalne
        if col.startswith('css:env') or col.startswith('js:env') or 'width' in col or 'height' in col:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Fill missing numeric with -1
    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].fillna(-1)

    # Fill missing non-numeric with empty string
    object_cols = df.select_dtypes(include='object').columns
    df[object_cols] = df[object_cols].fillna('')

    df = df.infer_objects()

    return df

#### Save datagframe to parquet

In [52]:
def save_features(df):
    df.to_parquet(f'2_after_feature_extraction/{OUTPUT_NAME}.parquet')
    df.to_csv(f'2_after_feature_extraction/{OUTPUT_NAME}.csv', index=False)

### Execution

In [53]:
# Load data
df_raw = load_sqlite_to_dataframe(DB_PATH, TABLE_NAME)

In [54]:
# Create feature matrix
df_features = create_feature_matrix(df_raw)

In [55]:
# Vectorize fonts
df_features = vectorize_fonts(df_features)

In [56]:
# Clean feature types
df_ready = clean_feature_types(df_features)

In [57]:
# test Output
print(f"Shape of feature matrix: {df_ready.shape}")
display(df_ready.head())

Shape of feature matrix: (932, 138)


Unnamed: 0,session_id,browserstack:browser,browserstack:browser_version,browserstack:os,browserstack:os_version,browserstack:real_height,browserstack:real_width,css:User-Agent,css:browser,css:env-1-height,css:env-1-width,css:env-10-height,css:env-10-width,css:env-11-height,css:env-11-width,css:env-12-height,css:env-12-width,css:env-13-height,css:env-13-width,css:env-14-height,css:env-14-width,css:env-2-height,css:env-2-width,css:env-3-height,css:env-3-width,css:env-5-height,css:env-5-width,css:env-6-height,css:env-6-width,css:env-7-height,css:env-7-width,css:env-8-height,css:env-8-width,css:env-9-height,css:env-9-width,css:image-set-heif,css:javascript,css:px_per_px,css:system,css:viewport_height,css:viewport_width,js:env-1-container-height,js:env-1-container-width,js:env-10-container-height,js:env-10-container-width,js:env-11-container-height,js:env-11-container-width,js:env-12-container-height,js:env-12-container-width,js:env-13-container-height,js:env-13-container-width,js:env-14-container-height,js:env-14-container-width,js:env-2-container-height,js:env-2-container-width,js:env-3-container-height,js:env-3-container-width,js:env-4-container-height,js:env-4-container-width,js:env-5-container-height,js:env-5-container-width,js:env-6-container-height,js:env-6-container-width,js:env-7-container-height,js:env-7-container-width,js:env-8-container-height,js:env-8-container-width,js:env-9-container-height,js:env-9-container-width,css:font_55,css:font_agency,css:font_arabic,css:font_arial,css:font_arno,css:font_avantgarde,css:font_bankgothic,css:font_batang,css:font_bitstream,css:font_bk,css:font_bright,css:font_bt,css:font_calibri,css:font_ce,css:font_century,css:font_clarendon,css:font_corsiva,css:font_eurostile,css:font_extra,css:font_fb,css:font_fonts,css:font_franklin,css:font_futura,css:font_gotham,css:font_gothic,css:font_haettenschweiler,css:font_helv,css:font_helvetica,css:font_humanst521,css:font_letter,css:font_levenim,css:font_light,css:font_lucida,css:font_marlett,css:font_md,css:font_medium,css:font_meiryo,css:font_menlo,css:font_microsoft,css:font_mincho,css:font_minion,css:font_mono,css:font_monotype,css:font_ms,css:font_mt,css:font_myriad,css:font_neue,css:font_outlook,css:font_pristina,css:font_pro,css:font_reference,css:font_sans,css:font_sans-serif-thin,css:font_scriptina,css:font_segoe,css:font_serifa,css:font_simhei,css:font_small,css:font_specialty,css:font_staccato222,css:font_trajan,css:font_typesetting,css:font_ui,css:font_uighur,css:font_unicode,css:font_univers,css:font_vera,css:font_vrinda,css:font_zwadobef
0,0041b5e2-06c3-43a8-a837-cc0f23e9d1e9,chrome,135.0 beta,OS X,Sonoma,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",chrome,136,195,166,712,175.0,1100.0,123,6762,125.0,88.0,130.0,78.0,188,403,110,133,100,600,164,183,123,554,78.0,587.0,115,681,0.0,1,1,linux,8,12,136,196,166,712,176,1202,123,6762,125,88,130,78,188.0,403.0,110,134,150,300,100,600,164,183,123.0,554.0,78,588,115.0,681.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,006ec0f9-9ad5-457d-8d6a-df7958ec7975,chrome,134.0,Windows,11,1012,945,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",chrome,136,190,166,732,175.0,1100.0,126,6970,130.0,86.0,130.0,78.0,189,403,110,133,100,600,164,170,126,550,78.0,585.0,115,650,0.0,1,1,windows,9,9,136,191,166,732,176,1323,126,6971,130,87,130,78,189.0,403.0,110,134,150,300,100,600,164,170,126.0,551.0,78,585,115.0,650.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0
2,007746e4-32b0-4f39-b768-200351c74882,edge,124.0,OS X,Monterey,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",chrome,136,195,166,712,175.0,1100.0,123,6762,125.0,84.0,130.0,78.0,188,403,110,133,100,600,164,183,123,554,78.0,587.0,115,681,0.0,1,1,linux,8,11,136,196,166,712,176,1176,123,6762,125,84,130,78,188.0,403.0,110,134,150,300,100,600,164,183,123.0,554.0,78,588,115.0,681.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,008d5537-1e12-48a5-a32f-ce831b21c74e,chromium_iphone,unknown,ios,16.2,669,390,"Mozilla/5.0 (iPhone; CPU iPhone OS 16_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/125.0.6422.80 Mobile/15E148 Safari/604.1",chrome,120,195,166,654,147.0,1100.0,116,6656,116.0,84.0,130.0,74.0,150,386,100,133,100,374,166,153,27,446,51.0,-1.0,114,460,,1,3,linux,6,3,120,196,167,655,147,1188,116,6656,116,84,130,74,150.0,387.0,100,134,150,300,100,374,166,153,27.0,446.0,51,544,114.0,460.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,00b748c3-5c91-43d2-b603-ae43d9fd4be2,chrome,118.0,OS X,Sonoma,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",chrome,136,195,166,712,175.0,1100.0,123,6762,125.0,84.0,130.0,78.0,188,403,110,133,100,600,164,183,123,554,78.0,587.0,115,681,0.0,1,1,linux,8,12,136,196,166,712,176,1184,123,6762,125,84,130,78,188.0,403.0,110,134,150,300,100,600,164,183,123.0,554.0,78,588,115.0,681.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [58]:
save_features(df_ready)

In [59]:
# list all columns
print(df_ready.columns)

Index(['session_id', 'browserstack:browser', 'browserstack:browser_version',
       'browserstack:os', 'browserstack:os_version',
       'browserstack:real_height', 'browserstack:real_width', 'css:User-Agent',
       'css:browser', 'css:env-1-height', 'css:env-1-width',
       'css:env-10-height', 'css:env-10-width', 'css:env-11-height',
       'css:env-11-width', 'css:env-12-height', 'css:env-12-width',
       'css:env-13-height', 'css:env-13-width', 'css:env-14-height',
       'css:env-14-width', 'css:env-2-height', 'css:env-2-width',
       'css:env-3-height', 'css:env-3-width', 'css:env-5-height',
       'css:env-5-width', 'css:env-6-height', 'css:env-6-width',
       'css:env-7-height', 'css:env-7-width', 'css:env-8-height',
       'css:env-8-width', 'css:env-9-height', 'css:env-9-width',
       'css:image-set-heif', 'css:javascript', 'css:px_per_px', 'css:system',
       'css:viewport_height', 'css:viewport_width',
       'js:env-1-container-height', 'js:env-1-container-width',
 