In [59]:
import sqlite3
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# CONFIG
# SQLite database file and table name passed to load_sqlite_to_dataframe in the execution section.
DB_PATH = '1_after_cleaning_databases/1000_devices_cleaned.db'
TABLE_NAME = 'css_attribute'
# Base file name (without extension) that save_features uses for the .parquet and .csv outputs.
OUTPUT_NAME = 'feature_matrix'

## PANDAS
# Opt in to pandas' future behaviour of not silently downcasting object results (e.g. after fillna).
pd.set_option('future.no_silent_downcasting', True)
# Display-only settings: wider text output and at most 10 columns shown per frame.
pd.set_option('display.width', 400)
pd.set_option('display.max_columns', 10)

### Functions

#### Loading data from sqlite to pandas dataframe

In [60]:
def load_sqlite_to_dataframe(db_path, table_name):
    """Load every row of one SQLite table into a pandas DataFrame.

    Args:
        db_path: Path of the SQLite database file to open.
        table_name: Name of the table to read. It is interpolated directly
            into the SQL text, so it must come from trusted configuration,
            never from user input.

    Returns:
        A DataFrame holding the result of ``SELECT *`` on the table.

    Raises:
        Whatever ``sqlite3.connect`` or ``pd.read_sql`` raise (for example
        when the table does not exist). The connection is closed before the
        exception propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql(f"SELECT * FROM {table_name}", conn)
    finally:
        # Close even when the query fails so the database handle is not leaked.
        conn.close()

#### Changing boolean values to 1 and 0

In [61]:
def convert_booleans(value):
    """Map the strings 'enabled' / 'disabled' (any letter case) to 1 / 0.

    Every other input, whether another string or a non-string value, is
    returned unchanged.
    """
    # Non-strings have no textual flag to translate.
    if not isinstance(value, str):
        return value

    flag_by_label = {'enabled': 1, 'disabled': 0}
    # Fall back to the original value when the text is not a known flag.
    return flag_by_label.get(value.lower(), value)

#### Constructing feature matrix

In [62]:
def _join_unique_fonts(values):
    """Return one session's distinct non-null font values as a sorted, space-separated string."""
    # Nulls are dropped and the rest cast to str: sorted() cannot order
    # None/NaN against strings, and str.join() rejects non-string items.
    return ' '.join(sorted(set(values.dropna().astype(str))))


def create_feature_matrix(df):
    """Pivot the long attribute table into one row per session.

    Args:
        df: Long-format frame with the columns ``session_id``, ``source``,
            ``attribute`` and ``value`` (one row per recorded attribute).

    Returns:
        A DataFrame with a ``session_id`` column, one ``<source>:<attribute>``
        column per non-font attribute (first non-null value per session, with
        'enabled'/'disabled' already mapped to 1/0 by ``convert_booleans``),
        and a ``value`` column holding that session's distinct fonts joined
        by single spaces (NaN when the session has no font rows).

    Note:
        The result is built from the non-font pivot with a left merge, so a
        session that only has font rows does not appear in the output.
    """
    is_font = df['attribute'] == 'font'

    # Handle fonts: collapse all font rows of a session into one string
    fonts = (
        df[is_font]
        .groupby('session_id')['value']
        .apply(_join_unique_fonts)
        .reset_index()
    )

    # Rest of features: copy so the column assignments below do not touch `df`
    non_fonts = df[~is_font].copy()
    non_fonts['feature_name'] = non_fonts['source'] + ':' + non_fonts['attribute']

    # Apply boolean conversion
    non_fonts['value'] = non_fonts['value'].apply(convert_booleans)

    # Pivot table: 'first' keeps one value when a session repeats a feature
    pivot = non_fonts.pivot_table(index='session_id',
                                  columns='feature_name',
                                  values='value',
                                  aggfunc='first').reset_index()

    # Merge fonts back
    full_df = pivot.merge(fonts, on='session_id', how='left')

    return full_df

#### Changing font values to one-hot encoding

In [63]:
def vectorize_fonts(df, font_column='value'):
    """Replace the space-separated font column with one 0/1 column per font token.

    Args:
        df: Frame containing ``font_column``; missing entries are treated as
            "no fonts".
        font_column: Name of the column holding space-separated font tokens.

    Returns:
        A new DataFrame without ``font_column`` and with one ``font_<token>``
        indicator column per distinct token. Tokens are lowercased by
        ``CountVectorizer``. When no row contains any token, the frame is
        returned with ``font_column`` dropped and no indicator columns added.
    """
    font_text = df[font_column].fillna('').astype(str)

    # CountVectorizer raises ValueError ("empty vocabulary") when no document
    # contains a token, so a frame without any fonts simply loses the column.
    if not font_text.str.contains(r'[^ ]', regex=True).any():
        return df.drop(columns=[font_column])

    # binary=True caps every cell at 1. Plain counts exceed 1 when a token
    # repeats in a row (e.g. names differing only by case after lowercasing),
    # which would not be a one-hot encoding.
    vectorizer = CountVectorizer(token_pattern=r'[^ ]+', binary=True)
    font_features = vectorizer.fit_transform(font_text)

    font_df = pd.DataFrame(
        font_features.toarray(),
        columns=[f'font_{f}' for f in vectorizer.get_feature_names_out()],
        index=df.index  # same index as df so the join below aligns row by row
    )

    df = df.drop(columns=[font_column]).join(font_df)
    return df

#### Cleaning feature types

In [64]:
def clean_feature_types(df):
    """Coerce measurement-like columns to numbers and fill missing values.

    Columns whose name starts with ``css:env`` or ``js:env``, or contains
    ``width`` or ``height``, are converted with ``pd.to_numeric``; values that
    cannot be parsed become missing. Afterwards missing values are replaced
    by -1 in numeric columns and by '' in object columns.

    Args:
        df: Feature matrix to clean. It is not modified.

    Returns:
        A cleaned copy of ``df``.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # TODO: revisit when more attributes are added; this name-based rule is not very universal.
    for col in df.columns:
        # Only string labels can be matched against the name patterns.
        if not isinstance(col, str):
            continue
        if col.startswith(('css:env', 'js:env')) or 'width' in col or 'height' in col:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Fill missing numeric with -1
    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].fillna(-1)

    # Fill missing non-numeric with empty string
    object_cols = df.select_dtypes(include='object').columns
    df[object_cols] = df[object_cols].fillna('')

    df = df.infer_objects()

    return df

#### Save dataframe to Parquet and CSV

In [65]:
def save_features(df, output_dir='2_after_feature_extraction', output_name=None):
    """Write the feature matrix to disk as both Parquet and CSV.

    Args:
        df: DataFrame to persist.
        output_dir: Directory receiving the files; created if it is missing.
        output_name: Base file name without extension. Defaults to the
            module-level ``OUTPUT_NAME``.

    The files are ``<output_dir>/<output_name>.parquet`` and
    ``<output_dir>/<output_name>.csv``; the CSV is written without the index.
    """
    name = OUTPUT_NAME if output_name is None else output_name
    target_dir = Path(output_dir)

    # The pandas writers do not create missing directories, so a fresh
    # checkout would otherwise fail here.
    target_dir.mkdir(parents=True, exist_ok=True)

    df.to_parquet(target_dir / f'{name}.parquet')
    df.to_csv(target_dir / f'{name}.csv', index=False)

### Execution

In [66]:
# Load data: read the whole configured table from the SQLite database
df_raw = load_sqlite_to_dataframe(DB_PATH, TABLE_NAME)

In [67]:
# Create feature matrix: pivot the long attribute rows into one row per session_id
df_features = create_feature_matrix(df_raw)

In [68]:
# Vectorize fonts: replace the joined font string column with per-font columns.
# NOTE: not idempotent. vectorize_fonts drops the 'value' column it reads, so
# re-running this cell without re-running the previous one fails on the missing column.
df_features = vectorize_fonts(df_features)

In [69]:
# Clean feature types: coerce numeric-looking columns and fill missing values
df_ready = clean_feature_types(df_features)

In [70]:
# Sanity check: print the matrix shape and preview the first rows
print(f"Shape of feature matrix: {df_ready.shape}")
display(df_ready.head())

Shape of feature matrix: (932, 138)


Unnamed: 0,session_id,browserstack:browser,browserstack:browser_version,browserstack:os,browserstack:os_version,...,font_unicode,font_univers,font_vera,font_vrinda,font_zwadobef
0,0041b5e2-06c3-43a8-a837-cc0f23e9d1e9,chrome,135.0 beta,OS X,Sonoma,...,0,0,0,0,0
1,006ec0f9-9ad5-457d-8d6a-df7958ec7975,chrome,134.0,Windows,11,...,0,0,0,0,0
2,007746e4-32b0-4f39-b768-200351c74882,edge,124.0,OS X,Monterey,...,0,0,0,0,0
3,008d5537-1e12-48a5-a32f-ce831b21c74e,chromium_iphone,unknown,ios,16.2,...,0,0,0,0,0
4,00b748c3-5c91-43d2-b603-ae43d9fd4be2,chrome,118.0,OS X,Sonoma,...,0,0,0,0,0


In [71]:
# Persist the final feature matrix as Parquet and CSV
save_features(df_ready)