In [1]:
import sqlite3

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

# CONFIG
DB_PATH = '1_after_cleaning_databases/1000_devices_media_cleaned.db'
TABLE_NAME = 'css_attribute'
OUTPUT_NAME = 'feature_matrix_media'

## PANDAS
pd.set_option('future.no_silent_downcasting', True)
# Set pandas options to display the full DataFrame
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Disable line wrapping
pd.set_option('display.max_colwidth', None)  # Show full content in each column
pd.set_option('display.max_seq_item', 100000)  # Show all items in a sequence (e.g., lists, dictionaries)

# Optionally, increase the display buffer size
pd.set_option('display.max_info_columns', 100000)  # Show all columns in df.info()

# If you have very large dataframes, increasing this might help
pd.set_option('display.max_rows', 1000)  # You can change 1000 to a higher value if needed

### Functions

#### Loading data from sqlite to pandas dataframe

In [2]:
def load_sqlite_to_dataframe(db_path, table_name):
    """Read every row of one SQLite table into a pandas DataFrame.

    Parameters
    ----------
    db_path : str
        Path of the SQLite database file to open.
    table_name : str
        Name of the table to read. It is interpolated directly into the SQL
        text (identifiers cannot be bound as query parameters), so it must
        come from trusted configuration, never from user input.

    Returns
    -------
    pandas.DataFrame
        The result of ``SELECT *`` on the table.

    Raises
    ------
    Exception
        Whatever ``pd.read_sql`` raises (e.g. when the table does not exist);
        the connection is closed before the error propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql(f"SELECT * FROM {table_name}", conn)
    finally:
        # Close the connection even when the query fails so it is not leaked.
        conn.close()

#### Changing boolean values to 1 and 0

In [3]:
def convert_booleans(value):
    """Map the strings 'enabled' / 'disabled' (any letter case) to 1 / 0.

    Any other value, including non-string values, is returned unchanged.
    """
    # Only strings can carry the textual flags.
    if not isinstance(value, str):
        return value

    flag_codes = {'enabled': 1, 'disabled': 0}
    # Fall back to the original (not lower-cased) value when it is not a flag.
    return flag_codes.get(value.lower(), value)

#### Constructing feature matrix

In [4]:
def create_feature_matrix(df):
    """Pivot long-format attribute rows into one row per session.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data; the columns ``session_id``, ``source``,
        ``attribute`` and ``value`` are read.

    Returns
    -------
    pandas.DataFrame
        One row per ``session_id`` that has at least one non-font attribute.
        Each non-font attribute becomes a ``"<source>:<attribute>"`` column,
        and a ``fonts`` column holds the session's distinct font names,
        sorted, joined with ``';'`` and with spaces replaced by underscores
        (missing when the session has no fonts).
    """
    # Fonts: collapse all font rows of a session into one ';'-separated string.
    # Null font values are dropped first; otherwise sorting/joining a mix of
    # strings and missing values raises TypeError.
    font_rows = df[df['attribute'] == 'font'].dropna(subset=['value'])
    fonts = (
        font_rows
        .groupby('session_id')['value']
        .apply(lambda names: ';'.join(sorted(set(names))))
        .reset_index()
    )

    # Replace spaces in font names with underscores.
    fonts['value'] = fonts['value'].str.replace(' ', '_', regex=False)

    # Every other attribute becomes its own "<source>:<attribute>" feature.
    non_fonts = df[df['attribute'] != 'font'].copy()
    non_fonts['feature_name'] = non_fonts['source'] + ':' + non_fonts['attribute']

    # 'enabled' / 'disabled' strings become 1 / 0.
    non_fonts['value'] = non_fonts['value'].apply(convert_booleans)

    # One row per session; when a feature occurs more than once in a session
    # the first value is kept.
    pivot = non_fonts.pivot_table(index='session_id',
                                  columns='feature_name',
                                  values='value',
                                  aggfunc='first').reset_index()

    # Attach the font string; sessions without fonts get a missing value.
    return (
        pivot
        .merge(fonts, on='session_id', how='left')
        .rename(columns={'value': 'fonts'})
    )

#### Changing font values to one-hot encoding

In [5]:
def vectorize_fonts(df, font_column='fonts'):
    """Replace the ';'-separated font column with one 0/1 column per font.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``font_column``.
    font_column : str, default 'fonts'
        Column holding ';'-separated font names; missing values are treated
        as "no fonts".

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``font_column``, plus one ``css:font_<name>`` indicator
        column per distinct font and a ``css:font_vector`` column holding the
        row's indicators concatenated into a single string. Font names are
        lower-cased by CountVectorizer's default settings.
    """
    def split_fonts(text):
        # Tokens are the non-empty, stripped pieces between ';' separators.
        return [token.strip() for token in text.split(';') if token.strip()]

    vectorizer = CountVectorizer(
        tokenizer=split_fonts,
        # The default token_pattern is unused with a custom tokenizer; passing
        # None avoids the "token_pattern will not be used" warning.
        token_pattern=None,
        # Emit presence flags (0/1) rather than counts, so the columns are a
        # true one-hot encoding and css:font_vector has one digit per font.
        binary=True,
    )
    font_features = vectorizer.fit_transform(df[font_column].fillna(''))

    font_df = pd.DataFrame(
        font_features.toarray(),
        columns=[f'css:font_{f}' for f in vectorizer.get_feature_names_out()],
        index=df.index
    )

    # Concatenate the row's flags into one string.
    font_df['css:font_vector'] = font_df.astype(str).agg(''.join, axis=1)

    return df.drop(columns=[font_column]).join(font_df)

#### Cleaning feature types

In [6]:
def clean_feature_types(df):
    """Coerce measurement columns to numbers and fill missing values.

    Columns whose name starts with ``css:env``, ``css:media`` or ``js:env``,
    or contains ``width`` or ``height``, are converted with ``pd.to_numeric``;
    values that cannot be parsed become missing. Missing values are then
    replaced by -1 in numeric columns and by '' in object columns.

    The input frame is not modified; a cleaned copy is returned.

    NOTE(review): object columns that hold numbers (e.g. 1/0 produced from
    'enabled'/'disabled') together with missing values end up mixing '' with
    numbers - confirm this is intended before writing such columns to parquet.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix with string column labels.

    Returns
    -------
    pandas.DataFrame
    """
    numeric_prefixes = ('css:env', 'css:media', 'js:env')
    numeric_markers = ('width', 'height')

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    for col in df.columns:
        # TODO: revisit when more attributes are added; this name-based
        # matching is not very universal.
        if col.startswith(numeric_prefixes) or any(marker in col for marker in numeric_markers):
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Fill missing numeric with -1
    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].fillna(-1)

    # Fill missing non-numeric with empty string
    object_cols = df.select_dtypes(include='object').columns
    df[object_cols] = df[object_cols].fillna('')

    # Let pandas pick tighter dtypes for object columns where it can.
    df = df.infer_objects()

    return df

#### Save dataframe to parquet and CSV

In [7]:
def save_features(df, output_dir='2_after_feature_extraction', output_name=None):
    """Write the feature matrix to ``<output_dir>/<output_name>.parquet`` and ``.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    output_dir : str or path-like, default '2_after_feature_extraction'
        Target directory; created (including parents) when it does not exist.
    output_name : str, optional
        Base file name without extension. Defaults to the module-level
        ``OUTPUT_NAME``.

    The CSV is written without the index column.
    """
    from pathlib import Path

    if output_name is None:
        output_name = OUTPUT_NAME

    target_dir = Path(output_dir)
    # Avoid failing on a fresh checkout where the output folder is missing.
    target_dir.mkdir(parents=True, exist_ok=True)

    df.to_parquet(target_dir / f'{output_name}.parquet')
    df.to_csv(target_dir / f'{output_name}.csv', index=False)

### Execution

In [8]:
# Load every row of TABLE_NAME from the SQLite database at DB_PATH
df_raw = load_sqlite_to_dataframe(DB_PATH, TABLE_NAME)

In [9]:
# Create feature matrix: one row per session, one column per "<source>:<attribute>",
# plus a 'fonts' column with the session's ';'-joined font names
df_features = create_feature_matrix(df_raw)

In [10]:
# Preview only the first rows; a bare `df_features` would render the whole frame
df_features.head(10)

Unnamed: 0,session_id,browserstack:browser,browserstack:browser_version,browserstack:os,browserstack:os_version,browserstack:real_height,browserstack:real_width,css:User-Agent,css:browser,css:env-1-height,css:env-1-width,css:env-10-height,css:env-10-width,css:env-11-height,css:env-11-width,css:env-12-height,css:env-12-width,css:env-13-height,css:env-13-width,css:env-14-height,css:env-14-width,css:env-2-height,css:env-2-width,css:env-3-height,css:env-3-width,css:env-5-height,css:env-5-width,css:env-6-height,css:env-6-width,css:env-7-height,css:env-7-width,css:env-8-height,css:env-8-width,css:env-9-height,css:env-9-width,css:image-set-heif,css:javascript,css:media-1-width,css:media-10-width,css:media-2-width,css:media-3-width,css:media-4-width,css:media-5-width,css:media-6-width,css:media-7-width,css:media-8-width,css:media-9-width,css:px_per_px,css:system,css:viewport_height,css:viewport_width,js:env-1-container-height,js:env-1-container-width,js:env-10-container-height,js:env-10-container-width,js:env-11-container-height,js:env-11-container-width,js:env-12-container-height,js:env-12-container-width,js:env-13-container-height,js:env-13-container-width,js:env-14-container-height,js:env-14-container-width,js:env-2-container-height,js:env-2-container-width,js:env-3-container-height,js:env-3-container-width,js:env-4-container-height,js:env-4-container-width,js:env-5-container-height,js:env-5-container-width,js:env-6-container-height,js:env-6-container-width,js:env-7-container-height,js:env-7-container-width,js:env-8-container-height,js:env-8-container-width,js:env-9-container-height,js:env-9-container-width,fonts
0,002fc90c-b5a1-4db9-a469-3e31ef506b36,edge,105.0,OS X,Big Sur,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.25",chrome,136,195,166,711,175,1100,123,6762,125,84,130,78,188,403,110,133,100,600,164,182,123,554,78,587,115,678,,1,32,3,17,128,0,68,20,1,8,5,1,linux,8,12.0,136,196,166,711,176.0,1184.0,123,6762,125,84,130,78,188,403,110,134,150,300,100,600,164.0,182.0,123,554,78.0,588.0,115,678,Menlo
1,00964d52-4547-49f6-8161-42c9f5b85fb6,firefox,122.0,OS X,Monterey,949,1280,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:122.0) Gecko/20100101 Firefox/122.0,firefox,124,196,309,470,164,1100,116,6973,124,74,130,74,180,400,104,133,100,600,156,164,126,531,56,600,111,737,0.0,1,32,7,17,128,32,64,20,1,14,5,1,linux,8,12.0,125,196,309,470,164.0,1264.0,116,6974,124,74,130,74,181,400,104,134,150,300,100,600,156.0,165.0,126,532,57.0,602.0,111,738,Menlo
2,00ca66db-dcc8-4e63-a1ec-358c641e5d62,edge,124.0,OS X,Ventura,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",chrome,136,195,166,712,175,1100,123,6762,125,84,130,78,188,403,110,133,100,600,164,183,123,554,78,587,115,681,0.0,1,32,7,17,128,32,68,20,1,8,5,1,linux,8,11.0,136,196,166,712,176.0,1176.0,123,6762,125,84,130,78,188,403,110,134,150,300,100,600,164.0,183.0,123,554,78.0,588.0,115,681,Menlo
3,00dbd708-2007-45ee-affe-44af226d935f,chrome_android,unknown,android,10.0,727,393,"Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Mobile Safari/537.36",chrome,130,224,169,785,167,1100,121,6866,126,85,131,76,163,454,105,150,100,377,163,169,51,487,66,600,115,671,0.0,1,32,4,49,128,32,68,41,2,8,5,3,linux,7,3.0,130,225,169,785,168.0,1436.0,121,6867,126,86,131,77,163,455,105,150,150,300,100,377,163.0,169.0,52,488,67.0,619.0,115,671,sans-serif-thin
4,00e9af93-70c0-4a67-b38d-41e488a63f15,chrome_android,unknown,android,11.0,1037,753,"Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",chrome,130,224,168,784,167,1100,120,7125,126,90,130,77,182,468,105,150,100,600,163,170,109,637,67,600,114,672,0.0,1,32,7,17,128,32,68,36,3,8,5,3,linux,11,7.0,130,225,168,784,167.0,1429.0,120,7125,126,90,131,77,182,468,106,150,150,300,100,600,163.0,170.0,109,637,68.0,646.0,114,673,sans-serif-thin
5,00fdffc9-f32f-4143-8eb5-c86c5da72a25,edge,109.0,OS X,High Sierra,1013,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.49",chrome,136,195,166,711,175,1100,123,6763,125,84,130,78,188,403,110,133,100,600,164,181,123,554,78,587,115,678,,1,32,3,17,128,0,68,20,1,8,5,1,linux,8,12.0,136,196,166,711,176.0,1169.0,123,6764,125,84,130,78,188,403,110,134,150,300,100,600,164.0,181.0,123,554,78.0,588.0,115,678,Menlo
6,012d4223-904f-4375-a9b0-1f0237c05b03,edge,132.0,Windows,11,1012,945,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0",chrome,136,190,166,730,175,1100,126,6970,130,81,130,78,189,403,110,133,100,600,164,168,126,550,78,585,115,650,0.0,1,32,7,17,128,32,68,20,1,8,5,1,windows,8,9.0,136,191,166,730,176.0,1323.0,126,6971,130,81,130,78,189,403,110,134,150,300,100,600,164.0,168.0,126,551,78.0,585.0,115,650,Calibri;Franklin_Gothic;MS_UI_Gothic;Marlett;Segoe_UI_Light
7,01334116-382c-4bc3-8662-9d4fe217d955,firefox,121.0,OS X,Ventura,949,1280,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0,firefox,124,196,309,470,164,1100,120,7108,124,74,130,74,180,400,104,133,100,600,156,164,126,531,56,600,111,737,0.0,1,32,7,17,128,32,64,20,1,14,5,1,linux,8,12.0,125,196,309,470,164.0,1264.0,120,7108,124,74,130,74,181,400,104,134,150,300,100,600,156.0,165.0,126,532,57.0,602.0,111,738,Menlo
8,017a63c7-7321-45c0-83f1-53dba5ea049c,firefox,117.0,OS X,Sonoma,949,1280,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/117.0,firefox,124,196,309,470,164,1100,120,7108,124,74,130,74,180,400,104,133,100,600,156,164,126,531,56,600,111,738,0.0,1,32,7,17,128,32,64,20,1,14,5,1,linux,8,12.0,125,196,309,470,164.0,1264.0,120,7108,124,74,130,74,181,400,104,134,150,300,100,600,156.0,165.0,126,532,57.0,602.0,111,738,Menlo
9,017d09c4-da05-44c2-90f2-ffdf0133dea8,edge,128.0,OS X,Monterey,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",chrome,136,195,166,712,175,1100,123,6762,125,84,130,78,188,403,110,133,100,600,164,183,123,554,78,587,115,681,0.0,1,32,7,17,128,32,68,20,1,8,5,1,linux,8,11.0,136,196,166,712,176.0,1202.0,123,6762,125,84,130,78,188,403,110,134,150,300,100,600,164.0,183.0,123,554,78.0,588.0,115,681,Menlo


In [11]:
# Vectorize fonts: replace the 'fonts' column with per-font indicator columns.
# NOTE: this cell rebinds df_features, so it is not re-runnable on its own -
# vectorize_fonts drops the 'fonts' column it reads, and a second run fails.
# Re-run the "Create feature matrix" cell first.
df_features = vectorize_fonts(df_features)



In [12]:
# Clean feature types: coerce measurement columns to numbers and fill missing values
df_ready = clean_feature_types(df_features)

In [13]:
# Sanity check: print the matrix dimensions and show the first rows
print(f"Shape of feature matrix: {df_ready.shape}")
display(df_ready.head())

Shape of feature matrix: (918, 95)


Unnamed: 0,session_id,browserstack:browser,browserstack:browser_version,browserstack:os,browserstack:os_version,browserstack:real_height,browserstack:real_width,css:User-Agent,css:browser,css:env-1-height,css:env-1-width,css:env-10-height,css:env-10-width,css:env-11-height,css:env-11-width,css:env-12-height,css:env-12-width,css:env-13-height,css:env-13-width,css:env-14-height,css:env-14-width,css:env-2-height,css:env-2-width,css:env-3-height,css:env-3-width,css:env-5-height,css:env-5-width,css:env-6-height,css:env-6-width,css:env-7-height,css:env-7-width,css:env-8-height,css:env-8-width,css:env-9-height,css:env-9-width,css:image-set-heif,css:javascript,css:media-1-width,css:media-10-width,css:media-2-width,css:media-3-width,css:media-4-width,css:media-5-width,css:media-6-width,css:media-7-width,css:media-8-width,css:media-9-width,css:px_per_px,css:system,css:viewport_height,css:viewport_width,js:env-1-container-height,js:env-1-container-width,js:env-10-container-height,js:env-10-container-width,js:env-11-container-height,js:env-11-container-width,js:env-12-container-height,js:env-12-container-width,js:env-13-container-height,js:env-13-container-width,js:env-14-container-height,js:env-14-container-width,js:env-2-container-height,js:env-2-container-width,js:env-3-container-height,js:env-3-container-width,js:env-4-container-height,js:env-4-container-width,js:env-5-container-height,js:env-5-container-width,js:env-6-container-height,js:env-6-container-width,js:env-7-container-height,js:env-7-container-width,js:env-8-container-height,js:env-8-container-width,js:env-9-container-height,js:env-9-container-width,css:font_arabic_typesetting,css:font_batang,css:font_calibri,css:font_franklin_gothic,css:font_levenim_mt,css:font_marlett,css:font_meiryo_ui,css:font_menlo,css:font_microsoft_uighur,css:font_ms_mincho,css:font_ms_ui_gothic,css:font_sans-serif-thin,css:font_segoe_ui_light,css:font_simhei,css:font_vrinda,css:font_vector
0,002fc90c-b5a1-4db9-a469-3e31ef506b36,edge,105.0,OS X,Big Sur,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.25",chrome,136,195,166,711,175,1100,123,6762,125,84,130,78,188,403,110,133,100,600,164,182,123,554,78,587,115,678,,1,32,3,17,128,0,68,20,1,8,5,1,linux,8,12.0,136,196,166,711,176.0,1184.0,123,6762,125,84,130,78,188,403,110,134,150,300,100,600,164.0,182.0,123,554,78.0,588.0,115,678,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,10000000
1,00964d52-4547-49f6-8161-42c9f5b85fb6,firefox,122.0,OS X,Monterey,949,1280,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:122.0) Gecko/20100101 Firefox/122.0,firefox,124,196,309,470,164,1100,116,6973,124,74,130,74,180,400,104,133,100,600,156,164,126,531,56,600,111,737,0.0,1,32,7,17,128,32,64,20,1,14,5,1,linux,8,12.0,125,196,309,470,164.0,1264.0,116,6974,124,74,130,74,181,400,104,134,150,300,100,600,156.0,165.0,126,532,57.0,602.0,111,738,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,10000000
2,00ca66db-dcc8-4e63-a1ec-358c641e5d62,edge,124.0,OS X,Ventura,1011,1200,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",chrome,136,195,166,712,175,1100,123,6762,125,84,130,78,188,403,110,133,100,600,164,183,123,554,78,587,115,681,0.0,1,32,7,17,128,32,68,20,1,8,5,1,linux,8,11.0,136,196,166,712,176.0,1176.0,123,6762,125,84,130,78,188,403,110,134,150,300,100,600,164.0,183.0,123,554,78.0,588.0,115,681,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,10000000
3,00dbd708-2007-45ee-affe-44af226d935f,chrome_android,unknown,android,10.0,727,393,"Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Mobile Safari/537.36",chrome,130,224,169,785,167,1100,121,6866,126,85,131,76,163,454,105,150,100,377,163,169,51,487,66,600,115,671,0.0,1,32,4,49,128,32,68,41,2,8,5,3,linux,7,3.0,130,225,169,785,168.0,1436.0,121,6867,126,86,131,77,163,455,105,150,150,300,100,377,163.0,169.0,52,488,67.0,619.0,115,671,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1000
4,00e9af93-70c0-4a67-b38d-41e488a63f15,chrome_android,unknown,android,11.0,1037,753,"Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",chrome,130,224,168,784,167,1100,120,7125,126,90,130,77,182,468,105,150,100,600,163,170,109,637,67,600,114,672,0.0,1,32,7,17,128,32,68,36,3,8,5,3,linux,11,7.0,130,225,168,784,167.0,1429.0,120,7125,126,90,131,77,182,468,106,150,150,300,100,600,163.0,170.0,109,637,68.0,646.0,114,673,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1000


In [14]:
# Write the cleaned feature matrix to disk (parquet and CSV)
save_features(df_ready)

In [15]:
# List all column names of the final feature matrix
print(df_ready.columns)

Index(['session_id', 'browserstack:browser', 'browserstack:browser_version',
       'browserstack:os', 'browserstack:os_version',
       'browserstack:real_height', 'browserstack:real_width', 'css:User-Agent',
       'css:browser', 'css:env-1-height', 'css:env-1-width',
       'css:env-10-height', 'css:env-10-width', 'css:env-11-height',
       'css:env-11-width', 'css:env-12-height', 'css:env-12-width',
       'css:env-13-height', 'css:env-13-width', 'css:env-14-height',
       'css:env-14-width', 'css:env-2-height', 'css:env-2-width',
       'css:env-3-height', 'css:env-3-width', 'css:env-5-height',
       'css:env-5-width', 'css:env-6-height', 'css:env-6-width',
       'css:env-7-height', 'css:env-7-width', 'css:env-8-height',
       'css:env-8-width', 'css:env-9-height', 'css:env-9-width',
       'css:image-set-heif', 'css:javascript', 'css:media-1-width',
       'css:media-10-width', 'css:media-2-width', 'css:media-3-width',
       'css:media-4-width', 'css:media-5-width', 'css:me