In [17]:
import sqlite3

import pandas as pd

# CONFIGURATION
# Raw SQLite database that the cleaning cells read from, and the SQLite
# database that the cleaned rows are written to at the end of the notebook.
DB_PATH = "0_raw_databases/1000_database_media.db"
DB_PATH_CLEANED = "1_after_cleaning_databases/1000_devices_media_cleaned.db"

# Widen pandas' text rendering so frames stay readable in cell output.
pd.set_option("display.width", 400)
pd.set_option("display.max_columns", 10)

In [18]:
# Load every (session_id, attribute, value, source) row of the raw
# css_attribute table into `df`; the cleaning cells filter this frame.
query = "SELECT session_id, attribute, value, source FROM css_attribute"
conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql(query, conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

## Usunięcie losowych, niezwiązanych z żadnym urządzeniem danych

In [19]:
# Find all session_ids that have no 'User-Agent' attribute row at all.
# The SQL text is kept exactly as it was; the trailing backslash only joins
# the last two lines of the string literal.
query = """
        SELECT DISTINCT session_id
        FROM css_attribute
        WHERE session_id NOT IN (SELECT DISTINCT session_id
                                 FROM css_attribute
                                 WHERE attribute = 'User-Agent') \
        """
conn = sqlite3.connect(DB_PATH)
try:
    missing_session_ids = pd.read_sql(query, conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

# One row per session_id (the query is already DISTINCT; kept as a safeguard).
missing_session_ids = missing_session_ids.drop_duplicates(subset=['session_id'])

# Drop every row of those sessions from df with a single vectorised mask
# instead of re-filtering (and copying) the whole frame once per session_id.
df = df[~df['session_id'].isin(missing_session_ids['session_id'])]

## Usunięcie urządzeń, na których nie działa @container

In [20]:
# Fetch every CSS-sourced attribute row of the sessions that have a
# 'User-Agent' attribute. The SQL text is kept exactly as it was; the trailing
# backslash only joins the last two lines of the string literal.
query = """
        SELECT session_id, attribute, value, source
        FROM css_attribute
        WHERE source = 'css'
          and session_id IN (SELECT DISTINCT session_id
                             FROM css_attribute
                             WHERE attribute = 'User-Agent') \
        """
conn = sqlite3.connect(DB_PATH)
try:
    css_session_ids = pd.read_sql(query, conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

# count the number of CSS-sourced rows for each session_id
css_session_ids_count = css_session_ids['session_id'].value_counts()

# keep only the session_ids that have exactly 2 CSS-sourced rows
# NOTE(review): per the section heading these are presumably the devices on
# which @container does not work - confirm that 2 is the right threshold.
css_session_ids_count = css_session_ids_count[css_session_ids_count == 2]

# Turn the counts into a frame with explicit column names. A bare
# reset_index() names the columns differently before and after pandas 2.0
# (on 1.x the 'session_id' column would hold the counts, not the ids).
css_session_ids_count = (
    css_session_ids_count
    .rename_axis('session_id')
    .reset_index(name='count')
)

# Drop every row of those sessions from df with a single vectorised mask.
df = df[~df['session_id'].isin(css_session_ids_count['session_id'])]

## Usunięcie sesji, które nie pochodzą z ładowania strony, tylko z dodatkowych requestów podczas testów BrowserStack

In [21]:
# Fetch the rows of every session that has no row with source 'browserstack'.
# The SQL text is kept exactly as it was; the trailing backslash only joins
# the last two lines of the string literal.
query = """
        SELECT session_id, attribute, value, source
        FROM css_attribute
        WHERE session_id NOT IN (SELECT DISTINCT session_id
                                 FROM css_attribute
                                 WHERE source = 'browserstack') \
        """
conn = sqlite3.connect(DB_PATH)
try:
    missing_session_ids = pd.read_sql(query, conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

# The query returns one row per attribute; keep a single row per session_id.
missing_session_ids = missing_session_ids.drop_duplicates(subset=['session_id'])

# Drop every row of those sessions from df with a single vectorised mask
# instead of re-filtering (and copying) the whole frame once per session_id.
df = df[~df['session_id'].isin(missing_session_ids['session_id'])]

## Usunięcie sesji, na których nie działa font detection z dobranymi parametrami

In [22]:
# Sessions with more than this many 'font' attribute rows are removed.
# NOTE(review): the previous comments said "more than 40" while the code
# compared against 30; the code's value is kept here - confirm which is intended.
MAX_FONT_ATTRIBUTES = 30

# Fetch every 'font' attribute row. The SQL text is kept exactly as it was;
# the trailing backslash only joins the last two lines of the string literal.
query = """
        SELECT session_id, attribute, value, source
        FROM css_attribute
        WHERE attribute = 'font' \
        """
conn = sqlite3.connect(DB_PATH)
try:
    font_session_ids = pd.read_sql(query, conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

# count the number of 'font' rows for each session_id
font_session_ids_count = font_session_ids['session_id'].value_counts()
# keep only the session_ids with more than MAX_FONT_ATTRIBUTES 'font' rows
font_session_ids_count = font_session_ids_count[font_session_ids_count > MAX_FONT_ATTRIBUTES]

# Drop every row of those sessions from df with a single vectorised mask;
# the session_ids are the index of the value_counts() result.
df = df[~df['session_id'].isin(font_session_ids_count.index)]

## Kod pozwalający na usunięcie wybranych sesji, które w ramach etapu 0 zostały wykryte, ale nie są związane z żadnym konkretnym błędem.
Mogą one wynikać ze zbyt długiego ładowania się strony, problemów z siecią lub innych błędów, które mogły wystąpić podczas automatycznych testów.

In [23]:
# Hand-picked session_ids to exclude (see the section heading for why).
session_id_list = [
    "375b904c-424d-46a2-b907-909d5100461e",
    "43307808-5861-44a7-86d9-7914d635bdfa",
    "4e20e8b0-2427-40d5-ab77-84c462f7e11f",
    "5810ff32-7126-4911-95cb-8a1e4481400d",
    "62e191d2-f02c-4fb2-bc4f-0cc42fe332f2",
    "6b2c152d-65a1-4ba6-a9b7-a7a5a1589647",
    "778ea450-d21b-4951-9a30-9ea55c73d457",
    "9328aa58-6007-4e61-b8e5-c8ccc676ad24",
    "abbf2ce4-b484-4816-86bd-ba641b1af990",
    "cb3cba62-ed5b-4f63-8e9d-7cd585f63cc3",
    "d278e5d3-68a3-48aa-8621-1875312f3ff9",
    "d8f00dbd-f5e2-47a7-9a75-43e4b5425364",
    "edeaa1cd-1b86-46cc-9b17-239e81552d3f"
]
# Drop every row of the listed sessions from df with a single vectorised mask
# instead of re-filtering (and copying) the whole frame once per session_id.
df = df[~df['session_id'].isin(session_id_list)]

## Ostateczna liczba unikalnych sesji po oczyszczeniu

In [24]:
# Count the distinct session_id values that survived the cleaning steps.
remaining_session_ids = df['session_id'].unique()
unique_session_ids_count = len(remaining_session_ids)

print(f"Number of unique session_ids: {unique_session_ids_count}")

Number of unique session_ids: 918


In [25]:
# Save the cleaned dataframe as table 'css_attribute' in the cleaned SQLite
# database, replacing the table if it already exists.
conn = sqlite3.connect(DB_PATH_CLEANED)
try:
    df.to_sql('css_attribute', conn, if_exists='replace', index=False)
finally:
    # Release the database handle even when the write raises.
    conn.close()