In [10]:
import sqlite3

import pandas as pd

# SETUP
# Database the cleaning queries read from.
DB_PATH = "0_raw_databases/1000_calc_media.db"
# Database the cleaned table is written to.
DB_PATH_CLEANED = "1_after_cleaning_databases/1000_calc_media.db"

# Widen pandas' text rendering so printed frames stay readable;
# set_option accepts several (option, value) pairs in one call.
pd.set_option(
    "display.width", 400,
    "display.max_columns", 10,
)

In [11]:
# Load every row of css_attribute from the raw database into `df`.
# The cells below progressively filter this frame.
query = "SELECT session_id, attribute, value, source FROM css_attribute"
conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql(query, conn)
finally:
    # Close explicitly so the handle is released even if the read fails;
    # using the sqlite3 connection as a context manager would only manage
    # the transaction, not close the connection.
    conn.close()

## Usunięcie losowych, niezwiązanych z żadnym urządzeniem danych

In [12]:
# Find every session_id that has no 'User-Agent' attribute row.
conn = sqlite3.connect(DB_PATH)
query = """
        SELECT DISTINCT session_id
        FROM css_attribute
        WHERE session_id NOT IN (SELECT DISTINCT session_id
                                 FROM css_attribute
                                 WHERE attribute = 'User-Agent') \
        """
try:
    missing_session_ids = pd.read_sql(query, conn)
finally:
    # Release the connection even if the query fails.
    conn.close()

# SELECT DISTINCT already returns one row per session_id, so no further
# de-duplication is needed.

# Remove those sessions from df with a single vectorised mask instead of
# re-filtering the whole frame once per session_id.
df = df[~df['session_id'].isin(missing_session_ids['session_id'])]

## Usunięcie urządzeń, na których nie działa @container

In [13]:
# Fetch the source = 'css' rows of every session that has a 'User-Agent'
# attribute.
conn = sqlite3.connect(DB_PATH)
query = """
        SELECT session_id, attribute, value, source
        FROM css_attribute
        WHERE source = 'css'
          and session_id IN (SELECT DISTINCT session_id
                             FROM css_attribute
                             WHERE attribute = 'User-Agent') \
        """
try:
    css_session_ids = pd.read_sql(query, conn)
finally:
    # Release the connection even if the query fails.
    conn.close()

# Number of source = 'css' rows recorded for each session_id.
css_session_ids_count = css_session_ids['session_id'].value_counts()

# Keep only the sessions that have exactly 2 such rows.
# NOTE(review): presumably 2 css rows is the signature of a device without
# @container support - confirm against the collection code.
css_session_ids_count = css_session_ids_count[css_session_ids_count == 2]

# The session_ids are the index of the value_counts() result. Reading them
# from the index (rather than reset_index() followed by ['session_id'])
# behaves the same on every pandas version: before pandas 2.0 the reset
# column named 'session_id' held the counts, not the ids, so nothing would
# have been removed.
df = df[~df['session_id'].isin(css_session_ids_count.index)]

## Usunięcie sesji, które nie pochodzą z ładowania strony, tylko z dodatkowych requestów podczas testów BrowserStack

In [14]:
# Find every session_id that has no row with source = 'browserstack'.
# Only the distinct ids are needed, so let SQLite de-duplicate them instead
# of transferring every matching row and de-duplicating in pandas.
conn = sqlite3.connect(DB_PATH)
query = """
        SELECT DISTINCT session_id
        FROM css_attribute
        WHERE session_id NOT IN (SELECT DISTINCT session_id
                                 FROM css_attribute
                                 WHERE source = 'browserstack')
        """
try:
    missing_session_ids = pd.read_sql(query, conn)
finally:
    # Release the connection even if the query fails.
    conn.close()

# Remove those sessions from df with a single vectorised mask instead of
# re-filtering the whole frame once per session_id.
df = df[~df['session_id'].isin(missing_session_ids['session_id'])]

## Usunięcie sesji, na których nie działa font detection z dobranymi parametrami

In [15]:
# Find all session_ids that have more than 30 'font' attribute rows.
conn = sqlite3.connect(DB_PATH)
query = """
        SELECT session_id, attribute, value, source
        FROM css_attribute
        WHERE attribute = 'font' \
        """
try:
    font_session_ids = pd.read_sql(query, conn)
finally:
    # Release the connection even if the query fails.
    conn.close()

# Count the 'font' rows per session_id; rows are deliberately not
# de-duplicated because every row contributes to the count.
font_session_ids_count = font_session_ids['session_id'].value_counts()
# Keep the sessions above the threshold.
# NOTE(review): the comments used to say 40 while the code compares against
# 30; the comments now follow the code - confirm 30 is the intended cut-off.
font_session_ids_count = font_session_ids_count[font_session_ids_count > 30]

# Remove those sessions from df with a single vectorised mask; the ids are
# the index of the value_counts() result.
df = df[~df['session_id'].isin(font_session_ids_count.index)]

## Kod pozwalający na usunięcie wybranych sesji, które w ramach etapu 0 zostały wykryte, ale nie są związane z żadnym konkretnym błędem.
Mogą one wynikać ze zbyt długiego ładowania się strony, problemów z siecią lub innych błędów, które mogły wystąpić podczas automatycznych testów.

In [16]:
# Hand-picked session_ids to exclude from the cleaned data set.
session_id_list = [
    "01a304a7-f93f-422f-9896-7aafd63986db",
    "6bf64725-6436-47cb-8f34-7ce6b8d124b9",
    "71bb6995-47ab-47f9-8fc9-6088e45a11d4",
    "c47593b1-eb07-4aa2-80a3-6fab6d03e746",
    "c98bedfa-709d-4cb0-9b63-15cf72fdb3b9",
    "d8723313-d1b0-4449-ae8c-73880012ddf0",
    "e192240f-6e06-46e5-abf9-9a476cfdc169",
]
# Remove the listed sessions from df with one vectorised mask instead of
# re-filtering the frame once per id.
df = df[~df['session_id'].isin(session_id_list)]

## Ostateczna liczba unikalnych sesji po oczyszczeniu

In [17]:
# Report how many distinct session_ids survived the cleaning steps.
remaining_sessions = df['session_id'].drop_duplicates()
unique_session_ids_count = len(remaining_sessions)

print(f"Number of unique session_ids: {unique_session_ids_count}")

Number of unique session_ids: 922


In [18]:
# Save the cleaned dataframe to the new sqlite database, replacing any
# existing css_attribute table so the cell can be re-run safely.
conn = sqlite3.connect(DB_PATH_CLEANED)
try:
    df.to_sql('css_attribute', conn, if_exists='replace', index=False)
finally:
    # Release the connection even if the write fails.
    conn.close()