In [None]:
%%capture

!pip install advertools adviz

In [None]:
import advertools as adv
import adviz
import pandas as pd
# Do not truncate wide DataFrames: with `None` pandas displays every column.
pd.options.display.max_columns = None
import plotly
import plotly.express as px
from IPython.display import display_html, display_markdown

# Print "<package name>-----v<version>" for each main dependency, with the
# name left-aligned and dash-padded to 30 characters.
for pkg in (adv, adviz, pd, plotly):
    padded_name = f'{pkg.__name__:-<30}'
    print(f'{padded_name}v{pkg.__version__}')

def md(text):
    """Display `text` as raw Markdown in the cell output.

    Thin wrapper around `IPython.display.display_markdown(text, raw=True)`;
    whatever that call returns is passed back to the caller.
    """
    rendered = display_markdown(text, raw=True)
    return rendered

In [None]:
# Words to exclude when counting word frequencies in URL paths.
# Each entry appears exactly once (the original list repeated 'le').
rm_words = ['and', 'la', 'de', 'der', 'et', 'die', 'des', 'les', 'le',
            'its', 'or', 'du', 'can', 'ii', 'und', 'the', 'to', 'in',
            'of', 'a', 'for', 'on', 'is', 'with', 'at']

In [None]:
# NOTE(review): the CSV below was presumably produced with a call like this one.
# sitemap = adv.sitemap_to_df('https://www.apple.com/robots.txt')

website = 'apple.com'

# Load the saved sitemap, parsing `download_date` as datetimes.
sitemap_csv = '/kaggle/input/xml-sitemaps/sitemap_2023_01_11_apple_com.csv'
sitemap_raw = pd.read_csv(
    sitemap_csv, parse_dates=['download_date'], low_memory=False)

# Keep the first occurrence of every URL (`loc`); `sitemap_raw` stays untouched.
sitemap = sitemap_raw.drop_duplicates(subset=['loc'])

print(f'Original: {sitemap_raw.shape}')
print(f'After de-duplication: {sitemap.shape}')
sitemap.head()

In [None]:
# Boolean flag per row of the raw sitemap: True when the URL was seen earlier.
duplicated_urls = sitemap_raw['loc'].duplicated()
dup_count, dup_share = duplicated_urls.sum(), duplicated_urls.mean()
md(f'## Duplicated URLs: {dup_count:,} ({dup_share:.1%})')

In [None]:
# Does the de-duplicated sitemap table have an `errors` column?
'errors' in sitemap.columns

In [None]:
# List the rows that carry an error message, next to their sitemap.
# `reindex` (instead of `[[...]]`) fills a missing column with NaN, so when the
# data has no `errors` column this shows an empty table instead of raising
# a KeyError.
sitemap.reindex(columns=['sitemap', 'errors']).dropna()

### URLs per sitemap (make sure none exceeds 50k URLs)

In [None]:
# Count rows per sitemap file in the raw (non-de-duplicated) data; top 20 shown.
sitemap_of_each_url = sitemap_raw['sitemap']
adviz.value_counts_plus(
    sitemap_of_each_url,
    show_top=20,
    name='URLs per sitemap')

### Sitemap size (make sure none exceeds 50MB)

In [None]:
# Summary statistics of sitemap file sizes (MB), one observation per sitemap.
# De-duplicate on the `sitemap` column rather than on the size values, so two
# different sitemaps that happen to have the same size are both counted.
(sitemap_raw
 .drop_duplicates(subset=['sitemap'])
 ['sitemap_size_mb']
 .describe()
 .to_frame()
 .style
 .format('{:.2f}'))

# Split URLs into their components

Many URLs contain a country code, and optionally a language code where applicable, as their first directory: `/ca/`, `/de/`, or `hk-zh` for example. A special regex can be used to extract those patterns and create a `country` column so we can more easily categorize and analyze those pages.

We also remove duplicated URLs.

In [None]:
# Split every non-null, whitespace-trimmed URL into its components.
clean_urls = sitemap['loc'].dropna().str.strip()
urldf = adv.url_to_df(clean_urls)

# A first directory made of exactly two lowercase letters ("ca"), or two
# letters, a hyphen and two or more letters ("hk-zh"), is taken as the country.
country_pattern = '(^[a-z]{2}$|^[a-z]{2}-[a-z]{2,}$)'
urldf['country'] = urldf['dir_1'].str.extract(country_pattern)[0]

print(urldf.shape)
urldf.head()

# Check schemes and domains used in the URL list

In [None]:
# Tabulate the URL schemes found in the URL list.
url_schemes = urldf['scheme']
adviz.value_counts_plus(url_schemes, size=15)

In [None]:
# Tabulate the domains (netloc) found in the URL list.
url_domains = urldf['netloc']
adviz.value_counts_plus(url_domains, size=15, name='Domains')

The exception seems to be China, which has its own domain. In this case, we can take the index of the URLs where the domain is `www.apple.com.cn` and assign the value `cn` to the country column.

In [None]:
# URLs served from the dedicated Chinese domain have no country directory,
# so label them explicitly with the country code `cn`.
on_china_domain = urldf['netloc'].eq('www.apple.com.cn')
china_index = urldf.index[on_china_domain]
urldf.loc[china_index, 'country'] = 'cn'

Quick check to make sure we have 6,917 `cn` URLs:

In [None]:
# Tabulate URLs per extracted country code (top 15 shown).
url_countries = urldf['country']
adviz.value_counts_plus(
    url_countries,
    name='Country URLs',
    show_top=15)

# Check the percentage of utilization of URL components

In [None]:
# Share of URLs in which each URL component (column) is present (non-null).
component_usage = urldf.notna().mean()
(component_usage
 .to_frame(name='URL element used %')
 .style
 .format('{:.2%}')
 .background_gradient(cmap='cividis'))

## Explore the query parameter `fh`, which is present in 45% of the URLs

In [None]:
# Frequency of each value taken by the `fh` query parameter (missing values excluded).
urldf['query_fh'].dropna().value_counts()

In [None]:
# Show up to 10 example URLs that carry the `fh` query parameter.
# A fixed `random_state` makes the sample reproducible on re-run, and capping
# `n` at the number of matching rows avoids a ValueError on small inputs.
fh_urls = urldf.loc[urldf['query_fh'].notna(), 'url']
fh_urls.sample(n=min(10, len(fh_urls)), random_state=0).tolist()

In [None]:
# Tabulate the paths of the URLs that carry the `fh` query parameter.
fh_paths = urldf.loc[urldf['query_fh'].notna(), 'path']
adviz.value_counts_plus(fh_paths, name='URLs with query_fh')

In [None]:
# Number of `fh` URLs whose path repeats a path seen earlier.
has_fh = urldf['query_fh'].notna()
urldf.loc[has_fh, 'path'].duplicated().sum()

In [None]:
# Fraction of `fh` URLs whose path repeats a path seen earlier.
has_fh = urldf['query_fh'].notna()
urldf.loc[has_fh, 'path'].duplicated().mean()

It's not clear what this parameter provides, other than causing their pages to be empty.



# Visualize the structure of the site with a treemap `adviz.url_structure`

In [None]:
# Treemap of the URL hierarchy. The domain comes from the `website` variable
# set when the sitemap was loaded, and the title's bold tag is now closed
# (it was `<b>apple.com<b>`).
adviz.url_structure(
    urldf['url'],
    domain=website,
    items_per_level=20,
    height=700,
    title=f'URL structure: <b>{website}</b>')

# Explore the structure of the site, by counting values in various directories

In [None]:
# Tabulate the first-level directory of every URL.
first_level_dirs = urldf['dir_1']
adviz.value_counts_plus(first_level_dirs, name='/dir_1/')

In [None]:
# Tabulate the first-level directory of URLs for which no country code was
# extracted. The caption's <h3> tag is now closed, matching the other captions.
no_country_dir_1 = urldf.loc[urldf['country'].isna(), 'dir_1']
(adviz.value_counts_plus(no_country_dir_1)
 .set_caption('<h3>Counts of dir_1 for pages without a country code</h3>'))

What kind of products do they have under `/shop/`? (You can test with other filters, for a specific country for example.)

In [None]:
# Tabulate the second-level directory of URLs whose first directory is `shop`.
shop_dir_2 = urldf.loc[urldf['dir_1'].eq('shop'), 'dir_2']
(adviz.value_counts_plus(shop_dir_2)
 .set_caption('<h3>Counts of /dir_2/ under /shop/</h3>'))


In [None]:
# Frequency of the third-level directory under /shop/accessories/.
in_shop_accessories = urldf['dir_1'].eq('shop') & urldf['dir_2'].eq('accessories')
urldf.loc[in_shop_accessories, 'dir_3'].value_counts()

In [None]:
# Tabulate the fourth-level directory under /shop/accessories/ (top 15 shown).
in_shop_accessories = urldf['dir_1'].eq('shop') & urldf['dir_2'].eq('accessories')
accessory_dir_4 = urldf.loc[in_shop_accessories, 'dir_4']
adviz.value_counts_plus(
    accessory_dir_4,
    show_top=15,
    name='Accessories products')

What does the `/today/` directory contain?

In [None]:
# Tabulate the third-level directory of URLs under /today/event/.
# The column label now names what is actually counted (`dir_3` under
# /today/event/); it previously read '/today/dir_2/'.
today_event_dir_3 = urldf.loc[
    urldf['dir_1'].eq('today') & urldf['dir_2'].eq('event'), 'dir_3']
adviz.value_counts_plus(today_event_dir_3, name='/today/event/dir_3/')


It seems these are live event pages where they demonstrate certain products/skills to users. The same event is duplicated hundreds/thousands of times, and almost all those events have expired (empty URLs).

Although this is a bad practice, it can help us understand the frequency of topics that they talk about in these events. We can simply count the words in the event pages to see what they focus on:

In [None]:
# Split each /today/event/ name (`dir_3`) on hyphens and count the resulting
# words, one row per word; duplicated event URLs are counted every time.
# The caption's <h4> tag is now closed.
event_names = urldf.loc[
    urldf['dir_1'].eq('today') & urldf['dir_2'].eq('event'), 'dir_3']
event_words = event_names.str.split('-').explode()
(adviz.value_counts_plus(event_words, show_top=20, name='word')
 .set_caption('<h4>Word counts of event names (counting duplicates)</h4>'))

## Number of URLs per country page

In [None]:
# Number of URLs per country code, then the distribution of those counts.
country_counts = urldf['country'].value_counts()
fig_all_countries = px.histogram(
    country_counts,
    labels={'value': 'URLs per country page'},
    title='Number of pages per country – apple.com sitemap')
fig_all_countries


In [None]:
# Same histogram, restricted to countries with strictly fewer than 1,000 URLs.
small_countries = country_counts[country_counts.lt(1000)]
fig_small_countries = px.histogram(
    small_countries,
    labels={'value': 'URLs per country page'},
    title='Number of pages per country (countries with less than 1,000 pages) – apple.com sitemap')
fig_small_countries


In [None]:
# Same histogram, restricted to countries with at least 1,000 URLs.
# `ge` (not `gt`) makes this the exact complement of the "less than 1,000"
# chart, so a country with exactly 1,000 pages is not dropped from both.
large_countries = country_counts[country_counts.ge(1000)]
px.histogram(
    large_countries,
    labels={'value': 'URLs per country page'},
    title='Number of pages per country (countries with 1,000 pages or more) – apple.com sitemap')

# Most used words (uni-grams, bigrams, etc.) in `last_dir`

This is done by removing the `/today/` URLs and deduplicating the URLs. You can run the same analysis using other filters as you see fit.

In [None]:
# Text to analyse: the last directory of every URL outside /today/, with
# hyphens turned into spaces so each slug reads as separate words.
last_dir_text = (
    urldf.loc[urldf['dir_1'].ne('today'), 'last_dir']
    .dropna()
    .str.replace('-', ' '))

# Word frequencies with the custom `rm_words` list excluded; show the first 15 rows.
unigram_freq = adv.word_frequency(last_dir_text, rm_words=rm_words)
unigram_freq.head(15).style.format({'abs_freq': '{:,}'})

In [None]:
# Text to analyse: the last directory of every URL outside /today/, with
# hyphens turned into spaces so each slug reads as separate words.
last_dir_text = (
    urldf.loc[urldf['dir_1'].ne('today'), 'last_dir']
    .dropna()
    .str.replace('-', ' '))

# Two-word phrase frequencies; `rm_words` is not passed here, so the
# function's default applies. Show the first 20 rows.
bigram_freq = adv.word_frequency(last_dir_text, phrase_len=2)
bigram_freq.head(20).style.format({'abs_freq': '{:,}'})