# Import des librairies

In [1]:
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

!pip install pandas_profiling
from pandas_profiling import ProfileReport

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

!pip install plotly

import plotly.express as px
import plotly.io as pio

from tqdm import tqdm

pio.renderers.default = "iframe"

pd.options.plotting.backend = "plotly"

Collecting pandas_profiling
  Downloading pandas_profiling-3.6.1-py2.py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.5/328.5 kB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
Collecting multimethod<1.10,>=1.4
  Downloading multimethod-1.9.1-py3-none-any.whl (10 kB)
Collecting visions[type_image_path]==0.7.5
  Downloading visions-0.7.5-py3-none-any.whl (102 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.7/102.7 kB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<2.14,>=2.13.2
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Collecting htmlmin==0.1.12
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting phik<0.13,>=0.11.1
  Downloading phik-0.12.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (679 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m679.8/679.8 kB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting statsm

# Chargement des fichiers

In [2]:
DATA_PATH = "/storage/P9"

## Load and describe Articles Metada data

articles_metadata = pd.read_csv(
    Path(DATA_PATH, "articles_metadata.csv"),
    parse_dates=["created_at_ts"],
    date_parser=lambda x: datetime.fromtimestamp(int(x) / 1000),
    dtype={
        "article_id": "category",
        "category_id": "category",
        "publisher_id": "category",
        "words_count": "int",
    },
)

articles_metadata = articles_metadata.astype({"created_at_ts": "datetime64[ns]"})

articles_metadata.describe(include="all", datetime_is_numeric=True)

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
count,364047.0,364047.0,364047,364047.0,364047.0
unique,364047.0,461.0,,1.0,
top,0.0,281.0,,0.0,
freq,1.0,12817.0,,364047.0,
mean,,,2016-09-16 23:57:17.328421888,,190.897727
min,,,2006-09-27 11:14:35,,0.0
25%,,,2015-10-15 16:00:43.500000,,159.0
50%,,,2017-03-13 16:27:29,,186.0
75%,,,2017-11-05 14:09:11,,218.0
max,,,2018-03-13 12:12:30,,6690.0


In [3]:
## Loading all clicks data

clicks = pd.concat(
    [
        pd.read_csv(
            click_file_path,
            parse_dates=["session_start", "click_timestamp"],
            date_parser=lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
            dtype={
                "user_id": "category",
                "session_id": "category",
                "session_size": "int",
                "click_article_id": "category",
                "click_environment": "category",
                "click_deviceGroup": "category",
                "click_os": "category",
                "click_country": "category",
                "click_region": "category",
                "click_referrer_type": "category",
            },
        ).replace(
            {
                "click_environment": {
                    "1": "1 - Facebook Instant Article",
                    "2": "2 - Mobile App",
                    "3": "3 - AMP (Accelerated Mobile Pages)",
                    "4": "4 - Web",
                },
                "click_deviceGroup": {
                    "1": "1 - Tablet",
                    "2": "2 - TV",
                    "3": "3 - Empty",
                    "4": "4 - Mobile",
                    "5": "5 - Desktop",
                },
                "click_os": {
                    "1": "1 - Other",
                    "2": "2 - iOS",
                    "3": "3 - Android",
                    "4": "4 - Windows Phone",
                    "5": "5 - Windows Mobile",
                    "6": "6 - Windows",
                    "7": "7 - Mac OS X",
                    "8": "8 - Mac OS",
                    "9": "9 - Samsung",
                    "10": "10 - FireHbbTV",
                    "11": "11 - ATV OS X",
                    "12": "12 - tvOS",
                    "13": "13 - Chrome OS",
                    "14": "14 - Debian",
                    "15": "15 - Symbian OS",
                    "16": "16 - BlackBerry OS",
                    "17": "17 - Firefox OS",
                    "18": "18 - Android",
                    "19": "19 - Brew MP",
                    "20": "20 - Chromecast",
                    "21": "21 - webOS",
                    "22": "22 - Gentoo",
                    "23": "23 - Solaris",
                },
            }
        )
        for click_file_path in tqdm(
            sorted(Path(DATA_PATH, "clicks").glob("clicks_hour_*.csv"))
        )
    ],
    sort=False,
    ignore_index=True,
    verify_integrity=True,
)

clicks = clicks.astype(
    {"session_start": "datetime64[ns]", "click_timestamp": "datetime64[ns]"}
)

clicks.describe(include="all", datetime_is_numeric=True)

100%|██████████| 385/385 [00:26<00:00, 14.69it/s]


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,2988181.0,2988181.0,2988181,2988181.0,2988181.0,2988181,2988181,2988181,2988181,2988181.0,2988181.0,2988181.0
unique,322897.0,1048594.0,,,46033.0,,3,5,8,11.0,28.0,7.0
top,5890.0,1507563657895091.0,,,160974.0,,4 - Web,1 - Tablet,17 - Firefox OS,1.0,25.0,2.0
freq,1232.0,124.0,,,37213.0,,2904478,1823162,1738138,2852406.0,804985.0,1602601.0
mean,,,2017-10-08 14:17:08.013157120,3.901885,,2017-10-08 14:51:05.106516736,,,,,,
min,,,2017-10-01 02:37:03,2.0,,2017-10-01 03:00:00,,,,,,
25%,,,2017-10-04 13:35:52,2.0,,2017-10-04 14:20:52,,,,,,
50%,,,2017-10-08 20:09:00,3.0,,2017-10-08 20:35:30,,,,,,
75%,,,2017-10-11 19:16:54,4.0,,2017-10-11 19:43:24,,,,,,
max,,,2017-10-17 03:36:19,124.0,,2017-11-13 20:04:14,,,,,,


# EDA

## Metadata

In [4]:
## Visualize Articles categories distribution

articles_metadata["category_id"].value_counts().plot(
    kind="bar",
    labels={
        "index": "Category ID",
        "value": "Count",
    },
    color="value",
    title="Distribution of categories",
)

In [5]:
## Visualize number of articles per category distribution

articles_metadata["category_id"].value_counts().plot(
    kind="box",
    x="category_id",
    title="Distribution of categories counts",
    labels={
        "index": "Category ID",
        "category_id": "Count",
    },
    notched=True,
    points="suspectedoutliers",
)

In [6]:
## Visualize Articles creation time distribution

articles_metadata.sample(frac=0.10)["created_at_ts"].plot(
    kind="histogram",
    title="Distribution of creation time (sampling = 10%)",
    labels={
        "value": "Creation time",
    },
    text_auto=True,
    marginal="box",
)

In [7]:
## Visualize articles word count distribution

articles_metadata["words_count"].sample(frac=0.10).plot(
    kind="histogram",
    title="Distribution of words count (sampling = 10%)",
    labels={
        "value": "Words count",
    },
    text_auto=True,
    marginal="box",
)

In [8]:
## Publish Articles Metadata ProfileReport

profile = ProfileReport(
    articles_metadata,
    title="Pandas Profiling Report",
    explorative=True,
    minimal=True,
)
profile.to_file(Path("./reports/articles_metadata_profile_report.html"))

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Clicks

In [8]:
## Print period information

print('Sessions start from {} to {}'.format(clicks.session_start.min(), clicks.session_start.max()))
print('Clicks period starts from {} to {}'.format(clicks.click_timestamp.min(), clicks.click_timestamp.max()))

Sessions start from 2017-10-01 02:37:03 to 2017-10-17 03:36:19
Clicks period starts from 2017-10-01 03:00:00 to 2017-11-13 20:04:14


In [9]:
## Visualize click sessions distribution over time

clicks.sample(frac=0.10)["session_start"].plot(
    kind="histogram",
    title="Distribution of sessions (sampling = 10%)",
    labels={
        "value": "Sessions",
    },
    text_auto=True,
    marginal="box",
)

In [10]:
## Visualize clicks distribution over time

clicks.sample(frac=0.10)["click_timestamp"].plot(
    kind="histogram",
    title="Distribution of clicks (sampling = 10%)",
    labels={
        "value": "Clicks",
    },
    text_auto=True,
    marginal="box",
)

In [11]:
## Visualize clicks sessions size

clicks.sample(frac=0.10)["session_size"].plot(
    kind="histogram",
    title="Distribution of session sizes (sampling = 10%)",
    labels={
        "value": "Session size",
    },
    text_auto=True,
    marginal="box",
)

In [12]:
## Visualize nb of clicked articles per user (user engagement)

clicks.sample(frac=0.10).groupby("user_id").agg(
    COUNT_unique_article=("click_article_id", lambda x: len(set(list(x)))),
).plot(
    kind="histogram",
    title="Distribution of number of clicked articles (sampling = 10%)",
    labels={
        "value": "Number of clicked articles",
    },
    text_auto=True,
    marginal="box",
)

In [13]:
## Visualize click environment : user environment

fig = px.parallel_categories(
    clicks.sample(frac=0.10),
    dimensions=["click_environment", "click_deviceGroup", "click_os"],
    title="Distribution of Environment x Device Group x OS (sampling = 10%)",
    labels={
        "click_environment": "Environment",
        "click_deviceGroup": "Device group",
        "click_os": "OS",
    },
)
fig.show()

In [14]:
## Visualize click location : user geolocation

fig = px.parallel_categories(
    clicks.sample(frac=0.10),
    dimensions=["click_country", "click_region"],
    title="Distribution of Country x Region (sampling = 10%)",
    labels={
        "click_country": "Country",
        "click_region": "Region",
    },
)
fig.show()

In [15]:
## Visualize click referrer : user referrer

clicks.sample(frac=0.10)["click_referrer_type"].plot(
    kind="histogram",
    title="Distribution of referrer types (sampling = 10%)",
    labels={
        "value": "Referrer type",
    },
    category_orders={
        "value": [str(i) for i in range(1, 8)],
    },
    text_auto=True,
)

In [32]:
## Publish clicks metadata ProfileReport

profile = ProfileReport(
    clicks, title="Pandas Profiling Report", explorative=True, minimal=True
)
profile.to_file(Path("./reports/clicks_profile_report.html"))

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Embeddings

In [19]:
## Loading articles embeddings data

articles_embeddings = pd.read_pickle(Path(DATA_PATH, "articles_embeddings.pickle"))

articles = pd.DataFrame(
    articles_embeddings,
    columns=["embedding_" + str(i) for i in range(articles_embeddings.shape[1])],
)
articles["words_count"] = articles_metadata["words_count"]
articles["category_id"] = articles_metadata["category_id"]
articles["article_id"] = articles_metadata["article_id"]

articles.describe(include="all", datetime_is_numeric=True)

articles_sample = articles.sample(frac=0.10)

In [17]:
## Visualize Articles Embeddings in 2D PCA

pca = PCA(n_components=2)
articles_pca = pca.fit_transform(
    articles_sample[
        ["embedding_" + str(i) for i in range(articles_embeddings.shape[1])]
    ]
)

# Plot the data in the PCA space
fig = px.scatter(
    x=articles_pca[:, 0],
    y=articles_pca[:, 1],
    color=articles_sample["category_id"],
    symbol=articles_sample["category_id"],
    title="PCA 2D",
    width=1200,
    height=800,
)
fig.show()

In [18]:
## Visualize Articles Embeddings in 2D t-SNE

tsne = TSNE(n_components=2)
articles_tsne = tsne.fit_transform(
    articles_sample[
        ["embedding_" + str(i) for i in range(articles_embeddings.shape[1])]
    ]
)

# Plot the data in the PCA space
fig = px.scatter(
    x=articles_tsne[:, 0],
    y=articles_tsne[:, 1],
    color=articles_sample["category_id"],
    symbol=articles_sample["category_id"],
    title="t-SNE 2D",
    width=1200,
    height=800,
)
fig.show()


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.

