# Import des librairies

In [22]:
import os
import sys

import logging
from datetime import datetime
from pathlib import Path

!pip install azure-cosmos
from azure.cosmos import CosmosClient, PartitionKey

import numpy as np
import pandas as pd

!pip install implicit
from implicit.als import AlternatingLeastSquares

from sklearn.preprocessing import OrdinalEncoder, StandardScaler

from scipy import sparse
from tqdm import tqdm

!pip install python-dotenv
from dotenv import load_dotenv

load_dotenv()

#AZURE_COSMOSDB_URI = os.getenv("AZURE_COSMOSDB_URI")
AZURE_COSMOSDB_URI = "https://recos.documents.azure.com:443/"
#AZURE_COSMOSDB_KEY = os.getenv("AZURE_COSMOSDB_KEY")
AZURE_COSMOSDB_KEY = "a9eoSCXTy5A6gCELxoodsbGqUmXrQgBZ2BVUE5OIlN6A3vFfSBTGC7gXeRZOr7KWfgu1e17WC6inACDbym2fXQ=="

DATA_PATH = "/storage/P9"

CACHE = dict()

[0m

# Fonctions utilitaires

In [7]:
def reduce_dataframe_memory_usage(
    df: pd.DataFrame,
    high_precision: bool = False,
) -> pd.DataFrame:
    """
    Iterate through all the columns of a dataframe and modify the data type to
    reduce memory usage.
    Args:
        df (pd.DataFrame): dataframe to reduce memory usage.
        high_precision (bool): If True, use 64-bit floats instead of 32-bit
    Returns:
        pd.DataFrame: dataframe with reduced memory usage.
    """
    start_mem = round(df.memory_usage().sum() / 1024**2, 2)
    logging.info("Memory usage of dataframe is %d MB", start_mem)

    # Iterate through columns
    for col in df.columns:
        if df[col].dtype == "object":
            # "object" dtype
            if df[col].nunique() < max(100, df.shape[0] / 100):
                # If number of unique values is less than max(100, 1%)
                df[col] = df[col].astype("category")
            else:
                # If number of unique values is greater than max(100, 1%)
                df[col] = df[col].astype("string")

        elif str(df[col].dtype)[:3] == "int":
            # "int" dtype
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                df[col] = df[col].astype("UInt8")
            elif c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype("Int8")
            elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                df[col] = df[col].astype("UInt16")
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype("Int16")
            elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                df[col] = df[col].astype("UInt32")
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype("Int32")
            elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                df[col] = df[col].astype("UInt64")
            else:
                df[col] = df[col].astype("Int64")

        elif str(df[col].dtype)[:5] == "float":
            # "float" dtype
            c_min = df[col].min()
            c_max = df[col].max()
            if (
                not high_precision
                and c_min > np.finfo(np.float32).min
                and c_max < np.finfo(np.float32).max
            ):
                df[col] = df[col].astype("float32")
            else:
                df[col] = df[col].astype("float64")

    end_mem = round(df.memory_usage().sum() / 1024**2, 2)
    logging.info("Memory usage after optimization is %d MB", end_mem)
    if start_mem > 0:
        logging.info(
            "Decreased by %d %%", round(100 * (start_mem - end_mem) / start_mem)
        )

    return df

# Chargement des fichiers

In [13]:
## Load click data

clicks = pd.concat(
    [
        pd.read_csv(
            click_file_path,
            parse_dates=["session_start", "click_timestamp"],
            date_parser=lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
            dtype={
                "user_id": "category",
                "session_id": "category",
                "session_size": "int",
                "click_article_id": "category",
                "click_environment": "category",
                "click_deviceGroup": "category",
                "click_os": "category",
                "click_country": "category",
                "click_region": "category",
                "click_referrer_type": "category",
            },
        ).replace(
            {
                "click_environment": {
                    "1": "1 - Facebook Instant Article",
                    "2": "2 - Mobile App",
                    "3": "3 - AMP (Accelerated Mobile Pages)",
                    "4": "4 - Web",
                },
                "click_deviceGroup": {
                    "1": "1 - Tablet",
                    "2": "2 - TV",
                    "3": "3 - Empty",
                    "4": "4 - Mobile",
                    "5": "5 - Desktop",
                },
                "click_os": {
                    "1": "1 - Other",
                    "2": "2 - iOS",
                    "3": "3 - Android",
                    "4": "4 - Windows Phone",
                    "5": "5 - Windows Mobile",
                    "6": "6 - Windows",
                    "7": "7 - Mac OS X",
                    "8": "8 - Mac OS",
                    "9": "9 - Samsung",
                    "10": "10 - FireHbbTV",
                    "11": "11 - ATV OS X",
                    "12": "12 - tvOS",
                    "13": "13 - Chrome OS",
                    "14": "14 - Debian",
                    "15": "15 - Symbian OS",
                    "16": "16 - BlackBerry OS",
                    "17": "17 - Firefox OS",
                    "18": "18 - Android",
                    "19": "19 - Brew MP",
                    "20": "20 - Chromecast",
                    "21": "21 - webOS",
                    "22": "22 - Gentoo",
                    "23": "23 - Solaris",
                },
            }
        )
        for click_file_path in tqdm(
            sorted(Path(DATA_PATH, "clicks").glob("clicks_hour_*.csv"))
        )
    ],
    sort=False,
    ignore_index=True,
    verify_integrity=True,
)

clicks = reduce_dataframe_memory_usage(
    clicks.astype(
        {"session_start": "datetime64[ns]", "click_timestamp": "datetime64[ns]"}
    )
)

clicks.describe(include="all", datetime_is_numeric=True)

100%|██████████| 385/385 [00:26<00:00, 14.52it/s]


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,2988181.0,2988181.0,2988181,2988181.0,2988181.0,2988181,2988181,2988181,2988181,2988181.0,2988181.0,2988181.0
unique,322897.0,1048594.0,,,46033.0,,3,5,8,11.0,28.0,7.0
top,5890.0,1507563657895091.0,,,160974.0,,4 - Web,1 - Tablet,17 - Firefox OS,1.0,25.0,2.0
freq,1232.0,124.0,,,37213.0,,2904478,1823162,1738138,2852406.0,804985.0,1602601.0
mean,,,2017-10-08 14:17:08.013157120,3.901885,,2017-10-08 14:51:05.106516736,,,,,,
min,,,2017-10-01 02:37:03,2.0,,2017-10-01 03:00:00,,,,,,
25%,,,2017-10-04 13:35:52,2.0,,2017-10-04 14:20:52,,,,,,
50%,,,2017-10-08 20:09:00,3.0,,2017-10-08 20:35:30,,,,,,
75%,,,2017-10-11 19:16:54,4.0,,2017-10-11 19:43:24,,,,,,
max,,,2017-10-17 03:36:19,124.0,,2017-11-13 20:04:14,,,,,,


# Modélisation

## Création du rating

### Count implicite

In [14]:
## Create implicit ratings

ratings = (
    clicks.reset_index()
    .groupby(["user_id", "click_article_id"])
    .agg(
        rating=("index", "count"),
    )
    .reset_index()
)

ratings

Unnamed: 0,user_id,click_article_id,rating
0,0,157541,1
1,0,160158,1
2,0,233470,1
3,0,313996,1
4,0,68866,1
...,...,...,...
2950705,99998,64329,1
2950706,99999,168784,1
2950707,99999,225055,1
2950708,99999,272143,1


In [15]:
ratings['rating'].unique()

array([ 1,  2,  3, 13,  5,  4,  6,  8, 12,  7,  9, 33, 10, 30, 31, 17, 16])

### Count pondéré

In [16]:
def get_ratings_from_clicks(clicks):
    count_user_article_clicks = (
        clicks.reset_index()
        .groupby(["user_id", "click_article_id"])
        .agg(
            COUNT_user_article_clicks=("index", "count"),
        )
    )

    count_user_clicks = (
        clicks.reset_index()
        .groupby(["user_id"])
        .agg(
            COUNT_user_clicks=("index", "count"),
        )
    )

    ratings = count_user_article_clicks.join(count_user_clicks, on="user_id")
    ratings["rating"] = (
        ratings["COUNT_user_article_clicks"] / ratings["COUNT_user_clicks"]
    )

    ratings = reduce_dataframe_memory_usage(
        ratings["rating"]
        .reset_index()
        .rename({"click_article_id": "article_id"}, axis=1)
    )

    return ratings


ratings = get_ratings_from_clicks(clicks)

ratings

Unnamed: 0,user_id,article_id,rating
0,0,157541,0.125000
1,0,160158,0.125000
2,0,233470,0.125000
3,0,313996,0.125000
4,0,68866,0.125000
...,...,...,...
2950705,99998,64329,0.071429
2950706,99999,168784,0.250000
2950707,99999,225055,0.250000
2950708,99999,272143,0.250000


## Encodage et formatage

In [19]:
enc = OrdinalEncoder(dtype="int")


ratings_enc = pd.DataFrame(
    enc.fit_transform(ratings[["user_id", "article_id"]]),
    columns=["user_idx", "article_idx"],
)
ratings_enc["rating"] = ratings["rating"]


ratings_sparse = sparse.csr_matrix(
    (
        ratings_enc["rating"].values,
        (ratings_enc["user_idx"].values, ratings_enc["article_idx"].values),
    ),
    shape=(
        ratings_enc["user_idx"].max() + 1,
        ratings_enc["article_idx"].max() + 1,
    ),
)

ratings_sparse

<322897x46033 sparse matrix of type '<class 'numpy.float32'>'
	with 2950710 stored elements in Compressed Sparse Row format>

## Entrainement du modèle

In [20]:
model = AlternatingLeastSquares()
model.fit(ratings_sparse)

  0%|          | 0/15 [00:00<?, ?it/s]

## Test du modèle

In [21]:
model.recommend(5890, ratings_sparse[5890], N=10)

(array([25037,  7646, 13471, 10601,  7657, 29034, 25224,  3858, 24929,
         8571], dtype=int32),
 array([0.02005151, 0.015251  , 0.01478209, 0.01457605, 0.01430848,
        0.0128094 , 0.01267883, 0.01226072, 0.01174724, 0.01170805],
       dtype=float32))

## Liaison Database

In [23]:
client = CosmosClient(AZURE_COSMOSDB_URI, AZURE_COSMOSDB_KEY)
database = client.create_database_if_not_exists(id="recos")
container = database.create_container_if_not_exists(
    id="UserRecos",
    partition_key=PartitionKey(path="/id"),
)

## Sauvegarde des recommendations

In [None]:
user_ids = ratings["user_id"].unique().astype("int")
recos, _ = model.recommend(user_ids, ratings_sparse[user_ids], N=10)

for i, user_id in tqdm(enumerate(user_ids)):
    container.upsert_item(
        body={
            "id": str(user_id),
            "articles": [str(article_id) for article_id in recos[i]],
        }
    )

147649it [3:24:30, 12.12it/s]