In [3]:
import os
import shutil

import pandas as pd
import requests
from sklearn.utils import Bunch
from tqdm.auto import tqdm

In [4]:
def get_data_dir():
    """Return the default location of the scikit-uplift data directory.

    Dataset loaders cache downloaded files in this folder so that the same
    data is not fetched more than once.

    The location is a folder named ``scikit-uplift-data`` inside the current
    user's home folder.

    Returns:
        string: The path to scikit-uplift data dir.

    """
    home_dir = os.path.expanduser("~")
    return os.path.join(home_dir, "scikit-uplift-data")

In [5]:
def _create_data_dir(path):
    """Creates a directory, which stores the datasets.

    Args:
        path (str): The path to scikit-uplift data dir.

    """
    if not os.path.isdir(path):
        os.makedirs(path)

In [6]:
def _download(url, dest_path, content_length_header_key='Content-Length'):
    """Download the file from url and save it locally.

    Args:
        url (str): URL address, must be a string.
        dest_path (str): Destination of the file.
        content_length_header_key (str): The key in the HTTP response headers that lists the response size in bytes.
            Used for progress bar.
    """
    if isinstance(url, str):
        req = requests.get(url, stream=True)
        req.raise_for_status()

        with open(dest_path, "wb") as fd:
            total_size_in_bytes = int(req.headers.get(content_length_header_key, 0))
            progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
            for chunk in req.iter_content(chunk_size=2 ** 20):
                progress_bar.update(len(chunk))
                fd.write(chunk)
    else:
        raise TypeError("URL must be a string")

In [7]:
def _get_data(data_home, url, dest_subdir, dest_filename, download_if_missing,
              content_length_header_key='Content-Length'):
    """Locate the dataset file on disk, downloading it first when allowed.

    Args:
        data_home (str): The path to scikit-uplift data dir. When None, the default data dir is used.
        url (str): The URL to the dataset.
        dest_subdir (str): The name of the folder in which the dataset is stored. When None, the dataset is
            stored directly in the data dir.
        dest_filename (str): The name of the dataset.
        download_if_missing (bool): If False, raise a IOError if the data is not locally available instead of
            trying to download the data from the source site.
        content_length_header_key (str): The key in the HTTP response headers that lists the response size in bytes.
            Used for progress bar.

    Returns:
        string: The path to the dataset.

    Raises:
        IOError: If the file is not available locally and ``download_if_missing`` is False.

    """
    # Resolve the root folder first, then optionally descend into the subfolder.
    root_dir = get_data_dir() if data_home is None else os.path.abspath(data_home)
    data_dir = root_dir if dest_subdir is None else os.path.join(root_dir, dest_subdir)

    _create_data_dir(data_dir)

    dest_path = os.path.join(data_dir, dest_filename)

    # Cached copy available: nothing else to do.
    if os.path.isfile(dest_path):
        return dest_path

    if not download_if_missing:
        raise IOError("Dataset missing")

    _download(url, dest_path, content_length_header_key)
    return dest_path

In [8]:
def clear_data_dir(path=None):
    """Remove the data home cache directory together with everything in it.

    Nothing happens when the target is not an existing directory. Errors
    raised while removing the tree are ignored.

    Args:
        path (str): The path to scikit-uplift data dir. When None, the default data dir is used.

    """
    target_dir = get_data_dir() if path is None else path
    if not os.path.isdir(target_dir):
        return
    shutil.rmtree(target_dir, ignore_errors=True)

In [9]:
def fetch_dataset(target_col='conversion', data_home=None, dest_subdir=None, download_if_missing=True,
                  return_X_y_t=False):
    """Load and return the dataset for a promotion marketing campaign.

    Context

    Marketing Promotion Campaign
    with a total of 6,400 customers data.

    Content

    This dataset shows customers' brief information,
    historical use of discount or BOGO (Buy One Get One) promotion,
    the offer that has been made, and the conversion result (buy or not).
    The conversion average value = $25

    Acknowledgements

    This dataset is a fictional dataset for practicing purposes.

    Inspiration

    Predict customer's conversion rate.
    Uplift modelling to maximize marketing campaign effect and reduce campaign cost.

    Major columns:

    * ``conversion`` (binary): target. 1/0 indicator, 1 = Customer purchased merchandise in the following two weeks.
    * ``offer`` (str): treatment. The campaign the customer received (Discount/Buy One Get One/No Offer)

    Read more in the :ref:`docs <dataset>`.

    Args:
        target_col (string, 'conversion', 'offer' or 'all', default='conversion'): Selects which column from dataset
            will be target. If 'all', both the ``offer`` and ``conversion`` columns are returned as target.
        data_home (str): The path to the folder where datasets are stored.
        dest_subdir (str): The name of the folder in which the dataset is stored.
        download_if_missing (bool): Download the data if not present. Raises an IOError if False and data is missing.
        return_X_y_t (bool, default=False): If True, returns (data, target, treatment) instead of a Bunch object.

    Returns:
        Bunch or tuple: dataset.

        Bunch:
            By default dictionary-like object, with the following attributes:

                * ``data`` (DataFrame object): Dataset without target and treatment.
                * ``target`` (Series or DataFrame object): Column target by values.
                * ``treatment`` (Series object): Column treatment by values.
                * ``feature_names`` (list): Names of the features.
                * ``target_name`` (str or list): Name of the target.
                * ``treatment_name`` (str): Name of the treatment.

        Tuple:
            tuple (data, target, treatment) if `return_X_y_t` is True

    Raises:
        ValueError: If ``target_col`` is not one of the accepted values.

    References:
        https://www.kaggle.com/davinwijaya/customer-retention

    Example::

        from sklift.datasets import fetch_dataset


        dataset = fetch_dataset(target_col='conversion')
        data, target, treatment = dataset.data, dataset.target, dataset.treatment

        # alternative option
        data, target, treatment = fetch_dataset(target_col='conversion', return_X_y_t=True)

    See Also:

        :func:`.fetch_lenta`: Load and return the Lenta dataset (classification).

        :func:`.fetch_x5`: Load and return the X5 RetailHero dataset (classification).

        :func:`.fetch_criteo`: Load and return the Criteo Uplift Prediction Dataset (classification).

        :func:`.fetch_megafon`: Load and return the MegaFon Uplift Competition dataset (classification)
    """
    treatment_col = 'offer'
    # NOTE(review): 'offer' is also the treatment column; it is kept as an
    # accepted target for backward compatibility - confirm this is intended.
    target_cols = ['offer', 'conversion']

    # Validate before touching the disk or the network.
    if target_col == 'all':
        target_col = target_cols
    elif target_col not in target_cols:
        raise ValueError(f"The target_col must be an element of {target_cols + ['all']}. "
                         f"Got value target_col={target_col}.")

    url = 'https://scikit-uplift-material.s3.eu-north-1.amazonaws.com/data.csv.zip'
    filename = url.split('/')[-1]
    csv_path = _get_data(data_home=data_home, url=url, dest_subdir=dest_subdir,
                         dest_filename=filename,
                         download_if_missing=download_if_missing)

    data = pd.read_csv(csv_path)
    treatment, target = data[treatment_col], data[target_col]

    # Drop every target/treatment column exactly once; the treatment column is
    # already part of ``target_cols``, so de-duplicate while keeping the order.
    non_feature_cols = list(dict.fromkeys(target_cols + [treatment_col]))
    data = data.drop(non_feature_cols, axis=1)

    if return_X_y_t:
        return data, target, treatment

    feature_names = list(data.columns)

    # NOTE: no ``DESCR`` attribute is attached to the Bunch.
    return Bunch(data=data, target=target, treatment=treatment,
                 feature_names=feature_names, target_name=target_col, treatment_name=treatment_col)