#### Tracking your datasets for the data science lab (examples)


#### Initializing your tracker (PyPads)
First, install pypads-padre, which pulls in pypads as a dependency:

    pip install pypads-padre

In [1]:
# PyPads is the tracker entry point; load_dotenv comes from python-dotenv.
from pypads.app.base import PyPads
from dotenv import load_dotenv
# Presumably loads settings (e.g. credentials for the tracking server) from a
# local .env file -- TODO confirm which variables are expected.
load_dotenv()
# Create the tracker against the remote MLflow server. With autostart=False
# no run is started here (presumably); the examples in this notebook start
# their runs explicitly via tracker.start_track(...).
tracker = PyPads(uri="http://mlflow.padre-lab.eu", autostart=False)

In [None]:
# TODO: load the datasets and test that loading works
# TODO: run random tests on the datasets with splitting

#### 3D MNIST example

In [1]:
# Start a tracked run for this example under the "3D MNIST" experiment.
# NOTE(review): tracking is started before the imports below; PyPads
# presumably needs to be active before the libraries it instruments are
# imported, so keep this order unless confirmed otherwise.
tracker.start_track(experiment_name= "3D MNIST")
import h5py
import numpy as np


# Loading and tracking your dataset
# HDF5 file read by load_3d_mnist (path is relative to the working directory).
path = "data/3d-mnist/full_dataset_vectors.h5"
@tracker.decorators.dataset(name="3DMNIST", target_columns=[-1])
def load_3d_mnist(path):
    """
    Load the 3D MNIST dataset from an HDF5 file as one 2D array.

    The aim of this dataset is to provide a simple way to get started with 3D computer vision problems such as 3D shape recognition.

    Accurate 3D point clouds can (easily and cheaply) be acquired nowadays from different sources:

     - RGB-D devices: Google Tango, Microsoft Kinect, etc.
     - Lidar.
     - 3D reconstruction from multiple images.

    However, there is a lack of large 3D datasets (you can find a good one here based on triangular meshes); it's especially hard to find datasets based on point clouds (which is the raw output from every 3D sensing device).

    This dataset contains 3D point clouds generated from the original images of the MNIST dataset to bring a familiar introduction to 3D to people used to working with 2D datasets (images).

    The full dataset is split into arrays:

    X_train (10000, 4096)
    y_train (10000)
    X_test (2000, 4096)
    y_test (2000)

    The returned data is the concatenation of X_train, X_test, y_train, y_test (12000, 4097):
    the labels are appended as the last column, and the train rows come before the test rows.

    :param path: location of the HDF5 file
    :return: the combined data array
    """
    # (features key, labels key) for each split, in the order the rows are stacked.
    split_keys = (("X_train", "y_train"), ("X_test", "y_test"))
    blocks = []
    with h5py.File(path, "r") as hf:
        for features_key, labels_key in split_keys:
            features = hf[features_key][:]
            labels = hf[labels_key][:]
            # Turn the labels into a single column so they can sit next to the features.
            label_column = labels.reshape(len(labels), 1)
            blocks.append(np.concatenate([features, label_column], axis=1))
    # Stack the splits row-wise: train first, then test.
    return np.concatenate(blocks, axis=0)


# Load the full dataset; the call goes through the dataset decorator on load_3d_mnist.
data = load_3d_mnist(path)

# Tracking your train and test splits: in this simple case we keep the original
# partition of the data -- train: the first 10000 rows, test: the remaining 2000 rows.

@tracker.decorators.splitter()
def splitter(data, index=10000):
    """
    Split the row indices of ``data`` into a leading and a trailing part.

    :param data: object with a ``shape`` attribute; only ``data.shape[0]`` is used
    :param index: position of the cut; row indices before it form the first part
    :return: tuple ``(train_idx, test_idx)`` of index arrays
    """
    import numpy as np

    n_rows = data.shape[0]
    row_ids = np.arange(n_rows)
    head = row_ids[:index]
    tail = row_ids[index:]
    return head, tail

# Apply the tracked splitter to get the train/test row indices.
train, test = splitter(data)
# Do and log stuff
from sklearn.linear_model import LogisticRegression
# Import f1_score from the public sklearn.metrics namespace: the
# sklearn.metrics.classification module was deprecated and later removed.
from sklearn.metrics import f1_score

# The last column holds the label; all other columns are features.
X_train, y_train = data[train, :-1], data[train, -1]
X_test, y_test = data[test, :-1], data[test, -1]

model = LogisticRegression()

model.fit(X_train, y_train)

preds = model.predict(X_test)

# f1_score expects (y_true, y_pred) in that order. The target is multiclass
# (digit labels), so an averaging mode must be given explicitly: the default
# average="binary" raises a ValueError for multiclass targets.
f1 = f1_score(y_test, preds, average="macro")

print("F1_score: ", str(f1))

# end_run
tracker.api.end_run()

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([5, 5, 0, ..., 1, 2, 2]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([7, 7, 2, ..., 8, 9, 9]))

In [3]:
#### Covid-19 tweets example
# Start a tracked run for this example under the "Covid-19 Tweets" experiment.
# NOTE(review): tracking is started before the imports below; PyPads
# presumably needs to be active before the libraries it instruments are
# imported, so keep this order unless confirmed otherwise.
tracker.start_track(experiment_name= "Covid-19 Tweets")
import pandas as pd
from pandas_profiling import ProfileReport

# Loading and tracking your dataset
# CSV file read by load_covid_tweets (path is relative to the working directory).
path = "data/covid-19-tweets/covid19_tweets.csv"
@tracker.decorators.dataset(name="Covid-19 Tweets")
def load_covid_tweets(path):
    """
    Read the Covid-19 tweets CSV file into a pandas DataFrame.

    These tweets were collected using the Twitter API and a Python script.
    A query for the high-frequency hashtag #covid19 is run on a daily basis for a certain time period, to collect a larger number of tweet samples.
    The collection script can be found here: https://github.com/gabrielpreda/covid-19-tweets

    :param path: location of the CSV file
    :return: the DataFrame produced by ``pd.read_csv``
    """
    return pd.read_csv(path)


# Load the tweets; the call goes through the dataset decorator on load_covid_tweets.
df = load_covid_tweets(path)

# Do and log stuff
# Build a profiling report object for the loaded DataFrame.
profile = ProfileReport(df, title="Pandas Profiling Report")

# end_run
# Close the run opened by tracker.start_track for this example.
tracker.api.end_run()


[3, 4]

In [None]:
# Bare expression as the last statement of the cell: in a notebook this displays the report.
profile