In [None]:
%%capture
!pip install --upgrade ipython

In [None]:
# NOTE(review): presumably this ends the current kernel so that it restarts
# with the IPython version upgraded in the previous cell — confirm.
exit()

In [None]:
%%capture
!pip install lineapy

In [None]:
# Load the lineapy IPython extension into this kernel session.
%load_ext lineapy

In [1]:
import lineapy

# Use LineaPy to create a simple ML/data pipeline

## Scenario

You are a data scientist or a data analyst who 
* creates a working pipeline that updates a dashboard with the latest data
* creates a data pipeline to prepare data for downstream model development 

You expect to **re-execute the pipeline** on a regular or ad-hoc basis.

However, you may not have the engineering support to set up the pipeline for you, or the pipeline is not meant for production (e.g., data preparation for your experiment).

## What might happen next?

* You spend a lot of time copying and pasting your code to make your pipeline work in orchestration systems or job schedulers (cron, Apache Airflow, Prefect, ...).
* It takes forever to get your pipeline working, and you end up setting a physical alarm to remind yourself to manually execute your pipeline whenever you need it. 
* You make a change to your pipeline and find out that you need to go through the above process all over again.

## What problem is LineaPy trying to solve here?

LineaPy helps you set up a pipeline with minimal manual work, using just three lines of code.

```
import lineapy

........................
.
. your original notebook
. 
........................

lineapy.save(object, 'artifact name')
lineapy.to_pipeline(['artifact name'], )

..........................


```

This enables data scientists to produce a runnable pipeline directly from their notebook.
For some orchestration systems, like Apache Airflow, LineaPy is able to create and update the pipeline without any manual operation.


```
import lineapy

........................
.
. your original notebook
. 
........................

artifact = lineapy.save()  # Return an artifact object
artifact.to_pipeline()
```



## What will we learn in the rest of the notebook?

In this demo, we are going to load the four iris features into a data frame and run an aggregation to mimic the dashboard-updating process.

During this process, we will use a minimal example to create a LineaPy artifact, then we will create a pipeline that produces the artifact. 

In the end, we will demonstrate 

* How to use `to_pipeline()` to:
  * create a native pipeline as a Python script and set it up in crontab.
  * create a native Airflow DAG.

This demonstrates that, with its pipeline-creation feature, LineaPy can help data scientists save time on pipeline management and focus more on insight generation.



## Set up the demo

In [2]:
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import pandas as pd

# Load the iris dataset bundled with scikit-learn.
iris = load_iris()
# Build a frame from the feature matrix (one column per feature name) and add
# a `target` column that maps each integer label in iris.target to its class
# name in iris.target_names.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=[iris.target_names[i] for i in iris.target])
# Preview the first rows.
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:

# Summary statistics (count, mean, std, min, quartiles, max) of every feature
# column, computed separately for each value of `target`.
iris_agg = df.groupby('target').describe()
# Display the aggregated frame as the cell output.
iris_agg

Unnamed: 0_level_0,sepal length (cm),sepal length (cm),sepal length (cm),sepal length (cm),sepal length (cm),sepal length (cm),sepal length (cm),sepal length (cm),sepal width (cm),sepal width (cm),...,petal length (cm),petal length (cm),petal width (cm),petal width (cm),petal width (cm),petal width (cm),petal width (cm),petal width (cm),petal width (cm),petal width (cm)
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
setosa,50.0,5.006,0.35249,4.3,4.8,5.0,5.2,5.8,50.0,3.428,...,1.575,1.9,50.0,0.246,0.105386,0.1,0.2,0.2,0.3,0.6
versicolor,50.0,5.936,0.516171,4.9,5.6,5.9,6.3,7.0,50.0,2.77,...,4.6,5.1,50.0,1.326,0.197753,1.0,1.2,1.3,1.5,1.8
virginica,50.0,6.588,0.63588,4.9,6.225,6.5,6.9,7.9,50.0,2.974,...,5.875,6.9,50.0,2.026,0.27465,1.4,1.8,2.0,2.3,2.5


In [4]:
# Run this cell whenever you need to save the artifact.

# Save the aggregated frame with LineaPy under the name 'iris agg'; the value
# returned by lineapy.save is kept in `artifact`.
artifact = lineapy.save(iris_agg, 'iris agg')

## Create a pipeline from an artifact





In [5]:
# Generate the pipeline files for the 'iris agg' artifact using the SCRIPT
# framework; the call's return value (shown as the cell output) is the
# output directory.
lineapy.to_pipeline(
    artifacts=['iris agg'],
    framework='SCRIPT',
    pipeline_name='iris_aggregation_script_pipeline',
    output_dir='python_script',
)

PosixPath('python_script')

In [6]:
%%sh
# List the files generated in python_script/, then print the pipeline module
# and the script that runs it.
# Blank separator lines use printf: `echo '\n'` only expands the escape in
# some shells (e.g. dash) and prints a literal \n in others (e.g. bash).
echo 'validate lineapy has created all pipeline required files'
echo '--------------------------------------------------------------------'
ls -ltrh python_script
printf '\n\n'
echo '--------------------------------------------------------------------'
echo 'python module: iris_aggregation_script_pipeline'
echo '--------------------------------------------------------------------'
cat python_script/iris_aggregation_script_pipeline.py
printf '\n\n'
echo '--------------------------------------------------------------------'
echo 'pipeline script: iris_aggregation_script_pipeline_script_dag'
echo '--------------------------------------------------------------------'
cat python_script/iris_aggregation_script_pipeline_script_dag.py
echo '--------------------------------------------------------------------'


validate lineapy has created all pipeline required files
--------------------------------------------------------------------
total 0
-rwxrwxrwx 1 mlee mlee 281 May 11 17:14 iris_aggregation_script_pipeline.py
-rwxrwxrwx 1 mlee mlee 368 May 11 17:14 iris_aggregation_script_pipeline_Dockerfile
-rwxrwxrwx 1 mlee mlee  89 May 11 17:14 iris_aggregation_script_pipeline_requirements.txt
-rwxrwxrwx 1 mlee mlee 128 May 11 17:14 iris_aggregation_script_pipeline_script_dag.py


--------------------------------------------------------------------
python module: iris_aggregation_script_pipeline
--------------------------------------------------------------------
import pandas as pd
from sklearn.datasets import load_iris


def iris_agg():
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
        target=[iris.target_names[i] for i in iris.target]
    )
    iris_agg = df.groupby("target").describe()


----------------------------------------------------------

# Execute the pipeline from the command line

```bash
python python_script/iris_aggregation_script_pipeline_script_dag.py
```

In [7]:
# Generate the pipeline files for the 'iris agg' artifact using the AIRFLOW
# framework; the call's return value (shown as the cell output) is the
# output directory.
lineapy.to_pipeline(
    artifacts=['iris agg'],
    framework='AIRFLOW',
    pipeline_name='iris_aggregation_airflow_pipeline',
    output_dir='iris_airflow',
)

PosixPath('iris_airflow')

In [9]:
%%sh
# List the files generated in iris_airflow/, then print the pipeline module
# and the Airflow DAG file.
# The section labels name the Airflow files that are actually printed below
# (they previously repeated the labels of the SCRIPT pipeline cell).
# Blank separator lines use printf: `echo '\n'` only expands the escape in
# some shells (e.g. dash) and prints a literal \n in others (e.g. bash).
echo 'validate lineapy has created all pipeline required files'
echo '--------------------------------------------------------------------'
ls -ltrh iris_airflow
printf '\n\n'
echo '--------------------------------------------------------------------'
echo 'python module: iris_aggregation_airflow_pipeline'
echo '--------------------------------------------------------------------'
cat iris_airflow/iris_aggregation_airflow_pipeline.py
printf '\n\n'
echo '--------------------------------------------------------------------'
echo 'pipeline script: iris_aggregation_airflow_pipeline_dag'
echo '--------------------------------------------------------------------'
cat iris_airflow/iris_aggregation_airflow_pipeline_dag.py
echo '--------------------------------------------------------------------'


validate lineapy has created all pipeline required files
--------------------------------------------------------------------
total 4.0K
-rwxrwxrwx 1 mlee mlee 281 May 11 17:14 iris_aggregation_airflow_pipeline.py
-rwxrwxrwx 1 mlee mlee 370 May 11 17:14 iris_aggregation_airflow_pipeline_Dockerfile
-rwxrwxrwx 1 mlee mlee  89 May 11 17:14 iris_aggregation_airflow_pipeline_requirements.txt
-rwxrwxrwx 1 mlee mlee 578 May 11 17:14 iris_aggregation_airflow_pipeline_dag.py


--------------------------------------------------------------------
python module: iris_aggregation_script_pipeline
--------------------------------------------------------------------
import pandas as pd
from sklearn.datasets import load_iris


def iris_agg():
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
        target=[iris.target_names[i] for i in iris.target]
    )
    iris_agg = df.groupby("target").describe()


----------------------------------------------------------

lineapy.to_pipeline(artifacts=['iris agg'], framework='SCRIPT', pipeline_name='iris_aggregation_script_pipeline', output_dir='python_script')