This is a local, non-Kaggle notebook. TFX 1.16.0, Python 3.10, and compatible versions of the other libraries are installed in the virtual environment that this notebook runs in.

Paths are relative to the GitHub repository directory, "recommender_systems".

In [None]:
import shutil

from tfx.orchestration import metadata

import tensorflow_transform as tft

from ml_metadata.proto import metadata_store_pb2
from ml_metadata.metadata_store import metadata_store

import sys
import os
# Make the project's test and main source directories importable. Both paths
# are built from the current working directory, so the notebook has to be
# started from the repository directory for these imports to resolve.
sys.path.append(os.path.join(os.getcwd(), "src/test/python/movie_lens_tfx"))
sys.path.append(os.path.join(os.getcwd(), "src/main/python/movie_lens_tfx"))

# Star imports: later cells use names that are not imported explicitly
# (e.g. get_test_data, get_bin_dir, deserialize, PipelineComponentsFactory,
# N_GENRES, tfx, tf).
# NOTE(review): presumably all of those come from these three modules -- confirm.
from helper import *
from movie_lens_tfx.PipelineComponentsFactory import *
from movie_lens_tfx.tune_train_movie_lens import *

# Quiet the logs: stop the TensorFlow logger from propagating records to its
# parent handlers and raise the absl thresholds to WARNING.
from absl import logging
tf.get_logger().propagate = False
logging.set_verbosity(logging.WARNING)
logging.set_stderrthreshold(logging.WARNING)

## EDA on the raw data

### w/ Polars, Seaborn, and Matplotlib

In [None]:
import polars as pl
#import matplotlib.pyplot as plt
#seaborn version installed is 0.12.2.  need>= 0.13.0 for polars
#import seaborn as sns
#import seaborn_polars as snl
from scipy.stats.distributions import chi2
from collections import OrderedDict
import re
import io
from datetime import datetime
import pytz
import dcor
import numpy as np
#import altair as alt
import plotly.express as px
#needs pip install plotly jupyterlab anywidget

# Let polars print long string values (up to 900 characters) instead of
# truncating them when a DataFrame is displayed.
pl.Config.set_fmt_str_lengths(900)



In [None]:
def can_reject_indep(x : np.ndarray, y: np.ndarray, alpha:float = 0.05, debug:bool=False):
  """
  Decide whether independence of x and y can be rejected at level alpha.

  reject independence for
    n*C >= inv(F{chi^2-1})(1-alpha)
    where n = len(x)
      C = fast distance covariance following 2019 Chaudhuri and Hu
      inv(F{chi^2-1}) is the inverse of the CDF.

  Args:
    x: float array; 1-D, or 2-D with the samples along the first axis
    y: float array (presumably with the same number of samples as x)
    alpha: significance level used for the chi-square quantile
    debug: when True, print both sides of the comparison
  Returns:
    True when n*C is at least the chi-square quantile, i.e. when
    independence can be rejected; False otherwise.
  """
  # NOTE(review): the statistic uses the value returned by
  # dcor.distance_covariance as-is; confirm that this matches the
  # normalization expected by the referenced test.
  with np.errstate(divide='ignore'):
    C = dcor.distance_covariance(x, y, method='mergesort')
  lhs = len(x)*C
  # Degrees of freedom: the size of the last axis for 2-D input (1 for a
  # column vector).  A 1-D array is a single variable, so it gets 1 as
  # well; taking shape[-1] there would wrongly use the sample count.
  dof = np.shape(x)[-1] if np.ndim(x) > 1 else 1
  rhs = chi2.ppf(1 - alpha, df=dof)
  if debug:
    print(f"nC={lhs}\nppf(1-{alpha}, dof={dof})={rhs}")
  return lhs >= rhs

In [None]:
# Time zone object for "America/Chicago".
CTZ = pytz.timezone("America/Chicago")

# Genre names ("Children" is spelled without the trailing "'s").
genres = [
    "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
    "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
    "Thriller", "War", "Western",
]

# Column names and dtypes, in column order, for each raw input file.
schemas = {
    'ratings': pl.Schema(OrderedDict([
        ('user_id', pl.Int64),
        ('movie_id', pl.Int64),
        ('rating', pl.Int64),
        ('timestamp', pl.Int64),
    ])),
    'users': pl.Schema(OrderedDict([
        ('user_id', pl.Int64),
        ('gender', pl.String),
        ('age', pl.Int64),
        ('occupation', pl.Int64),
        ('zipcode', pl.String),
    ])),
    'movies': pl.Schema(OrderedDict([
        ('movie_id', pl.Int64),
        ('title', pl.String),
        ('genres', pl.String),
    ])),
}

# Only the first value returned by get_test_data is needed here: the
# serialized description of the input files.
_infiles_dict_ser, _, __ = get_test_data(use_small=False)
_infiles_dict = deserialize(_infiles_dict_ser)

# Location of each raw input file, keyed like `schemas`.
file_paths = {
    name: _infiles_dict[name]['uri']
    for name in ('ratings', 'users', 'movies')
}

#polars.read_csv( source=
#  encoding='iso-8859-1',
#  has_header=False, skip_rows=0, try_parse_dates=True,
#  use_pyarrow=True

# Integer code -> display label for the categorical user features.
labels_dict = {
    'age_group': {0: '1', 1: '18', 2: '25', 3: '35', 4: '45', 5: '50', 6: '56'},
    'gender': {0: 'F', 1: 'M'},
    'occupation': {
        0: "other",
        1: "academic/educator",
        2: "artist",
        3: "clerical/admin",
        4: "college/grad student",
        5: "customer service",
        6: "doctor/health care",
        7: "executive/managerial",
        8: "farmer",
        9: "homemaker",
        10: "K-12 student",
        11: "lawyer",
        12: "programmer",
        13: "retired",
        14: "sales/marketing",
        15: "scientist",
        16: "self-employed",
        17: "technician/engineer",
        18: "tradesman/craftsman",
        19: "unemployed",
        20: "writer",
    },
}

# The same labels as plain lists, in code order.
labels_dict_arrays = {
    name: list(code_to_label.values())
    for name, code_to_label in labels_dict.items()
}

In [None]:
# Univariate and pairwise EDA of the raw ratings / users / movies files.
# Every figure is written to a PNG file under img_dir instead of being shown
# inline, to conserve memory.
# kaleido is presumably what plotly's fig.write_image needs for static image
# export -- confirm.
!pip install -U -q kaleido
img_dir = os.path.join(get_bin_dir(), "local_notebook", "images")
os.makedirs(img_dir, exist_ok=True)

for key in file_paths:
    # The raw files separate fields with '::'. Rewrite each line into an
    # in-memory buffer with a tab in place of '::' so that polars can parse
    # it with a single-character separator.
    processed_buffer = io.StringIO()
    file_path = file_paths[key]
    schema = schemas[key]
    print(f"key={key}, file_path={file_path}")
    with open(file_path, "r", encoding='iso-8859-1') as file:
        for line in file:
            line2 = line.replace('::', '\t')
            processed_buffer.write(line2)

    # Rewind the buffer before handing it to the reader.
    processed_buffer.seek(0)
    df = pl.read_csv(processed_buffer,
        encoding='iso-8859-1', has_header=False,
        skip_rows=0, separator='\t', schema=schema,
        try_parse_dates=True,
        new_columns=schema.names(),
        use_pyarrow=True)

    if key=="movies":
        # Normalize "Children's" to "Children", split the '|'-joined genres
        # string into a list, then explode to one row per (movie, genre) so
        # that the genres can be counted.
        df = df.with_columns(
          pl.col("genres").str.replace("Children's", "Children")
        )
        df = df.with_columns(
          pl.col("genres").str.split("|")
        )
        movie_genres = df.explode('genres')
        ordered_genres = movie_genres['genres'].value_counts().sort('count', descending=True)
        fig = px.bar(ordered_genres, x="genres", y="count", title="genres histogram",)
        fig.write_image(os.path.join(img_dir, "genres_hist.png"))
    if key=="ratings":
        #user_id, movie_id, rating, timestamp
        # one histogram per column
        fig = px.histogram(df, x='rating', title='rating')
        fig.write_image(os.path.join(img_dir, "rating_hist.png"))
        fig = px.histogram(df, x='timestamp', title='timestamp')
        fig.write_image(os.path.join(img_dir, "timestamp_hist.png"))
        fig = px.histogram(df, x='movie_id', title='movie_id')
        fig.write_image(os.path.join(img_dir, "movieid_hist.png"))
        fig = px.histogram(df,  x='user_id', title='user_id')
        fig.write_image(os.path.join(img_dir, "userid_hist.png"))
        #run the independence tests on transformed data instead of raw data
        #x = df.select(pl.col("rating")).to_numpy()
        #y = df.select(pl.col("timestamp")).to_numpy()
        #print(f"rating, timestamp are indep: {can_reject_indep(x, y, 0.05, True)}")
        # pairwise density heatmaps
        fig = px.density_heatmap(df, x='movie_id', y='rating')
        fig.write_image(os.path.join(img_dir, "movieid_rating_heatmap.png"))
        fig = px.density_heatmap(df, x='timestamp', y='rating')
        fig.write_image(os.path.join(img_dir, "timestamp_rating_heatmap.png"))
        fig = px.density_heatmap(df, x='user_id', y='rating')
        fig.write_image(os.path.join(img_dir, "userid_rating_heatmap.png"))
        fig = px.density_heatmap(df, x='timestamp', y='movie_id')
        fig.write_image(os.path.join(img_dir, "timestamp_movieid_heatmap.png"))
        #fig = px.scatter_ternary(df, a="rating", b="timestamp", c="movie_id",
        #    #size="total", size_max=15,
        #    color_discrete_map = {"rating": "blue", "timestamp": "green", "movie_id":"red"} )
        #fig.show(renderer='notebook')
        #fig = px.scatter_ternary(df, a="rating", b="user_id", c="movie_id",
        #    #size="total", size_max=15,
        #    color_discrete_map = {"rating": "blue", "user_id": "green", "movie_id":"red"} )
        #fig.show(renderer='notebook')
    if key=="users":
        #user_id, gender, age, occupation, zipcode
        fig = px.histogram(df, x='gender', title='gender')
        fig.write_image(os.path.join(img_dir, "gender_hist.png"))
        fig = px.histogram(df, x='age',  title='age')
        fig.write_image(os.path.join(img_dir, "age_hist.png"))
        # Add an "occ" column holding the occupation label for each integer
        # code (codes missing from labels_dict are kept unchanged), then make
        # it categorical for counting and plotting.
        df = df.with_columns(
            pl.col("occupation").map_elements(lambda x: labels_dict['occupation'].get(x,x)).alias("occ")
        )
        df = df.with_columns(
            pl.col("occ").cast(pl.Categorical)
        )
        ordered_occupation = df['occ'].value_counts().sort('count', descending=True)
        fig = px.bar(ordered_occupation, x="occ", y="count", title="occupation histogram",)
        fig.update_xaxes(tickangle=45)
        fig.write_image(os.path.join(img_dir, "occupation_hist.png"))
        fig = px.histogram(df, x='zipcode',  title='zipcode')
        fig.write_image(os.path.join(img_dir, "zipcode_hist.png"))
        #run the independence tests on transformed data instead of raw data
        # One density heatmap for every unordered pair of user features.
        _features=['gender', 'age', 'occupation', 'zipcode']
        for ii, feature in enumerate(_features):
            for jj in range(ii+1, len(_features)):
                feature2 = _features[jj]
                if feature2 == 'occupation':
                    # Order the rows by how common their occupation is
                    # (most frequent first) before plotting.
                    occ_counts = df.group_by("occ").len().rename({"len": "occ_count"})
                    df_sorted = (
                        df.join(occ_counts, on="occ", how="left")
                        .sort(by="occ_count", descending=True)
                        .drop("occ_count")
                    )
                    fig = px.density_heatmap(df_sorted, x=feature, y=feature2)
                else:
                    fig = px.density_heatmap(df, x=feature, y=feature2)
                fig.write_image(os.path.join(img_dir, f"{feature}_{feature2}_heatmap.png"))
                #for kk in range(jj+1, len(_features)):
                #    feature3 = _features[kk]
                #    fig = px.scatter_ternary(df, a=feature, b=feature2, c=feature3,
                #        #size="total", size_max=15,
                #        color_discrete_map = {feature: "blue", feature2: "green", feature3:"red"} )
                #    fig.show(renderer='notebook')

# Drop the reference to the last DataFrame that was loaded.
del df
print(f'wrote univariate EDA images to {img_dir}')

### run data pre-processing on full dataset

In [None]:
# Settings shared by every pipeline run in this notebook.
infiles_dict_ser, output_config_ser, split_names = get_test_data(use_small=False)
user_id_max = 6040
movie_id_max = 3952
n_genres = N_GENRES
n_age_groups = N_AGE_GROUPS
n_occupations = 21
MIN_EVAL_SIZE = 50 #make this larger for production pipeline

test_num = "1"

PIPELINE_NAME = 'TestPipelines'
output_data_dir = os.path.join(get_bin_dir(), "local_notebook", test_num)
PIPELINE_ROOT = os.path.join(output_data_dir, PIPELINE_NAME)

# remove results from previous test runs:
print(f"removing: {PIPELINE_ROOT}")
try:
  shutil.rmtree(PIPELINE_ROOT)
except FileNotFoundError:
  # nothing to clean up, e.g. on the first run
  pass
except OSError as e:
  # Cleanup stays best-effort, but a partial removal can leave an old
  # metadata store and cached outputs behind, so make the failure visible.
  print(f"WARNING: could not fully remove {PIPELINE_ROOT}: {e}")
METADATA_PATH = os.path.join(PIPELINE_ROOT, 'tfx_metadata',
                             'metadata.db')
# the directory that will hold the sqlite metadata file must exist
os.makedirs(os.path.dirname(METADATA_PATH), exist_ok=True)

ENABLE_CACHE = True

# metadata_connection_config = metadata_store_pb2.ConnectionConfig()
# metadata_connection_config.sqlite.SetInParent()
# metadata_connection = metadata.Metadata(metadata_connection_config)
metadata_connection_config = metadata.sqlite_metadata_connection_config(
  METADATA_PATH)

# handle used by later cells to query artifacts and executions
store = metadata_store.MetadataStore(metadata_connection_config)

if get_kaggle():
  tr_dir = "/kaggle/working/"
else:
  tr_dir = os.path.join(get_project_dir(), "src/main/python/movie_lens_tfx")

serving_model_dir = os.path.join(PIPELINE_ROOT, 'serving_model')
output_parquet_path = os.path.join(PIPELINE_ROOT, "transformed_parquet")

# for the custom ingestion component, the apache beam pipeline needs to be able to
# find the sibling scripts it imports.
# 2 solutions: (1) create a tar archive and use --extra_package in pipeline args
# or (2) use setup.py and --setup_file in pipeline args.

beam_pipeline_args = [
  '--direct_running_mode=multi_processing',
  '--direct_num_workers=0',
  '--setup_file=setup.py',
  #f'--extra_package={ingest_tar_file}'
]

In [None]:
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

# Interactive context bound to the notebook's pipeline root and metadata store.
context = InteractiveContext(
    pipeline_name=PIPELINE_NAME,
    pipeline_root=PIPELINE_ROOT,
    metadata_connection_config=metadata_connection_config,
    beam_pipeline_args=beam_pipeline_args,
)

# The factory arguments are positional, in the order the constructor expects.
factory = PipelineComponentsFactory(
    infiles_dict_ser, output_config_ser, tr_dir,
    user_id_max, movie_id_max, n_genres, n_age_groups,
    MIN_EVAL_SIZE, serving_model_dir, output_parquet_path,
)

components = factory.build_components(PIPELINE_TYPE.PREPROCESSING)

# Run the pre-processing components one at a time, in the order returned.
for component in components:
    context.run(component)

print('done pre-processing data')

## EDA on the transformed data

### using Polars and Plotly.express 

In [None]:
# For each split of the transformed data, compute the pairwise distance
# correlation between all columns except 'genres' and save it as a heatmap.
parquet_path = os.path.join(PIPELINE_ROOT, "transformed_parquet")
from movie_lens_tfx.utils import movie_lens_utils

for split_name in ["train", "eval", "test"]:
    in_file_pattern = os.path.join(parquet_path, f"Split-{split_name}*")
    df = pl.read_parquet(in_file_pattern)
    #df = pl.scan_parquet(in_file_pattern)
    df = df.with_columns(
        pl.col("genres").map_elements(movie_lens_utils.deserialize, return_dtype=pl.Object)
    )
    print(f"{split_name}: {df.head(5)}")

    # 'genres' holds deserialized objects rather than scalars, so it is
    # left out of the correlation matrix.
    labels = [name for name in df.columns if name != 'genres']
    # assumes every remaining column is numeric -- TODO confirm
    columns = [df[name].to_numpy().astype(np.float64) for name in labels]

    # Square, symmetric matrix: each pair is computed once and mirrored.
    # The diagonal is 1 (a variable against itself).
    n_labels = len(labels)
    dist_corr_matrix = np.eye(n_labels)
    for i in range(n_labels):
        for j in range(i + 1, n_labels):
            value = dcor.distance_correlation(
                columns[i], columns[j], method='mergesort')
            dist_corr_matrix[i, j] = value
            dist_corr_matrix[j, i] = value

    # px.imshow takes the axis tick names through x= and y=; its labels=
    # argument is a dict of axis titles, not a list of tick names.
    fig = px.imshow(
        dist_corr_matrix, x=labels, y=labels, zmin=0., zmax=1.,
        color_continuous_scale="RdBu_r", # Red-Blue reversed for correlation
        title="Correlation Matrix Heatmap"
    )
    fig.write_image(os.path.join(img_dir, f"{split_name}_distcorr_heatmap.png"))

    """
    explode each genres into new columns by name
    
    bar plots to compare avg feature with each genre
    
    correlation between genres, shown as heatmap to see which are frequently grouped together

    pairplots of rating, gender, age, occupation, hr_wk, month, weekday

    pairplots of feature with each genre

    boxplot(data=data, x='genres', y='rating', hue='gender', palette='pastel')
    boxplot(data=data, x='genres', y='rating', hue='age', palette='pastel')
    violinplots instead

    piechart of movie genres with slice thickness representing numbers rated
    same pie chart for ratings=4.0 (== transformed 1.)
    same for ratings=3.0 ...

    many more to add...
    
    consider a quick PCA
    """
    
    """
    'user_id', 'movie_id'
    'rating',
    'gender', 'age', 'occupation', 'genres', 'hr', 'weekday', 'hr_wk','month'
    """
    
    
    

### using TFDV

#load the transformed examples

from tfx.dsl.io import fileio
from tfx.orchestration import metadata
from tfx.components import StatisticsGen, SchemaGen, ExampleValidator
from tfx.utils import io_utils
from tensorflow_metadata.proto.v0 import anomalies_pb2, schema_pb2
from tensorflow_transform.tf_metadata import schema_utils

#from movie_lens_tfx.ingest_pyfunc_component.ingest_movie_lens_component import *
#from movie_lens_tfx.tune_train_movie_lens import *
#from tfx import v1 as tfx

# Find the most recently created post-transform schema in the metadata store
# and turn it into a feature spec for parsing the transformed examples.
schema_list = store.get_artifacts_by_type("Schema")
schema_list = sorted(schema_list,
  key=lambda x: x.create_time_since_epoch, reverse=True)
# Default to None so that a missing artifact is reported clearly instead of
# failing with a NameError on an unbound variable.
schema_uri = next(
    (artifact.uri for artifact in schema_list
     if "post_transform_schema" in artifact.uri),
    None)
if schema_uri is None:
    raise LookupError(
        "no Schema artifact with 'post_transform_schema' in its uri was found "
        "in the metadata store; run the pre-processing pipeline first")
# the first entry of the schema directory is taken to be the schema file
schema_file_path = os.path.join(schema_uri, os.listdir(schema_uri)[0])
schema = tfx.utils.parse_pbtxt_file(schema_file_path, schema_pb2.Schema())
feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec

# Same lookup for the most recently created transformed examples.
examples_list = store.get_artifacts_by_type("Examples")
#print(f"examples_list={examples_list}")
examples_list = sorted(examples_list,
  key=lambda x: x.create_time_since_epoch, reverse=True)
transformed_examples_uri = next(
    (artifact.uri for artifact in examples_list
     if "transformed_examples" in artifact.uri),
    None)
if transformed_examples_uri is None:
    raise LookupError(
        "no Examples artifact with 'transformed_examples' in its uri was found "
        "in the metadata store; run the pre-processing pipeline first")
logging.debug(f"transfomed_examples_uri={transformed_examples_uri}")
# keep everything in the uri that precedes "transformed_examples"
transform_uri = transformed_examples_uri[0:transformed_examples_uri.index("transformed_examples")]

"""
transformed_examples
post_transform_anomalies 
post_transform_schema
pre_transform_stats
post_transform_stats
transform_graph
updated_analyzer_cache
pre_transform_schema
"""

def parse_tf_example(example_proto, feature_spec):
    """Parse one serialized example with the given feature spec.

    Args:
        example_proto: a single serialized example, as read from a TFRecord file.
        feature_spec: the feature specification passed to the parser.

    Returns:
        Whatever tf.io.parse_single_example returns for these inputs.
    """
    parsed = tf.io.parse_single_example(example_proto, feature_spec)
    return parsed
# Parsed dataset of every split, keyed by split name, so that no split is
# lost when the loop moves on to the next one.
transformed_datasets = {}
for split_name in ["train", "eval", "test"]:
    tfrecord_uri = os.path.join(transform_uri, f"Split-{split_name}")
    # Use a dedicated name here: reusing `file_paths` would overwrite the
    # dict of raw-data paths that the raw-data EDA cell iterates over.
    tfrecord_paths = [os.path.join(tfrecord_uri, name) for name in os.listdir(tfrecord_uri)]
    ds_ser = tf.data.TFRecordDataset(tfrecord_paths, compression_type="GZIP")
    ds = ds_ser.map(lambda x: parse_tf_example(x, feature_spec))
    transformed_datasets[split_name] = ds

    """
    
    """



## Run baseline model pipeline with full dataset

def _make_pipeline(component_list):
    """Build a TFX pipeline over component_list using the notebook-wide
    name, root, cache setting, metadata connection and beam arguments."""
    return tfx.dsl.Pipeline(
        pipeline_name=PIPELINE_NAME,
        pipeline_root=PIPELINE_ROOT,
        components=component_list,
        enable_cache=ENABLE_CACHE,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )


pipeline_factory = PipelineComponentsFactory(
    infiles_dict_ser=infiles_dict_ser,
    output_config_ser=output_config_ser,
    transform_dir=tr_dir,
    user_id_max=user_id_max,
    movie_id_max=movie_id_max,
    n_genres=n_genres,
    n_age_groups=n_age_groups,
    min_eval_size=MIN_EVAL_SIZE,
    serving_model_dir=serving_model_dir,
)

# beam arguments for the model pipelines (no --setup_file entry here)
beam_pipeline_args = [
    '--direct_running_mode=multi_processing',
    '--direct_num_workers=0',
]

# create baseline model
baseline_components = pipeline_factory.build_components(MODEL_TYPE.BASELINE)
my_pipeline = _make_pipeline(baseline_components)
tfx.orchestration.LocalDagRunner().run(my_pipeline)

artifact_types = store.get_artifact_types()
logging.debug(f"MLMD store artifact_types={artifact_types}")
artifacts = store.get_artifacts()
logging.debug(f"MLMD store artifacts={artifacts}")

# simulate experimentation of one model family
components = pipeline_factory.build_components(MODEL_TYPE.PRODUCTION)
my_pipeline = _make_pipeline(components)
tfx.orchestration.LocalDagRunner().run(my_pipeline)

# Report what the metadata store holds after both runs.
artifact_types = store.get_artifact_types()
print(f"MLMD store artifact_types={artifact_types}")
artifacts = store.get_artifacts()
print(f"MLMD store artifacts={artifacts}")

executions = store.get_executions()
logging.debug(f"MLMD store executions={executions}")

# executions has custom_properties.key: "infiles_dict_ser"
#    and custom_properties.key: "output_config_ser"
artifact_count = len(artifacts)
execution_count = len(executions)
