# Final Project: Machine Learning Operations (MLOps)
- **Nama:** Muhammad Giat
- **Email:** mhmmadgiat@gmail.com
- **ID Dicoding:** mhmmadgiatt

### Mounting Google Drive
Pada awal pipeline, Google Drive dipasang untuk memberikan akses ke file yang disimpan di dalamnya, seperti dataset dan file proyek yang diperlukan.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Ekstraksi Dataset dari File Zip
Dataset yang diperlukan untuk pelatihan model dikemas dalam file zip yang diekstraksi ke dalam folder di Colab.

In [2]:
import zipfile
import os

# Path ke file zip di Google Drive
zip_path = '/content/drive/My Drive/pipeline_2.zip'  # Sesuaikan nama file dan lokasi

# Ekstraksi file zip ke folder di Colab
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('/content/pipeline_2')

# Pindah ke folder proyek
os.chdir('/content/pipeline_2')

In [3]:
!pip install tfx

Collecting tfx
  Downloading tfx-1.15.1-py3-none-any.whl.metadata (23 kB)
Collecting ml-pipelines-sdk==1.15.1 (from tfx)
  Downloading ml_pipelines_sdk-1.15.1-py3-none-any.whl.metadata (2.3 kB)
Collecting ml-metadata<1.16.0,>=1.15.0 (from tfx)
  Downloading ml_metadata-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.0 kB)
Collecting docker<5,>=4.1 (from tfx)
  Downloading docker-4.4.4-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting google-apitools<1,>=0.5 (from tfx)
  Downloading google_apitools-0.5.32-py3-none-any.whl.metadata (2.3 kB)
Collecting google-api-python-client<2,>=1.8 (from tfx)
  Downloading google_api_python_client-1.12.11-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting apache-beam<3,>=2.47 (from apache-beam[gcp]<3,>=2.47->tfx)
  Downloading apache_beam-2.61.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting attrs<24,>=19.3.0 (from tfx)
  Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Co

In [4]:
!pip install modules

Collecting modules
  Downloading modules-1.0.0.tar.gz (525 bytes)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: modules
  Building wheel for modules (setup.py) ... [?25l[?25hdone
  Created wheel for modules: filename=modules-1.0.0-py3-none-any.whl size=1198 sha256=3c9684cd9f1d94dad3c293768000789fb287aae30aa8699f6185401e7947383a
  Stored in directory: /root/.cache/pip/wheels/b9/1b/5a/0e7760d483cf2ac6001c9df78809f16feb9632607248e3ab78
Successfully built modules
Installing collected packages: modules
Successfully installed modules-1.0.0


## Import Library

In [1]:
import sys
sys.path.append('/content/pipeline_2')

In [3]:
# Cek isi direktori
!ls /content/pipeline_2/modules

components.py  hate_trainer.py	hate_transform.py  __init__.py


In [4]:
import importlib.util
import sys

module_path = '/content/pipeline_2/modules/components.py'
module_name = 'modules.components'

spec = importlib.util.spec_from_file_location(module_name, module_path)
module = importlib.util.module_from_spec(spec)
sys.modules[module_name] = module
spec.loader.exec_module(module)

# Sekarang gunakan fungsi dari modul
init_components = module.init_components

In [5]:
import os
import pandas as pdr
import pandas as pd
from typing import Text
from absl import logging
from tfx.orchestration import metadata, pipeline
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner
from sklearn.utils import resample
from modules.components import init_components

# Preparing the Dataset

In [6]:
df = pd.read_csv('/content/pipeline_2/Data/dataset.csv')
df

Unnamed: 0,image_name,sentence,label
0,LJ3r8Gy.jpg.png,OFFICIAL BERNIE SANDERS DRINKING GAME ! Every ...,Non-offensiv
1,qDnIIHA.png,2:28 PM THIS IS A WALL INSIDE A NAZI GAS CHAMB...,offensive
2,1JQk5NF.png,o shit waddup ! BERNIE SANDERS COM,offensive
3,iMMNq.png,`` MITT ROMNEY IS THE WORST REPUBLICAN IN THE ...,Non-offensiv
4,jAi3iI1.png,Anonymous ( ID : duqdA1io a 08/05/16 ( Fri ) 1...,Non-offensiv
...,...,...,...
738,iutC6Qy.png,Pussy almost brings down Trump Weiner knocks o...,offensive
739,yGHKtzg.png,CLINTON SAID SHE B BRINGING BACK RUGRATIS AM G...,offensive
740,phyDxAb.png,2 hrs THAT AwKWARD MoMENT wHEN ... I. BERNIE S...,Non-offensiv
741,Qj9x2wz.png,FOR HILLARY BLOOD ON YOUR HANDS Stop Voting ou...,offensive


In [7]:
df['label'] = df['label'].apply(lambda x: 1 if x == 'offensive' else 0 if x == 'Non-offensiv' else x)
df.head()

Unnamed: 0,image_name,sentence,label
0,LJ3r8Gy.jpg.png,OFFICIAL BERNIE SANDERS DRINKING GAME ! Every ...,0
1,qDnIIHA.png,2:28 PM THIS IS A WALL INSIDE A NAZI GAS CHAMB...,1
2,1JQk5NF.png,o shit waddup ! BERNIE SANDERS COM,1
3,iMMNq.png,`` MITT ROMNEY IS THE WORST REPUBLICAN IN THE ...,0
4,jAi3iI1.png,Anonymous ( ID : duqdA1io a 08/05/16 ( Fri ) 1...,0


In [8]:
df['label'].value_counts()

Unnamed: 0,label
0,440
1,303


In [9]:
df_minor = df[df['label'] == 0]
df_mayor = df[df['label'] == 1]

In [10]:
df_upsampling = resample(df_minor, n_samples=len(df_mayor), random_state=42)
df = pd.concat([df_mayor, df_upsampling]).reset_index(drop=True)
df['label'].value_counts()

Unnamed: 0,label
1,303
0,303


In [11]:
# Tentukan path lengkap ke folder Data_2
folder_path = '/content/pipeline_2/Data_2'

# Periksa apakah folder sudah ada
if not os.path.exists(folder_path):
    # Jika belum ada, buat folder baru
    os.makedirs(folder_path)

# Simpan DataFrame ke file CSV di dalam folder Data_2
df.to_csv(os.path.join(folder_path, 'processed_dataset.csv'), index=False)

## Run Pipeline

In [12]:
PIPELINE_IDENTIFIER = "hate-speech-pipeline"

DATA_SOURCE = "/content/pipeline_2/Data_2"
TRANSFORM_SCRIPT = "/content/pipeline_2/modules/hate_transform.py"
TRAINING_SCRIPT = "/content/pipeline_2/modules/hate_trainer.py"

BASE_OUTPUT_PATH = "mhmmadgiatt-pipeline"
exported_model_dir = os.path.join(BASE_OUTPUT_PATH, 'exported_model')
pipeline_storage = os.path.join(BASE_OUTPUT_PATH, PIPELINE_IDENTIFIER)
metadata_database = os.path.join(pipeline_storage, "metadata.sqlite")


def configure_local_pipeline(
    pipeline_components, storage_path: Text
) -> pipeline.Pipeline:

    logging.info(f"Pipeline storage path set to: {storage_path}")
    execution_args = [
        "--direct_running_mode=multi_processing",
        "--direct_num_workers=0"
    ]

    return pipeline.Pipeline(
        pipeline_name=PIPELINE_IDENTIFIER,
        pipeline_root=storage_path,
        components=pipeline_components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_database
        ),
        beam_pipeline_args=execution_args
    )

if __name__ == "__main__":
    logging.set_verbosity(logging.INFO)

    pipeline_components = init_components(
        DATA_SOURCE,
        training_module=TRAINING_SCRIPT,
        transform_module=TRANSFORM_SCRIPT,
        serving_model_dir=exported_model_dir,
    )

    pipeline_instance = configure_local_pipeline(pipeline_components, pipeline_storage)
    BeamDagRunner().run(pipeline=pipeline_instance)

INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Pipeline storage path set to: mhmmadgiatt-pipeline/hate-speech-pipeline
INFO:absl:Generating ephemeral wheel package for '/content/pipeline_2/modules/hate_transform.py' (including modules: ['hate_transform', 'hate_trainer', 'components']).
INFO:absl:User module package has hash fingerprint version 99efbab93e16c38c5b828d267e21ab23828c87993eb3f5d8ad2d2d27983bf3d5.
INFO:absl:Executing: ['/usr/bin/python3', '/tmp/tmpqqdr7jrt/_tfx_generated_setup.py', 'bdist_wheel', '--bdist-dir', '/tmp/tmpa2rqagcn', '--dist-dir', '/tmp/tmpiij6hzz_']
INFO:absl:Successfully built user code wheel distribution at 'mhmmadgiatt-pipeline/hate-speech-pipeline/_wheels/tfx_user_code_Transform-0.0+99efbab93e16c38c5b828d267e21ab23828c87993eb3f5d8ad2d2d27983bf3d5-py3-none-any.whl'; target user module is 'hate_transfor

INFO:absl:Node CsvExampleGen depends on [].
INFO:absl:Node CsvExampleGen is scheduled.
INFO:absl:Node Latest_blessed_model_resolver depends on [].
INFO:absl:Node Latest_blessed_model_resolver is scheduled.
INFO:absl:Node StatisticsGen depends on ['Run[CsvExampleGen]'].
INFO:absl:Node StatisticsGen is scheduled.
INFO:absl:Node SchemaGen depends on ['Run[StatisticsGen]'].
INFO:absl:Node SchemaGen is scheduled.
INFO:absl:Node ExampleValidator depends on ['Run[SchemaGen]', 'Run[StatisticsGen]'].
INFO:absl:Node ExampleValidator is scheduled.
INFO:absl:Node Transform depends on ['Run[CsvExampleGen]', 'Run[SchemaGen]'].
INFO:absl:Node Transform is scheduled.
INFO:absl:Node Trainer depends on ['Run[SchemaGen]', 'Run[Transform]'].
INFO:absl:Node Trainer is scheduled.
INFO:absl:Node Evaluator depends on ['Run[CsvExampleGen]', 'Run[Latest_blessed_model_resolver]', 'Run[Trainer]'].
INFO:absl:Node Evaluator is scheduled.
INFO:absl:Node Pusher depends on ['Run[Evaluator]', 'Run[Trainer]'].
INFO:absl

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sentence_xf (InputLayer)    [(None, 1)]               0         
                                                                 
 tf.reshape (TFOpLambda)     (None,)                   0         
                                                                 
 text_vectorization (TextVe  (None, 50)                0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 50, 32)            1600000   
                                                                 
 global_average_pooling1d (  (None, 32)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 128)               4224  

INFO:absl:Function `_wrapped_model` contains input name(s) table_handle, 7672, resource with unsupported characters which will be renamed to model_text_vectorization_string_lookup_none_lookup_lookuptablefindv2_table_handle, model_embedding_embedding_lookup_7672, model_dense_3_biasadd_readvariableop_resource in the SavedModel.
INFO:absl:Found untraced functions such as _update_step_xla while saving (showing 1 of 1). These functions will not be directly callable after loading.
INFO:absl:Writing fingerprint to mhmmadgiatt-pipeline/hate-speech-pipeline/Trainer/model/7/Format-Serving/fingerprint.pb


Epoch 2/10
Epoch 2: val_binary_accuracy improved from 0.54026 to 0.57143, saving model to mhmmadgiatt-pipeline/hate-speech-pipeline/Trainer/model/7/Format-Serving


INFO:absl:Function `_wrapped_model` contains input name(s) table_handle, 9725, resource with unsupported characters which will be renamed to model_text_vectorization_string_lookup_hash_table_lookup_lookuptablefindv2_table_handle, model_embedding_embedding_lookup_9725, model_dense_3_biasadd_readvariableop_resource in the SavedModel.
INFO:absl:Found untraced functions such as _update_step_xla while saving (showing 1 of 1). These functions will not be directly callable after loading.
INFO:absl:Writing fingerprint to mhmmadgiatt-pipeline/hate-speech-pipeline/Trainer/model/7/Format-Serving/fingerprint.pb


Epoch 3/10
Epoch 3: val_binary_accuracy did not improve from 0.57143
Epoch 4/10
Epoch 4: val_binary_accuracy did not improve from 0.57143
Epoch 5/10
Epoch 5: val_binary_accuracy did not improve from 0.57143
Epoch 6/10
Epoch 6: val_binary_accuracy improved from 0.57143 to 0.58961, saving model to mhmmadgiatt-pipeline/hate-speech-pipeline/Trainer/model/7/Format-Serving


INFO:absl:Function `_wrapped_model` contains input name(s) table_handle, 14414, resource with unsupported characters which will be renamed to model_text_vectorization_string_lookup_hash_table_lookup_lookuptablefindv2_table_handle, model_embedding_embedding_lookup_14414, model_dense_3_biasadd_readvariableop_resource in the SavedModel.
INFO:absl:Found untraced functions such as _update_step_xla while saving (showing 1 of 1). These functions will not be directly callable after loading.
INFO:absl:Writing fingerprint to mhmmadgiatt-pipeline/hate-speech-pipeline/Trainer/model/7/Format-Serving/fingerprint.pb


Epoch 7/10
Epoch 7: val_binary_accuracy improved from 0.58961 to 0.59156, saving model to mhmmadgiatt-pipeline/hate-speech-pipeline/Trainer/model/7/Format-Serving


INFO:absl:Function `_wrapped_model` contains input name(s) table_handle, 16418, resource with unsupported characters which will be renamed to model_text_vectorization_string_lookup_hash_table_lookup_lookuptablefindv2_table_handle, model_embedding_embedding_lookup_16418, model_dense_3_biasadd_readvariableop_resource in the SavedModel.
INFO:absl:Found untraced functions such as _update_step_xla while saving (showing 1 of 1). These functions will not be directly callable after loading.
INFO:absl:Writing fingerprint to mhmmadgiatt-pipeline/hate-speech-pipeline/Trainer/model/7/Format-Serving/fingerprint.pb


Epoch 8/10
Epoch 8: val_binary_accuracy did not improve from 0.59156
Epoch 9/10
Epoch 9: val_binary_accuracy did not improve from 0.59156
Epoch 10/10
Epoch 10: val_binary_accuracy did not improve from 0.59156


INFO:absl:Feature image_name has a shape dim {
  size: 1
}
. Setting to DenseTensor.
INFO:absl:Feature label has a shape dim {
  size: 1
}
. Setting to DenseTensor.
INFO:absl:Feature sentence has a shape dim {
  size: 1
}
. Setting to DenseTensor.
INFO:absl:Function `serve_tf_examples_fn` contains input name(s) table_handle, 20286, resource with unsupported characters which will be renamed to model_text_vectorization_string_lookup_hash_table_lookup_lookuptablefindv2_table_handle, model_embedding_embedding_lookup_20286, model_dense_3_biasadd_readvariableop_resource in the SavedModel.
INFO:absl:Found untraced functions such as _update_step_xla while saving (showing 1 of 1). These functions will not be directly callable after loading.
INFO:absl:Writing fingerprint to mhmmadgiatt-pipeline/hate-speech-pipeline/Trainer/model/7/Format-Serving/fingerprint.pb
INFO:absl:Training complete. Model written to mhmmadgiatt-pipeline/hate-speech-pipeline/Trainer/model/7/Format-Serving. ModelRun written 

## Selesai - Pipeline Telah Dijalankan

In [13]:
import shutil

# Path folder pipeline yang ingin di-zip
source_folder = '/content/mhmmadgiatt-pipeline'

# Nama file zip output
output_zip = '/content/mhmmadgiatt-pipeline.zip'

# Membuat file zip dari folder pipeline
shutil.make_archive(base_name=output_zip.replace('.zip', ''), format='zip', root_dir=source_folder)

print(f"Pipeline berhasil di-zip: {output_zip}")

Pipeline berhasil di-zip: /content/mhmmadgiatt-pipeline.zip


### File zip yang berisi pipeline yang telah dijalankan dapat diunduh untuk keperluan distribusi atau pengarsipan.

In [14]:
from google.colab import files

# Mendownload file zip
files.download(output_zip)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>