# Import library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

# Tidying Up Datasets

## Load Datasets

In [2]:
# Load Tour Guide Datasets
guide_df = pd.read_csv('data/tour-guide-data.csv')
guide_df

Unnamed: 0,Name,Gender,Domicile,17-25,26-34,35-43,44-52,52+,Historical tours,Adventure tours,...,CSN1,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5
0,Nabhan Nabilah,1,0,1,0,0,0,0,0,0,...,5,2,5,5,5,2,2,5,5,2
1,Salsabilla,1,0,1,0,0,0,0,0,1,...,4,3,3,4,4,2,2,2,4,2
2,Maulani,1,0,1,0,0,0,0,1,0,...,4,4,3,4,4,3,3,2,3,3
3,Abiyyu Farhan,0,0,1,0,0,0,0,0,1,...,4,5,3,5,4,4,4,4,4,4
4,Rahmaliyah Kadir,1,1,1,0,0,0,0,0,0,...,4,3,3,4,3,3,3,3,4,3
5,Faizah,1,1,1,0,0,0,0,0,0,...,4,2,5,5,5,3,4,5,5,4
6,Andani,1,1,0,1,0,0,0,0,1,...,4,4,4,3,3,4,3,3,3,3
7,Muthiah Hanun,1,1,0,1,0,0,0,0,0,...,5,5,5,5,5,5,4,3,5,4
8,Ihlasul Mufti Faqih,0,1,1,0,0,0,0,0,1,...,5,4,5,4,3,2,1,3,4,2
9,Dessylva Maharany Santosa,1,0,1,0,0,0,0,1,0,...,3,3,3,4,3,2,3,2,4,3


In [3]:
# Add roles column as a guide
guide_df['Roles'] = 'guide'
guide_df

Unnamed: 0,Name,Gender,Domicile,17-25,26-34,35-43,44-52,52+,Historical tours,Adventure tours,...,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5,Roles
0,Nabhan Nabilah,1,0,1,0,0,0,0,0,0,...,2,5,5,5,2,2,5,5,2,guide
1,Salsabilla,1,0,1,0,0,0,0,0,1,...,3,3,4,4,2,2,2,4,2,guide
2,Maulani,1,0,1,0,0,0,0,1,0,...,4,3,4,4,3,3,2,3,3,guide
3,Abiyyu Farhan,0,0,1,0,0,0,0,0,1,...,5,3,5,4,4,4,4,4,4,guide
4,Rahmaliyah Kadir,1,1,1,0,0,0,0,0,0,...,3,3,4,3,3,3,3,4,3,guide
5,Faizah,1,1,1,0,0,0,0,0,0,...,2,5,5,5,3,4,5,5,4,guide
6,Andani,1,1,0,1,0,0,0,0,1,...,4,4,3,3,4,3,3,3,3,guide
7,Muthiah Hanun,1,1,0,1,0,0,0,0,0,...,5,5,5,5,5,4,3,5,4,guide
8,Ihlasul Mufti Faqih,0,1,1,0,0,0,0,0,1,...,4,5,4,3,2,1,3,4,2,guide
9,Dessylva Maharany Santosa,1,0,1,0,0,0,0,1,0,...,3,3,4,3,2,3,2,4,3,guide


In [4]:
# Load Tourist Datasets
tourist_df = pd.read_csv('data/tourist-data.csv')
tourist_df

Unnamed: 0,Names,Genders,Destinations,17-25,26-34,35-43,44-52,52+,Historical tours,Adventure tours,...,CSN1,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5
0,Novia Rizki Wulandari,1,0,1,0,0,0,0,0,1,...,5,4,4,5,5,4,4,2,4,4
1,Herlina Kusyanuri Putri,1,0,1,0,0,0,0,1,0,...,5,5,5,5,5,4,3,3,4,3
2,Rifdah Alyaa,1,0,1,0,0,0,0,0,0,...,2,3,2,3,4,3,2,3,4,3
3,Alexis Purnomo,0,0,0,0,0,0,0,0,1,...,3,5,3,4,5,4,4,1,4,4
4,Rifky Surya Pratama,0,0,1,0,0,0,0,0,1,...,4,3,3,4,4,2,3,3,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Féi,1,1,1,0,0,0,0,0,0,...,4,2,4,4,4,3,4,2,4,4
69,Annisa,1,1,1,0,0,0,0,0,0,...,4,2,3,3,3,3,3,2,4,3
70,Usamah,0,1,1,0,0,0,0,0,1,...,5,5,4,5,5,4,4,5,5,5
71,Helena,1,0,0,0,0,0,0,1,0,...,4,3,4,4,4,3,4,2,5,4


In [5]:
# Add roles columns as a tourist
tourist_df['Roles'] = 'tourist'
tourist_df

Unnamed: 0,Names,Genders,Destinations,17-25,26-34,35-43,44-52,52+,Historical tours,Adventure tours,...,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5,Roles
0,Novia Rizki Wulandari,1,0,1,0,0,0,0,0,1,...,4,4,5,5,4,4,2,4,4,tourist
1,Herlina Kusyanuri Putri,1,0,1,0,0,0,0,1,0,...,5,5,5,5,4,3,3,4,3,tourist
2,Rifdah Alyaa,1,0,1,0,0,0,0,0,0,...,3,2,3,4,3,2,3,4,3,tourist
3,Alexis Purnomo,0,0,0,0,0,0,0,0,1,...,5,3,4,5,4,4,1,4,4,tourist
4,Rifky Surya Pratama,0,0,1,0,0,0,0,0,1,...,3,3,4,4,2,3,3,2,3,tourist
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Féi,1,1,1,0,0,0,0,0,0,...,2,4,4,4,3,4,2,4,4,tourist
69,Annisa,1,1,1,0,0,0,0,0,0,...,2,3,3,3,3,3,2,4,3,tourist
70,Usamah,0,1,1,0,0,0,0,0,1,...,5,4,5,5,4,4,5,5,5,tourist
71,Helena,1,0,0,0,0,0,0,1,0,...,3,4,4,4,3,4,2,5,4,tourist


## Combine Datasets into Users Datasets

In [6]:
# Check if column names are the same
columns_equal = guide_df.columns.equals(tourist_df.columns)

if columns_equal:
    print("The DataFrames have the same column names.")
else:
    print("The DataFrames do not have the same column names.")

The DataFrames do not have the same column names.


In [7]:
# Get column names from guide_df and tourist_df
guide_columns = set(guide_df.columns)
tourist_columns = set(tourist_df.columns)

# Find the different columns
diff_col_in_guide = guide_columns - tourist_columns
diff_col_in_tourist = tourist_columns - guide_columns

# Print the different columns
print("Different columns in guide_df:", diff_col_in_guide)
print("Different columns in tourist_df:", diff_col_in_tourist)


Different columns in guide_df: {'Name', 'Domicile', 'Gender'}
Different columns in tourist_df: {'Destinations', 'Genders', 'Names'}


In [8]:
# Rename the diff column
guide_df = guide_df.rename(columns={'Name': 'Names'})
guide_df = guide_df.rename(columns={'Gender': 'Genders'})
guide_df = guide_df.rename(columns={'Domicile': 'Destinations'})


In [9]:
# Check if column names are the same
columns_equal = guide_df.columns.equals(tourist_df.columns)

if columns_equal:
    print("The DataFrames have the same column names.")
else:
    print("The DataFrames do not have the same column names.")

The DataFrames have the same column names.


In [10]:
user_df = pd.concat([tourist_df, guide_df])
user_df = user_df.reset_index(drop=True)
user_df

Unnamed: 0,Names,Genders,Destinations,17-25,26-34,35-43,44-52,52+,Historical tours,Adventure tours,...,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5,Roles
0,Novia Rizki Wulandari,1,0,1,0,0,0,0,0,1,...,4,4,5,5,4,4,2,4,4,tourist
1,Herlina Kusyanuri Putri,1,0,1,0,0,0,0,1,0,...,5,5,5,5,4,3,3,4,3,tourist
2,Rifdah Alyaa,1,0,1,0,0,0,0,0,0,...,3,2,3,4,3,2,3,4,3,tourist
3,Alexis Purnomo,0,0,0,0,0,0,0,0,1,...,5,3,4,5,4,4,1,4,4,tourist
4,Rifky Surya Pratama,0,0,1,0,0,0,0,0,1,...,3,3,4,4,2,3,3,2,3,tourist
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,Dera,1,0,1,0,0,0,0,0,0,...,4,5,5,5,4,5,5,5,5,guide
97,Salsabilla Rizki,1,0,1,0,0,0,0,0,1,...,5,5,5,5,5,5,2,3,4,guide
98,Salma,1,0,1,0,0,0,0,1,1,...,5,5,5,5,5,5,4,5,4,guide
99,Deva,1,0,0,1,0,0,0,0,1,...,4,4,3,4,3,4,2,4,3,guide


## Check Summary New Datasets

In [11]:
# Looking at the index of the columns
columns = user_df.columns
for idx, column in enumerate(columns):
    print(f"Column index: {idx}, Column name: {column}")

Column index: 0, Column name: Names
Column index: 1, Column name: Genders
Column index: 2, Column name: Destinations
Column index: 3, Column name: 17-25
Column index: 4, Column name: 26-34
Column index: 5, Column name: 35-43
Column index: 6, Column name: 44-52
Column index: 7, Column name: 52+
Column index: 8, Column name: Historical tours
Column index: 9, Column name: Adventure tours
Column index: 10, Column name: Nature and wildlife tours
Column index: 11, Column name: Culinary tours
Column index: 12, Column name: Wellness and retreat tours
Column index: 13, Column name: Architectural tours
Column index: 14, Column name: Educational tours
Column index: 15, Column name: Shopping tours
Column index: 16, Column name: EXT1
Column index: 17, Column name: EXT2
Column index: 18, Column name: EXT3
Column index: 19, Column name: EXT4
Column index: 20, Column name: EXT5
Column index: 21, Column name: EST1
Column index: 22, Column name: EST2
Column index: 23, Column name: EST3
Column index: 24,

In [12]:
user_df.describe()

Unnamed: 0,Genders,Destinations,17-25,26-34,35-43,44-52,52+,Historical tours,Adventure tours,Nature and wildlife tours,...,CSN1,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5
count,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,...,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0
mean,0.70297,0.415842,0.70297,0.049505,0.0,0.0,0.0,0.386139,0.475248,0.653465,...,3.683168,3.39604,3.613861,3.811881,3.841584,3.29703,3.435644,2.910891,3.841584,3.445545
std,0.459229,0.495325,0.459229,0.218002,0.0,0.0,0.0,0.489291,0.501878,0.478239,...,1.009264,1.010734,1.009656,1.046068,0.945861,0.964827,0.973816,1.040183,0.977064,0.994739
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0
50%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,4.0,3.0,4.0,4.0,4.0,3.0,3.0,3.0,4.0,3.0
75%,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,4.0,4.0,4.0,5.0,5.0,4.0,4.0,3.0,5.0,4.0
max,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [13]:
# Check that we not missing any value after combine columns
print('Is there any missing value? ', user_df.isnull().values.any())
print('How many missing values? ', user_df.isnull().values.sum())

Is there any missing value?  False
How many missing values?  0


# Make Matchmaking Model

In [14]:
# Copy origin user data into match data to use in cluster
match_df = user_df.copy()

In [15]:
# Drop Column that not use for clustering match
match_df.drop(['Names', 'Destinations', 'Roles'], axis=1, inplace=True)
match_df

Unnamed: 0,Genders,17-25,26-34,35-43,44-52,52+,Historical tours,Adventure tours,Nature and wildlife tours,Culinary tours,...,CSN1,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5
0,1,1,0,0,0,0,0,1,1,0,...,5,4,4,5,5,4,4,2,4,4
1,1,1,0,0,0,0,1,0,0,1,...,5,5,5,5,5,4,3,3,4,3
2,1,1,0,0,0,0,0,0,1,1,...,2,3,2,3,4,3,2,3,4,3
3,0,0,0,0,0,0,0,1,1,0,...,3,5,3,4,5,4,4,1,4,4
4,0,1,0,0,0,0,0,1,1,1,...,4,3,3,4,4,2,3,3,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1,1,0,0,0,0,0,0,1,1,...,5,4,5,5,5,4,5,5,5,5
97,1,1,0,0,0,0,0,1,1,1,...,3,5,5,5,5,5,5,2,3,4
98,1,1,0,0,0,0,1,1,0,1,...,5,5,5,5,5,5,5,4,5,4
99,1,0,1,0,0,0,0,1,1,1,...,4,4,4,3,4,3,4,2,4,3


## Test Match Mulai Dari sini

### Reduce Dimension from Data

In [16]:
match_array = match_df.values
match_array

array([[1, 1, 0, ..., 2, 4, 4],
       [1, 1, 0, ..., 3, 4, 3],
       [1, 1, 0, ..., 3, 4, 3],
       ...,
       [1, 1, 0, ..., 4, 5, 4],
       [1, 0, 1, ..., 2, 4, 3],
       [0, 1, 0, ..., 5, 4, 5]], dtype=int64)

In [17]:
import tensorflow as tf
import numpy as np

class PCAModel(tf.Module):
    def __init__(self, X_mean, components):
        super(PCAModel, self).__init__()
        self.X_mean = tf.Variable(X_mean, trainable=False)
        self.components = tf.Variable(components, trainable=False)

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, 39), dtype=tf.float32)])
    def scale_features(self, X):
        X_min = tf.reduce_min(X, axis=0)
        X_max = tf.reduce_max(X, axis=0)

        # Handle columns with zero variance
        zero_variance_mask = tf.math.equal(X_min, X_max)
        non_zero_variance_mask = tf.math.logical_not(zero_variance_mask)

        # Scale the columns with non-zero variance
        X_scaled_non_zero = tf.where(non_zero_variance_mask, (X - X_min) / (X_max - X_min), X)

        # Scale the columns with zero variance
        X_scaled_zero = tf.where(zero_variance_mask, tf.zeros_like(X), X_scaled_non_zero)

        return X_scaled_zero

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, 39), dtype=tf.float32)])
    def apply_pca(self, X):
        X_scaled = self.scale_features(X)
        X_centered = X_scaled - self.X_mean
        
        # Compute the covariance matrix
        covariance_matrix = tf.matmul(tf.transpose(X_centered), X_centered) / tf.cast(tf.shape(X_centered)[0], dtype=tf.float32)

        # Perform eigenvalue decomposition
        eigenvalues, eigenvectors = tf.linalg.eigh(covariance_matrix)
        
        # Sort eigenvectors based on eigenvalues
        sorted_indices = tf.argsort(eigenvalues, direction='DESCENDING')
        sorted_eigenvectors = tf.gather(eigenvectors, sorted_indices, axis=1)
        
        # Select the top k eigenvectors
        k = tf.minimum(tf.shape(sorted_eigenvectors)[1], 2)  # Choose the top 2 eigenvectors (modify as needed)
        selected_eigenvectors = sorted_eigenvectors[:, :k]
        
        # Project the centered data onto the selected eigenvectors
        X_pca = tf.matmul(X_centered, selected_eigenvectors)
        
        return X_pca

# Convert the numpy array to a TensorFlow tensor
X_tf = tf.convert_to_tensor(match_array, dtype=tf.float32)

# Perform PCA with k=2 (reduce to 2 dimensions)
X_mean = tf.reduce_mean(X_tf, axis=0)
_, _, V = tf.linalg.svd(X_tf - X_mean)
components = V[:, :2]

# Create an instance of the PCA model
pca_model = PCAModel(X_mean, components)

In [18]:
# Save the PCA model
tf.saved_model.save(
    pca_model, 
    export_dir="pca_model"
)

INFO:tensorflow:Assets written to: pca_model\assets


In [19]:
export_dir = "pca_model"

In [20]:
!saved_model_cli show --dir {export_dir} --all


MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['__saved_model_init_op']:
  The given SavedModel SignatureDef contains the following input(s):
  The given SavedModel SignatureDef contains the following output(s):
    outputs['__saved_model_init_op'] tensor_info:
        dtype: DT_INVALID
        shape: unknown_rank
        name: NoOp
  Method name is: 
The MetaGraph with tag set ['serve'] contains the following ops: {'Const', 'StatefulPartitionedCall', 'SaveV2', 'VarHandleOp', 'StringJoin', 'AssignVariableOp', 'MergeV2Checkpoints', 'ShardedFilename', 'DisableCopyOnRead', 'Pack', 'Select', 'Placeholder', 'RestoreV2', 'NoOp', 'StaticRegexFullMatch', 'Identity', 'ReadVariableOp'}

Concrete Functions:
  Function Name: 'apply_pca'
    Option #1
      Callable with:
        Argument #1
          x: TensorSpec(shape=(None, 39), dtype=tf.float32, name='x')

  Function Name: 'scale_features'
    Option #1
      Callable with:
        Argument #1
        

# Test Load Model

In [21]:
# Load the PCA model
loaded_model = tf.saved_model.load("pca_model")

In [22]:
# Generate a new data point
new_data = np.random.rand(1, 39)
new_data_tf = tf.convert_to_tensor(new_data, dtype=tf.float32)

# Predict the PCA value using the loaded model
pca_value = loaded_model.apply_pca(new_data_tf)

# Print the PCA value
print(pca_value)

tf.Tensor([[-1.7374805e+01  1.2516975e-06]], shape=(1, 2), dtype=float32)


In [23]:
new_data_tf = tf.convert_to_tensor(match_array, dtype=tf.float32)

# Predict the PCA value using the loaded model
pca_value = loaded_model.apply_pca(new_data_tf)

# Print the PCA value
print(pca_value)

tf.Tensor(
[[-1.39435177e+01 -1.13729107e+00]
 [-1.41652784e+01  1.01445985e+00]
 [-1.51570883e+01  5.69236279e-02]
 [-1.46208401e+01 -8.59446704e-01]
 [-1.42955132e+01 -8.36073041e-01]
 [-1.48253555e+01 -4.94044393e-01]
 [-1.50506010e+01  5.42503357e-01]
 [-1.37639437e+01 -2.29793847e-01]
 [-1.47880182e+01  9.48886096e-01]
 [-1.40906868e+01  1.73667073e-01]
 [-1.43346109e+01 -5.27827919e-01]
 [-1.50020657e+01  2.07497805e-01]
 [-1.44163923e+01  1.14883280e+00]
 [-1.46117907e+01  8.73391271e-01]
 [-1.43097095e+01 -7.88455978e-02]
 [-1.49813547e+01  8.33932936e-01]
 [-1.44167404e+01 -5.97456455e-01]
 [-1.36004696e+01 -2.67951190e-01]
 [-1.49429188e+01  3.36247325e-01]
 [-1.49113760e+01  3.55678797e-03]
 [-1.41223888e+01  1.00653195e+00]
 [-1.41567087e+01  5.75327277e-01]
 [-1.45268192e+01  1.24904191e+00]
 [-1.32804585e+01 -5.28074622e-01]
 [-1.42276001e+01 -2.18883693e-01]
 [-1.44214563e+01  1.21885252e+00]
 [-1.45696487e+01 -3.82494837e-01]
 [-1.48917923e+01  3.79118979e-01]
 [-1.4004