# **Technician Recommendation System**

Import necessary modules

In [197]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt
import pickle

## **1. Data Exploration and Preprocessing**
Read dataset, convert to dataframe using pandas and get the shape of the dataframe

In [182]:
data_path = 'technicians.csv'
data = pd.read_csv(data_path)
original_data = data.copy()

data.shape

(200, 10)

Display sample technician's dataframe

In [183]:
data.head()

Unnamed: 0,technicianid,name,phonenumber,email,skills,experience,certifications,address,location,ratingsreceived
0,1,Erik Okta Lestari,6287265995831,eriklestari@gmail.com,"Computer Installation, AC Repair",13,,Gg. Rawamangun No. 019,Bitung,4.2
1,2,Raisa Lasmono Najmudin,6287452722533,raisanajmudin@gmail.com,"Washing Machine Maintenance, Electrical Repair",4,,Gg. Rumah Sakit No. 74,Tegal,4.6
2,3,Tasnim Utama,62829038854284,tasnimutama@gmail.com,"AC Maintenance, Plumbing Installation",4,Sertifikasi Profesi Teknik Pendingin dan Tata ...,Jl. Yos Sudarso No. 720,Jayapura,4.9
3,4,Warji Ghani Wahyudin,6289233671587,warjiwahyudin@gmail.com,"Computer Repair, Plumbing Repair",2,,Jl. Rajawali Timur No. 95,Pariaman,5.0
4,5,Tania Jailani,6282057406206,taniajailani@gmail.com,"Plumbing Installation, Computer Repair",3,SKA Ahli Teknik Plambing dan Pompa Mekanika,Gg. Monginsidi No. 200,Bogor,4.0


Check the information of the dataframe

In [184]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   technicianid     200 non-null    int64  
 1   name             200 non-null    object 
 2   phonenumber      200 non-null    int64  
 3   email            200 non-null    object 
 4   skills           200 non-null    object 
 5   experience       200 non-null    int64  
 6   certifications   115 non-null    object 
 7   address          200 non-null    object 
 8   location         200 non-null    object 
 9   ratingsreceived  200 non-null    float64
dtypes: float64(1), int64(3), object(6)
memory usage: 15.8+ KB


Check statistical description of the dataframe

In [185]:
data.describe()

Unnamed: 0,technicianid,phonenumber,experience,ratingsreceived
count,200.0,200.0,200.0,200.0
mean,100.5,32000000000000.0,4.7,4.3
std,57.9,28000000000000.0,3.6,0.4
min,1.0,6300000000000.0,1.0,3.5
25%,50.8,6300000000000.0,2.0,4.0
50%,100.5,6300000000000.0,4.0,4.3
75%,150.2,63000000000000.0,5.0,4.7
max,200.0,63000000000000.0,15.0,5.0


Choose only relevant columns

In [186]:
data = data.drop(['email', 'phonenumber', 'location', 'address'], axis=1)

Check null (missing) value in the dataframe

In [187]:
# Fill missing values for 'certifications' with '' as not all technicians might have certifications
data['skills'] = data['skills'].fillna('')
data['certifications'] = data['certifications'].fillna('')

# Checking for missing values in both datasets
data_missing = data.isnull().sum()

data_missing 

technicianid       0
name               0
skills             0
experience         0
certifications     0
ratingsreceived    0
dtype: int64

In [None]:
data['skills'] = data['skills'].str.lower()
data['certifications'] = data['certifications'].str.lower()

## **3. Model Development**
Content-based recommendation

In [188]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)
skills_tfidf = tfidf.fit_transform(data['skills']).toarray()

In [189]:
# Fill missing values in numerical columns with 0
data['experience'] = data['experience'].fillna(0)
data['ratingsreceived'] = data['ratingsreceived'].fillna(0)

In [190]:
# Normalize numerical features
scaler = StandardScaler()
data[['experience', 'ratingsreceived']] = scaler.fit_transform(data[['experience', 'ratingsreceived']])

In [191]:
# One-hot encode certifications
certifications_encoded = pd.get_dummies(data['certifications'])

In [192]:
# Combine all features
X_exp = data['experience'].values.reshape(-1, 1)
X_rating = data['ratingsreceived'].values.reshape(-1, 1)
X_cert = certifications_encoded.values

In [193]:
# Combine features into a single array
X = np.hstack([skills_tfidf, X_exp, X_cert, X_rating])

# Create dummy target variable (since we don't have a target column, we'll use a placeholder)
y = np.random.rand(X.shape[0])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [194]:
def build_model(hp):
    model = Sequential()
    model.add(Dense(hp.Int('units1', min_value=128, max_value=512, step=32), input_dim=X.shape[1], activation='relu'))
    model.add(Dropout(hp.Float('dropout1', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(hp.Int('units2', min_value=64, max_value=256, step=32), activation='relu'))
    model.add(Dropout(hp.Float('dropout2', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(hp.Int('units3', min_value=32, max_value=128, step=16), activation='relu'))
    model.add(Dropout(hp.Float('dropout3', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')),
                  loss='mean_squared_error',
                  metrics=['mean_absolute_error'])
    return model

In [195]:
# Hyperparameter tuning
tuner = kt.RandomSearch(
    build_model,
    objective='val_mean_absolute_error',
    max_trials=10,
    executions_per_trial=2,
    directory='tuning_dir',
    project_name='technician_recommendation'
)

Reloading Tuner from tuning_dir\technician_recommendation\tuner0.json


In [196]:
# Set up early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the tuner
tuner.search(X_train, y_train, epochs=50, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Summary of the best model
best_model.summary()

# Save the best model
best_model.save('technician_recommendation_model_advanced.h5')





  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))




In [198]:
# Save the TF-IDF vectorizer and scaler
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('certifications_encoded_columns.pkl', 'wb') as f:
    pickle.dump(certifications_encoded.columns, f)

In [200]:
# Load the saved model and preprocessing artifacts
model = tf.keras.models.load_model('technician_recommendation_model_advanced.h5')
with open('tfidf_vectorizer.pkl', 'rb') as f:
    tfidf = pickle.load(f)
with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
with open('certifications_encoded_columns.pkl', 'rb') as f:
    certifications_encoded_columns = pickle.load(f)

# Load the original data
data = pd.read_csv('technicians.csv')
original_data = data.copy()



In [201]:
# Preprocess the data
data['skills'] = data['skills'].fillna('')
data['certifications'] = data['certifications'].fillna('')
skills_tfidf = tfidf.transform(data['skills']).toarray()
data['experience'] = data['experience'].fillna(0)
data['ratingsreceived'] = data['ratingsreceived'].fillna(0)
data[['experience', 'ratingsreceived']] = scaler.transform(data[['experience', 'ratingsreceived']])
certifications_encoded = pd.get_dummies(data['certifications']).reindex(columns=certifications_encoded_columns, fill_value=0)
X_exp = data['experience'].values.reshape(-1, 1)
X_rating = data['ratingsreceived'].values.reshape(-1, 1)
X_cert = certifications_encoded.values
X = np.hstack([skills_tfidf, X_exp, X_cert, X_rating])

In [202]:
def predict_best_technician(user_skill):
    # Preprocess the user input skill
    user_skill_tfidf = tfidf.transform([user_skill]).toarray()
    
    # Prepare the input data
    X_input = np.hstack([user_skill_tfidf, np.zeros((1, X.shape[1] - user_skill_tfidf.shape[1]))])
    
    # Predict scores for the user input skill
    predicted_score = model.predict(X_input).flatten()[0]
    
    # Combine with experience, certifications, and ratings
    best_match_score = -1
    best_technician_index = -1
    
    for idx in range(X.shape[0]):
        technician = data.iloc[idx]
        skill_match = user_skill.lower() in technician['skills'].lower()  # Ensure exact phrase matching
        if skill_match:
            combined_score = (predicted_score + 
                              technician['experience'] + 
                              technician['ratingsreceived'] + 
                              certifications_encoded.iloc[idx].sum())
            if combined_score > best_match_score:
                best_match_score = combined_score
                best_technician_index = idx
    
    if best_technician_index != -1:
        return original_data.iloc[best_technician_index]
    else:
        return "No matching technician found."

In [204]:
# Example usage
user_input_skills = "ac repair"
recommended_technician = predict_best_technician(user_input_skills)
print(recommended_technician)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
technicianid                                  122
name                             Luhung Puspasari
phonenumber                        62896859133987
email                   luhungpuspasari@gmail.com
skills             Refrigerator Repair, AC Repair
experience                                     11
certifications                                NaN
address                     Jalan Ciwastra No. 72
location                              Probolinggo
ratingsreceived                               4.7
Name: 121, dtype: object
