In [1]:
import pandas as pd
import re

def remove_special_chars(text):
    return re.sub(r'^@|^http|[^a-zA-Z\s]', '', str(text))

df1 = pd.read_csv('train.csv',encoding='latin1')
df2 = pd.read_csv('product_descriptions.csv')
df = pd.merge(df1, df2, on="product_uid", how="left")



df["product_title"] = df["product_title"].apply(remove_special_chars).str.lower()
df["product_description"] = df["product_description"].apply(remove_special_chars).str.lower()
df["search_term"] = df["search_term"].apply(remove_special_chars).str.lower()
df["relevance_normalized"] = (df["relevance"] - df["relevance"].min())/(df["relevance"].max() - df["relevance"].min())
df.head(2)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,relevance_normalized
0,2,100001,simpson strongtie gauge angle,angle bracket,3.0,not only do angles make joints stronger they a...,1.0
1,3,100001,simpson strongtie gauge angle,l bracket,2.5,not only do angles make joints stronger they a...,0.75


In [2]:
from rake_nltk import Rake
# import nltk
# nltk.download('punkt_tab')
r = Rake()
def extract_keywords(text, max_keywords=20):
    r.extract_keywords_from_text(text)
    keywords = []
    for phrase in r.get_ranked_phrases():
        keywords.extend(phrase.split())  # Split the phrase into individual words

    # Return the top `max_keywords` words (limiting to `max_keywords` words)
    return keywords[:max_keywords]

df["product_description_keywords"] = df["product_description"].apply(
    lambda desc: extract_keywords(desc)
)


In [3]:
import pickle
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer_title, tokenizer_description, tokenizer_search_term, sequences_title, sequences_description, sequences_search_term = pickle.load(handle)
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# import pickle

# tokenizer_title = Tokenizer()
# tokenizer_title.fit_on_texts(df["product_title"])
# tokenizer_description = Tokenizer()
# tokenizer_description.fit_on_texts(df["product_description_keywords"])
# tokenizer_search_term = Tokenizer()
# tokenizer_search_term.fit_on_texts(df["search_term"])

# sequences_title =       pad_sequences(tokenizer_title.texts_to_sequences(df["product_title"]), padding='post')
# sequences_description = pad_sequences(tokenizer_description.texts_to_sequences(df["product_description_keywords"]), padding='post')
# sequences_search_term = pad_sequences(tokenizer_search_term.texts_to_sequences(df["search_term"]), padding='post')

# with open('tokenizer.pickle', 'wb') as handle:
#     pickle.dump([tokenizer_title, tokenizer_description, tokenizer_search_term,  
#                  sequences_title, sequences_description, sequences_search_term], 
#                 handle, protocol=pickle.HIGHEST_PROTOCOL)


In [4]:
with open('word2vec.pickle', 'rb') as handle:
    word2vec_model = pickle.load(handle)

# import gensim.downloader as api
# word2vec_model = api.load("word2vec-google-news-300")
# with open('word2vec.pickle', 'wb') as handle:
#     pickle.dump(word2vec_model, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [5]:
import numpy as np
vocab_len = len(tokenizer_title.word_index)
embedding_matrix_title = np.empty((len(tokenizer_title.word_index), 300), dtype=np.float16)
for word, i in tokenizer_title.word_index.items():
    if i-1 < len(embedding_matrix_title):  # Ensure index is within bounds
        if word in word2vec_model:
            embedding_matrix_title[i - 1] = word2vec_model[word][:300]
        else:
            embedding_matrix_title[i - 1] = np.random.normal(size=300)  # Optional: handle missing words
    else:
        print(f"Index {i-1} is out of bounds for word: {word}")

embedding_matrix_description =  np.empty((len(tokenizer_description.word_index), 300), dtype=np.float16)
for word, i in tokenizer_description.word_index.items():
    if i-1 < len(embedding_matrix_description):  # Ensure index is within bounds
        if word in word2vec_model:
            embedding_matrix_description[i - 1] = word2vec_model[word][:300]
        else:
            embedding_matrix_description[i - 1] = np.random.normal(size=300)  # Optional: handle missing words
    else:
        print(f"Index {i-1} is out of bounds for word: {word}")        
        
embedding_matrix_search_term =  np.empty((len(tokenizer_search_term.word_index), 300), dtype=np.float16)
for word, i in tokenizer_search_term.word_index.items():
    if i-1 < len(embedding_matrix_search_term):  # Ensure index is within bounds
        if word in word2vec_model:
            embedding_matrix_search_term[i - 1] = word2vec_model[word][:300]
        else:
            embedding_matrix_search_term[i - 1] = np.random.normal(size=300)  # Optional: handle missing words
    else:
        print(f"Index {i-1} is out of bounds for word: {word}") 

In [29]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input,Concatenate,Dropout,GlobalMaxPooling1D,BatchNormalization
import tensorflow as tf
def create_text_submodel_lstm(input_layer, vocab_size, embedding_dim, embedding_matrix, lstm_units=100):
    embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix] if embedding_matrix is not None else None,
        trainable=False
    )(input_layer)
    # if embedding_matrix is not None:
    #     embedding.build((None,))  # Build the layer
    #     embedding.set_weights([embedding_matrix])
    lstm = LSTM(lstm_units, activation='tanh')(embedding)
    output = Dense(1, activation='relu')(lstm)
    return output

def create_text_submodel(input_layer, vocab_size, embedding_dim, embedding_matrix):
    embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix] if embedding_matrix is not None else None,
        trainable=False
    )(input_layer)
    flattened = GlobalMaxPooling1D()(embedding)
    # Replace LSTM with pooling/dense layers
    
    dense0 = Dense(512, activation='relu')(flattened)
    norm0 = BatchNormalization()(dense0)
    dropout0 = Dropout(0.3)(norm0)
    dense1 = Dense(256, activation='relu')(dropout0)
    norm1 = BatchNormalization()(dense1)
    dropout1 = Dropout(0.3)(norm1)
    dense2 = Dense(128, activation='relu')(dropout1)
    norm2 = BatchNormalization()(dense2)
    dropout2 = Dropout(0.3)(norm2)
    dense3 = Dense(64, activation='relu')(dropout1)
    norm3 = BatchNormalization()(dense3)
    dropout3 = Dropout(0.3)(norm3)
    output = Dense(1, activation='relu')(dropout3)
    return output

In [30]:
# class WeightedEnsemble(tf.keras.layers.Layer):
#     def __init__(self, **kwargs):
#         super(WeightedEnsemble, self).__init__(**kwargs)
    
#     def call(self, inputs):
#         # Assuming inputs are [title_output, desc_output, search_output]
#         # You might want to add weights or a more sophisticated combination method
#         return tf.reduce_mean(inputs, axis=0)

def create_relevance_model(sequences_title, sequences_description, sequences_search_term,
                         tokenizer_title, tokenizer_description, tokenizer_search_term,
                         embedding_matrix_title, embedding_matrix_description, embedding_matrix_search_term,
                         embedding_dim=300):
    # Input layers
    title_input = Input(shape=(sequences_title.shape[1],), name='title_input')
    desc_input = Input(shape=(sequences_description.shape[1],), name='description_input')
    search_input = Input(shape=(sequences_search_term.shape[1],), name='search_input')
    
    title_output = create_text_submodel(
        title_input,
        vocab_size=len(tokenizer_title.word_index),
        embedding_dim=embedding_dim,
        embedding_matrix=embedding_matrix_title,

    )
    
    desc_output = create_text_submodel(
        desc_input,
        vocab_size=len(tokenizer_description.word_index),
        embedding_dim=embedding_dim,
        embedding_matrix=embedding_matrix_description,

    )
    
    search_output = create_text_submodel(
        search_input,
        vocab_size=len(tokenizer_search_term.word_index),
        embedding_dim=embedding_dim,
        embedding_matrix=embedding_matrix_search_term,

    )
    

    combined_output = Concatenate()([title_output, desc_output, search_output])
    x = Dense(128, activation='relu')(combined_output)
    x = BatchNormalization()(x)
    x = Dense(64, activation='relu')(combined_output)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    output = Dense(1, activation='linear')(x)
    
    # Create final model
    relevance_model = tf.keras.models.Model(
        inputs=[title_input, desc_input, search_input],
        outputs=output
    )
    
    # Compile model
    relevance_model.compile(
        loss=tf.keras.losses.Huber(),
        optimizer='adam',
        metrics=['accuracy']
    )
    
    return relevance_model

In [31]:
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    'best_model.keras',                # Path where the model will be saved
    monitor='val_loss',             # Metric to monitor
    save_best_only=True,            # Save only the best model
    mode='min',                     # 'min' for loss, 'max' for accuracy
    verbose=0
)

from tensorflow.keras.callbacks import EarlyStopping


early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss',             
    patience=20,                     
    mode='min',                     
    verbose=0,
    restore_best_weights=True      
) 

from tqdm.keras import TqdmCallback
 

In [32]:
model = create_relevance_model(sequences_title, sequences_description, sequences_search_term,
                               tokenizer_title, tokenizer_description, tokenizer_search_term,
                               embedding_matrix_title, embedding_matrix_description, embedding_matrix_search_term)
model.summary()
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.0001), loss="mean_squared_error", metrics=['mae'])
model.predict([sequences_title[0:3], sequences_description[0:3], sequences_search_term[0:3]])

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 title_input (InputLayer)       [(None, 28)]         0           []                               
                                                                                                  
 description_input (InputLayer)  [(None, 20)]        0           []                               
                                                                                                  
 search_input (InputLayer)      [(None, 11)]         0           []                               
                                                                                                  
 embedding_15 (Embedding)       (None, 28, 300)      5762400     ['title_input[0][0]']            
                                                                                            

array([[ 0.0073511 ],
       [ 0.00424599],
       [-0.00895748]], dtype=float32)

In [33]:
model.fit([sequences_title, sequences_description, sequences_search_term]
                     , df["relevance_normalized"]
                     , validation_split=0.2
                     , verbose=0
                     , epochs=100, batch_size=32, callbacks=[
                                                            checkpoint_callback,
                                                            TqdmCallback(verbose=1),
                                                            early_stopping_callback
                                                            ]
            )

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

<keras.callbacks.History at 0x1bf51128ac0>

In [None]:
df.__len__()

74067