<a href="https://colab.research.google.com/github/mmender2/DataScience/blob/main/TwoTowerLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the train / test CSVs from the mounted Google Drive.
train = pd.read_csv('/content/drive/MyDrive/au_train.csv')
test = pd.read_csv('/content/drive/MyDrive/au_test.csv')
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Partition the training columns by dtype. An 'Id' column, if present, is an
# identifier rather than a feature, so it is left out of the numeric list.
num_cols = [
    col
    for col in train.select_dtypes(include=np.number).columns
    if col != 'Id'
]
cat_cols = list(train.select_dtypes(include=['object', 'category']).columns)

print(f"Total numerical features: {len(num_cols)}")
print(f"Total categorical features: {len(cat_cols)}")
print(num_cols)
print(cat_cols)

In [None]:
# Candidate feature scalers (presumably `preprocessing` is
# sklearn.preprocessing -- the import is not visible in this part of the
# notebook). Only one of them needs to be fitted later on.
std_scaler = preprocessing.StandardScaler()
mm_scaler = preprocessing.MinMaxScaler()
robust_scaler = preprocessing.RobustScaler()
quantile_scaler = preprocessing.QuantileTransformer(
    output_distribution='normal',
    n_quantiles=60,
)

In [None]:
# Work on deep copies so the raw `train` / `test` frames stay untouched.
scaled_train_df = train.copy(deep=True)
scaled_test_df = test.copy(deep=True)

# Learn the min/max of every numeric column from the training data only ...
scaled_train_df[num_cols] = mm_scaler.fit_transform(scaled_train_df[num_cols])
# ... and apply that same mapping to the test data. Re-fitting on the test
# frame would put the two sets on different scales and leak test statistics.
scaled_test_df[num_cols] = mm_scaler.transform(scaled_test_df[num_cols])

In [None]:
from sklearn.model_selection import train_test_split
import random
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import OneHotEncoder

def _make_windows(frame, steps):
    """Stack every run of `steps` consecutive rows of `frame` into one array.

    Window ``i`` covers rows ``i .. i + steps - 1``, so the result has shape
    ``(len(frame) - steps + 1, steps, n_columns)``. If `frame` has fewer than
    `steps` rows there are no windows and an empty array is returned.
    """
    values = frame.to_numpy()  # convert once instead of slicing the DataFrame per window
    return np.array([values[i:i + steps] for i in range(len(values) - steps + 1)])


# Separate features from the target and carve out a 30% validation split.
X = scaled_train_df.drop('class', axis=1)
y = scaled_train_df[['class']]
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.3)
# NOTE(review): train_test_split shuffles rows by default, so the windows
# built below group rows that have no inherent ordering -- confirm intended.

# Number of consecutive rows fed to the LSTMs as one sequence.
timeSteps = 10

# The target must not be one-hot encoded as a feature.
if 'class' in cat_cols:
    cat_cols.remove('class')

# Fit the one-hot encoder on the training split only; 'ignore' makes levels
# that were never seen during fitting encode as all-zero rows.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(XTrain[cat_cols])

catTrain = encoder.transform(XTrain[cat_cols])
catTest = encoder.transform(XTest[cat_cols])

encoded_names = encoder.get_feature_names_out(input_features=cat_cols)
catTrain_df = pd.DataFrame(catTrain.toarray(), columns=encoded_names)
catTest_df = pd.DataFrame(catTest.toarray(), columns=encoded_names)

# Numeric feature columns; 'Id' stays excluded, as in the initial selection.
num_cols = [
    col
    for col in XTrain.select_dtypes(include=[np.number]).columns
    if col != 'Id'
]
numTrain = XTrain[num_cols]
numTest = XTest[num_cols]

# Sliding windows of `timeSteps` rows for each tower's input.
numTrainR = _make_windows(numTrain, timeSteps)
numTestR = _make_windows(numTest, timeSteps)
catTrainR = _make_windows(catTrain_df, timeSteps)
catTestR = _make_windows(catTest_df, timeSteps)

# Binary target: 1 where the class label is ' >50K', otherwise 0.
yTrainB = np.where(yTrain == ' >50K', 1, 0)
yTestB = np.where(yTest == ' >50K', 1, 0)
# Each window is labelled with the target of its last row, so the first
# timeSteps - 1 labels have no matching window and are dropped.
yTrainReshaped = yTrainB[timeSteps - 1:]
yTestReshaped = yTestB[timeSteps - 1:]

# [time steps, feature count] for the categorical and numeric towers.
layer_sizesC = [timeSteps, catTrainR.shape[2]]
layer_sizesN = [timeSteps, numTrainR.shape[2]]

In [None]:
class HouseFeatureTower(tf.keras.layers.Layer):
    """Sequence encoder tower: LSTM(64) -> Dense(32) -> Dense(1, sigmoid).

    The LSTM returns the full sequence, so the dense layers are applied at
    every time step and the tower emits one sigmoid value per step.

    Args:
        layer_sizes: two-element sequence ``[time_steps, n_features]`` used
            as the LSTM's ``input_shape``.
    """

    def __init__(self, layer_sizes):
        super().__init__()
        self.encoder = tf.keras.Sequential([
            tf.keras.layers.LSTM(64, input_shape=(layer_sizes[0], layer_sizes[1]), return_sequences=True),
            tf.keras.layers.Dense(units=32),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])

    def call(self, inputs):
        """Encode `inputs` and return the per-time-step tower output."""
        # Debug aid: shows the incoming tensor shape.
        print(f'HouseFeatureTower input shape: {inputs.shape}')
        return self.encoder(inputs)

In [None]:
class SalePriceTower(tf.keras.layers.Layer):
    """Sequence encoder tower: LSTM(64) -> Dense(32) -> Dense(1, sigmoid).

    The LSTM returns the full sequence, so the dense layers are applied at
    every time step and the tower emits one sigmoid value per step.

    Args:
        layer_sizes: two-element sequence ``[time_steps, n_features]`` used
            as the LSTM's ``input_shape``.
    """

    def __init__(self, layer_sizes):
        super().__init__()
        self.encoder = tf.keras.Sequential([
            tf.keras.layers.LSTM(64, input_shape=(layer_sizes[0], layer_sizes[1]), return_sequences=True),
            tf.keras.layers.Dense(units=32),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])

    def call(self, inputs):
        """Encode `inputs` and return the per-time-step tower output."""
        # Debug aid: shows the incoming tensor shape.
        print(f'Q input shape: {inputs.shape}')
        return self.encoder(inputs)

In [None]:


class HousePricePredictionModel(tfrs.models.Model):
    """Two-tower binary classifier over windowed numeric and categorical data.

    Each input sequence goes through its own tower, the tower outputs are
    summarised by separate LSTMs, the two summaries are concatenated, and two
    sigmoid dense layers produce one probability per window.

    Args:
        layer_sizesN: ``[time_steps, n_features]`` of the numeric input.
        layer_sizesC: ``[time_steps, n_features]`` of the categorical input.
    """

    def __init__(self, layer_sizesN, layer_sizesC):
        super().__init__()
        self.query_tower = HouseFeatureTower(layer_sizesN)
        self.candidate_tower = SalePriceTower(layer_sizesC)
        # These LSTMs collapse each tower's per-step output into one vector.
        self.lstm1N = layers.LSTM(64, return_sequences=False)
        self.lstm1C = layers.LSTM(32, return_sequences=False)
        self.concatenate = layers.Concatenate(axis=-1)
        # NOTE(review): a 1-unit sigmoid layer feeding another 1-unit sigmoid
        # layer is a very narrow head -- confirm this is intended.
        self.dense = layers.Dense(1, activation='sigmoid')
        self.final_dense = layers.Dense(1, activation='sigmoid')

    def compute_loss(self, y_true, y_pred):
        """Return the per-sample binary cross-entropy of `y_pred`.

        `y_pred` is already a probability because `final_dense` ends in a
        sigmoid, so it is passed to the loss unchanged; squashing it a second
        time would confine it to roughly [0.5, 0.73] and distort the loss.
        """
        return tf.keras.losses.binary_crossentropy(y_true, y_pred)

    def train_step(self, data):
        """Run one optimisation step on a ``(inputs, labels)`` batch.

        Returns:
            Dict mapping metric names to their current values, including the
            mean training loss of this batch under ``"loss"``.
        """
        x, y = data

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)  # Forward pass
            # Give the labels the same (batch, 1) layout as the predictions.
            y = tf.reshape(y, [-1, 1])
            loss = self.compute_loss(y, y_pred)

        # Back-propagate and update the weights.
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, y_pred)

        results = {m.name: m.result() for m in self.metrics}
        # The loss above is computed outside `compiled_loss`, so no metric
        # tracks it; report it explicitly so it shows up in the training logs.
        results["loss"] = tf.reduce_mean(loss)
        return results

    def test_step(self, data):
        """Evaluate one ``(inputs, labels)`` batch and return the metric dict."""
        x, y_true = data
        # Same label layout as in `train_step`.
        y_true = tf.reshape(y_true, [-1, 1])

        y_pred = self(x, training=False)

        # Updates the metric that tracks the compiled loss.
        self.compiled_loss(y_true, y_pred, regularization_losses=self.losses)

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y_true, y_pred)

        # Includes the loss, which is tracked in self.metrics.
        return {m.name: m.result() for m in self.metrics}

    def call(self, inputs, training=False):
        """Return one probability per window.

        Args:
            inputs: pair ``(numeric_windows, categorical_windows)``; index 0
                feeds the query tower and index 1 the candidate tower.
            training: accepted for Keras compatibility; not forwarded to the
                sub-layers.
        """
        q = inputs[0]
        c = inputs[1]
        query_embeddings = self.query_tower(q)
        candidate_embeddings = self.candidate_tower(c)
        print("Query embeddings shape:", query_embeddings.shape)
        print("Candidate embeddings shape:", candidate_embeddings.shape)

        lstm_outN = self.lstm1N(query_embeddings)
        lstm_outC = self.lstm1C(candidate_embeddings)
        combined = self.concatenate([lstm_outN, lstm_outC])
        x = self.dense(combined)
        output = self.final_dense(x)
        return output


In [None]:
# Build the two-tower model from the numeric and categorical input layouts.
model = HousePricePredictionModel(
    layer_sizesN=layer_sizesN,
    layer_sizesC=layer_sizesC,
)

In [None]:
# Configure the optimizer, loss and reported metrics.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Inspect the one-hot encoded training frame.
# NOTE(review): `summary` is not defined or imported in the visible part of
# this notebook -- confirm where it comes from; if descriptive statistics
# were intended, `catTrain_df.describe()` is the pandas equivalent.
summary(catTrain_df)

In [None]:
# Train for 10 epochs on the windowed training tensors, validating on the
# windowed hold-out split after every epoch.
model.fit(
    x=[numTrainR, catTrainR],
    y=yTrainReshaped,
    epochs=10,
    verbose=1,
    validation_data=([numTestR, catTestR], yTestReshaped),
)


In [None]:
# Build the evaluation tensors from the scaled test file.

# The target must not be one-hot encoded as a feature.
if 'class' in cat_cols:
    cat_cols.remove('class')

# Reuse the encoder fitted on the training split instead of fitting a new one
# on the test data: a re-fitted encoder can emit a different set and order of
# one-hot columns than the model was trained on. Levels unseen in training
# encode as all zeros because the encoder uses handle_unknown='ignore'.
catTest = encoder.transform(scaled_test_df[cat_cols])
catTest_df = pd.DataFrame(catTest.toarray(), columns=encoder.get_feature_names_out(input_features=cat_cols))

# Guarantee exactly the training columns, in the training order.
catTest_df = catTest_df.reindex(columns=catTrain_df.columns, fill_value=0)

# Use the numeric columns the model was trained on, in the same order.
num_cols = numTrain.columns.tolist()
numTest = scaled_test_df[num_cols]

# Sliding windows of `timeSteps` consecutive rows, matching the training
# layout. Converting to NumPy once avoids slicing the DataFrame per window.
numTestValues = numTest.to_numpy()
catTestValues = catTest_df.to_numpy()
numTestR = np.array([numTestValues[i:i + timeSteps] for i in range(len(numTestValues) - timeSteps + 1)])
catTestR = np.array([catTestValues[i:i + timeSteps] for i in range(len(catTestValues) - timeSteps + 1)])

# Binary target: 1 where the class label is ' >50K', otherwise 0.
# NOTE(review): verify the test file spells the label exactly ' >50K'.
yTestB = np.where(scaled_test_df['class'] == ' >50K', 1, 0)
# Each window is labelled by its last row, so drop the first timeSteps - 1
# labels; otherwise the labels and the predictions differ in length.
yTestB = yTestB[timeSteps - 1:]


In [None]:
# Score the test windows. `predict` runs the forward pass in batches and
# returns a NumPy array, rather than pushing the whole test set through the
# model in a single call.
predictions = model.predict([numTestR, catTestR])

In [None]:
# Sweep evenly spaced decision thresholds over [0, 1] and keep the one that
# yields the highest F1-score.
# NOTE(review): the threshold is tuned on the same data the final metrics
# are reported on, which makes those metrics optimistic.
num_thresholds = 100
thresholds = np.linspace(0, 1, num_thresholds)

# Best score seen so far and the threshold that produced it.
best_f1 = 0
optimal_threshold = 0

# Convert the model output to an array once, outside the loop.
predicted_probs = np.asarray(predictions)

for threshold in thresholds:
    # Probabilities at or above the threshold become the positive class.
    predicted_labels = np.where(predicted_probs >= threshold, 1, 0)

    f1 = f1_score(yTestB, predicted_labels)

    # Strict '>' keeps the lowest threshold among ties.
    if f1 > best_f1:
        best_f1 = f1
        optimal_threshold = threshold

print("Optimal Threshold:", optimal_threshold)
print("Best F1-score:", best_f1)


In [None]:
# Final metrics at the selected threshold.
predicted_probs = np.asarray(predictions).ravel()
# '>=' matches the comparison used while searching for the threshold.
predicted_labels = np.where(predicted_probs >= optimal_threshold, 1, 0)

precision = precision_score(yTestB, predicted_labels)
recall = recall_score(yTestB, predicted_labels)
f1 = f1_score(yTestB, predicted_labels)
# AUROC is a ranking metric, so it is computed from the raw probabilities;
# feeding it thresholded labels collapses the ROC curve to a single point.
auroc = roc_auc_score(yTestB, predicted_probs)

print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUROC Score:', auroc)