# Table of Contents

- [Import Libraries](#section1)
- [load Datasets](#section2)
- [standardize the data](#section3)
- [Methods](#section4)
    -    [](#section41)


## 1. Import necessary libraries <a id="section1"></a>

In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense



## 2. Load Datasets <a id="section2"></a>


In [2]:
# Read the Parquet files into DataFrames
train = pd.read_parquet("../Data/train.parquet")
test = pd.read_parquet("../Data/test.parquet")
val = pd.read_parquet("../Data/val.parquet")
train_val = pd.read_parquet("../Data/train_val.parquet")

# Display the first few rows of each DataFrame to confirm
train.head()


Unnamed: 0,user_id,film,date,target,rating,year,watch_count,fan_count,like_count,review_count,...,'sentiment'_Editing_negative,'sentiment'_Special_Effects_negative,'sentiment'_Other_neutral,'sentiment'_Other_negative,watchlist_length,films_watched,films_this_year,lists_created,following,followers
0,dustymoth,salems-lot-2024,2024-10-06,1.0,2.39,2024.0,104715.0,16.0,14213.0,32580.0,...,0.0,0.0,0.0,0.0,72.0,2402.0,48.0,84.0,32.0,26.0
1,dustymoth,alien-3,2024-09-07,4.0,2.82,1992.0,487939.0,269.0,68239.0,61964.0,...,1.0,0.0,0.0,0.0,72.0,2402.0,48.0,84.0,32.0,26.0
2,dustymoth,lock-stock-and-two-smoking-barrels,2024-09-06,0.5,3.99,1998.0,363721.0,3624.0,96479.0,21158.0,...,0.0,0.0,0.0,0.0,72.0,2402.0,48.0,84.0,32.0,26.0
3,dustymoth,morocco,2024-08-21,3.0,3.58,1930.0,19531.0,63.0,4734.0,3366.0,...,0.0,0.0,0.0,0.0,72.0,2402.0,48.0,84.0,32.0,26.0
4,dustymoth,maxxxine,2024-08-06,1.5,3.09,2024.0,775925.0,2112.0,193118.0,254823.0,...,0.0,0.0,0.0,0.0,72.0,2402.0,48.0,84.0,32.0,26.0


In [3]:
results=pd.read_csv("../Data/results.csv", index_col=0)
results.head()

Unnamed: 0,model_name,param,rmse,mse
0,DecisionTreeRegressor,{'max_depth': '10'},0.906551,0.821834
1,RandomForestRegressor,"{'max_depth': 20, 'max_features': 50, 'n_estim...",0.881634,0.777278
2,AdaBoostRegressor,"{'learning_rate': 0.01, 'loss': 'exponential',...",0.915916,0.838903
3,XGBoostRegressor,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.858019,0.736197


## 3. standardize the data <a id="section3"></a>

In [4]:
# Drop rows with any NaN values
train = train.dropna()
test = test.dropna()
val = val.dropna()
train_val = train_val.dropna()

In [5]:
target_column = 'target'

# Separate the target column from the features
train_features = train.drop(columns=[target_column])
val_features = val.drop(columns=[target_column])

# Separate numeric columns
numeric_cols = train_features.select_dtypes(include=['number']).columns

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the numeric columns of the train data and transform both train and validation sets
train_features_scaled = train_features.copy()
val_features_scaled = val_features.copy()

train_features_scaled[numeric_cols] = scaler.fit_transform(train_features[numeric_cols])
val_features_scaled[numeric_cols] = scaler.transform(val_features[numeric_cols])

# Add the target column back to the scaled features
train_scaled = pd.concat([train_features_scaled, train[target_column]], axis=1)
val_scaled = pd.concat([val_features_scaled, val[target_column]], axis=1)

# ⭐ CORRECT PART HERE
X_train = train_scaled[numeric_cols]
y_train = train_scaled[target_column]

X_val = val_scaled[numeric_cols]
y_val = val_scaled[target_column]


## Methods


## ANN

In [6]:
# Define a simple model without hidden layers (just input -> output)
model = tf.keras.Sequential([
    tf.keras.layers.Dense(1, input_shape=(X_train.shape[1],))
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='mse'
)

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m15716/15716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 1ms/step - loss: 4.6294
Epoch 2/5
[1m15716/15716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 1ms/step - loss: 0.7836
Epoch 3/5
[1m15716/15716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 1ms/step - loss: 0.7830
Epoch 4/5
[1m15716/15716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 1ms/step - loss: 0.7830
Epoch 5/5
[1m15716/15716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 1ms/step - loss: 0.7839


<keras.src.callbacks.history.History at 0x212cc047f50>

In [None]:
import tensorflow as tf

# Define the RNN model
model_rnn = tf.keras.Sequential([
    tf.keras.layers.SimpleRNN(64, input_shape=(X_train.shape[1], 1)),  # RNN layer with 64 units
    tf.keras.layers.Dense(1)  # Output layer
])

# Compile the model
model_rnn.compile(
    optimizer='adam',
    loss='mse'
)

# Train the model
model_rnn.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)


  super().__init__(**kwargs)


Epoch 1/5
[1m15716/15716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 12ms/step - loss: 1.1831
Epoch 2/5
[1m 8920/15716[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m1:20[0m 12ms/step - loss: 0.8125

In [None]:
# Define the CNN model
model_cnn = tf.keras.Sequential([
    tf.keras.layers.Conv1D(64, 3, activation='relu', input_shape=(X_train.shape[1], 1)),  # Conv layer with 64 filters
    tf.keras.layers.MaxPooling1D(2),  # Pooling layer to reduce dimensionality
    tf.keras.layers.Flatten(),  # Flatten the data to feed into a dense layer
    tf.keras.layers.Dense(1)  # Output layer
])

# Compile the model
model_cnn.compile(
    optimizer='adam',
    loss='mse'
)

# Train the model
model_cnn.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)


In [7]:
# Predict on validation set
y_pred = model.predict(X_val).flatten()  # flatten to make it 1D

# Calculate MSE
mse = mean_squared_error(y_val, y_pred)

# Calculate RMSE
rmse = np.sqrt(mse)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
Mean Squared Error (MSE): 0.8251
Root Mean Squared Error (RMSE): 0.9084


In [8]:
# attach result to result dataframe
# Create a new row
new_row = {
    'Unnamed: 0': len(results),
    'model_name': 'TensorFlowLinearModel',
    'param': {"epochs":5, "batch_size":32,"learning_rate":0.001,"hidden_layer":{}},  # No real hyperparameters for this simple linear model
    'rmse': rmse,
    'mse': mse
}

# Append the new row
results = pd.concat([results, pd.DataFrame([new_row])], ignore_index=True)
# Show updated DataFrame
results.head(10)

Unnamed: 0.1,model_name,param,rmse,mse,Unnamed: 0
0,DecisionTreeRegressor,{'max_depth': '10'},0.906551,0.821834,
1,RandomForestRegressor,"{'max_depth': 20, 'max_features': 50, 'n_estim...",0.881634,0.777278,
2,AdaBoostRegressor,"{'learning_rate': 0.01, 'loss': 'exponential',...",0.915916,0.838903,
3,XGBoostRegressor,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.858019,0.736197,
4,TensorFlowLinearModel,"{'epochs': 5, 'batch_size': 32, 'learning_rate...",0.908365,0.825127,4.0


## add hidden layer

In [9]:

# Define a simple model without hidden layers (just input -> output)
model = tf.keras.Sequential([
    tf.keras.layers.Dense(100, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(1)
])

# Compile the model
model.compile(loss="mean_squared_error", optimizer="sgd")

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1, validation_split=0.1)




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m14145/14145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2ms/step - loss: 0.8804 - val_loss: 0.8140
Epoch 2/5
[1m14145/14145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2ms/step - loss: 0.7820 - val_loss: 0.8119
Epoch 3/5
[1m14145/14145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2ms/step - loss: 0.7729 - val_loss: 0.8106
Epoch 4/5
[1m14145/14145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2ms/step - loss: 0.7733 - val_loss: 0.8063
Epoch 5/5
[1m14145/14145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 2ms/step - loss: 0.7724 - val_loss: 0.8167


<keras.src.callbacks.history.History at 0x2131fc6b090>

In [10]:
# Predict on validation set
y_pred = model.predict(X_val).flatten()  # flatten to make it 1D

# Calculate MSE
mse = mean_squared_error(y_val, y_pred)

# Calculate RMSE
rmse = np.sqrt(mse)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Mean Squared Error (MSE): 0.8284
Root Mean Squared Error (RMSE): 0.9102


## Wide and Deep Neural Network

In [11]:

input = keras.layers.Input(shape=X_train.shape[1:])
hidden1 = keras.layers.Dense(10, activation="relu")(input)
hidden2 = keras.layers.Dense(10, activation="relu")(hidden1)
concat = keras.layers.Concatenate()([input, hidden2])
output = keras.layers.Dense(1)(concat)
model = keras.models.Model(inputs=[input], outputs=[output])
# Compile the model
model.compile(loss="mean_squared_error", optimizer="sgd")

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1, validation_split=0.1)

Epoch 1/5


Expected: ['keras_tensor_5']
Received: inputs=Tensor(shape=(None, 92))


[1m14123/14145[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - loss: 0.9088

Expected: ['keras_tensor_5']
Received: inputs=Tensor(shape=(None, 92))


[1m14145/14145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2ms/step - loss: 0.9086 - val_loss: 0.8881
Epoch 2/5
[1m14145/14145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2ms/step - loss: 0.8035 - val_loss: 0.8416
Epoch 3/5
[1m14145/14145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 2ms/step - loss: 0.8020 - val_loss: 0.8320
Epoch 4/5
[1m14145/14145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 2ms/step - loss: 0.7998 - val_loss: 0.8353
Epoch 5/5
[1m14145/14145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2ms/step - loss: 0.7951 - val_loss: 0.8244


<keras.src.callbacks.history.History at 0x2131fc57090>

## tune hyperparameter

In [12]:
X_train.shape[1]

92

In [13]:
def model_builder(hp):
  model = keras.Sequential()
  model.add(keras.layers.Flatten(input_shape=(28, 28)))

  # Tune the number of units in the first Dense layer
  # Choose an optimal value between 32-512
  hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
  model.add(keras.layers.Dense(units=hp_units, activation='relu'))
  model.add(keras.layers.Dense(1))

  # Tune the learning rate for the optimizer
  # Choose an optimal value from 0.01, 0.001, or 0.0001
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

  model.compile(loss="mean_squared_error", optimizer="sgd")

  return model

In [None]:
# Define the HyperModel
class MyHyperModel(kt.HyperModel):
    def build(self, hp):
        model = Sequential()
        
        # Add the input layer (first layer)
        model.add(Dense(units=hp.Int('units_1', min_value=32, max_value=256, step=32),
                        activation='relu', input_shape=(X_train.shape[1],)))
        
        # Add additional hidden layers if needed
        for i in range(hp.Int('num_hidden_layers', 1, 3)):  # Search between 1 and 3 hidden layers
            model.add(Dense(units=hp.Int(f'units_{i+2}', min_value=32, max_value=256, step=32),
                            activation='relu'))
        
        # Add the output layer
        model.add(Dense(1))
        
        # Compile the model
        model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='LOG', default=1e-3)),
                      loss='mean_squared_error')
        
        return model

# Instantiate the HyperModel
hypermodel = MyHyperModel()

# Instantiate the tuner (using Hyperband, but you could also use RandomSearch)
tuner = kt.Hyperband(hypermodel, 
                     objective='val_loss', 
                     max_epochs=10, 
                     factor=3, 
                     directory='my_dir', 
                     project_name='hyperparameter_tuning')

# Perform the hyperparameter search
tuner.search(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Get the best model and hyperparameters
best_model = tuner.get_best_models()[0]
best_hyperparameters = tuner.get_best_hyperparameters()[0]

print("Best Hyperparameters:", best_hyperparameters.values)




Trial 30 Complete [00h 05m 02s]
val_loss: 0.9118149280548096

Best val_loss So Far: 0.7890414595603943
Total elapsed time: 12h 52m 17s
Best Hyperparameters: {'units_1': 224, 'num_hidden_layers': 3, 'units_2': 128, 'learning_rate': 0.0016702505037489304, 'units_3': 192, 'units_4': 224, 'tuner/epochs': 10, 'tuner/initial_epoch': 4, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0021'}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
