## Libraries

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

from surprise import KNNBasic
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# Keep this import AFTER the surprise one above: both libraries export a
# `train_test_split`, and the later import is the one the name ends up bound to.
from sklearn.model_selection import train_test_split

import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow import keras
from tensorflow.keras import layers

## Functions

In [None]:
def generate_recommendations_for_one_user(enrolled_course_ids, unselected_course_ids, id_idx_dict, sim_matrix, threshold=0.6):
    """Recommend unselected courses that are similar to a user's enrolled courses.

    Every (enrolled, unselected) pair whose ids both appear in ``id_idx_dict``
    is scored with ``sim_matrix``. An unselected course is kept when its
    similarity to at least one enrolled course is strictly greater than
    ``threshold``; when several enrolled courses qualify it, the highest
    similarity is reported.

    :param enrolled_course_ids: iterable of course ids the user already has
    :param unselected_course_ids: iterable of candidate course ids
    :param dict id_idx_dict: maps a course id to its index in ``sim_matrix``
    :param sim_matrix: 2-D structure indexable as ``sim_matrix[i][j]``
    :param float threshold: minimum (exclusive) similarity for a recommendation;
        defaults to 0.6
    :return: dict of ``course id -> best similarity``, ordered from most to
        least similar (ties keep the order in which courses were first found)
    """
    # Resolve each id to its matrix index once, instead of once per pair.
    # Ids missing from id_idx_dict are skipped.
    enrolled_indices = [id_idx_dict[course] for course in enrolled_course_ids if course in id_idx_dict]
    candidates = [(course, id_idx_dict[course]) for course in unselected_course_ids if course in id_idx_dict]

    res = {}
    for enrolled_idx in enrolled_indices:
        sim_row = sim_matrix[enrolled_idx]
        for unselect_course, unselect_idx in candidates:
            sim = sim_row[unselect_idx]
            # Keep the best similarity seen so far for this candidate
            if sim > threshold and (unselect_course not in res or sim >= res[unselect_course]):
                res[unselect_course] = sim

    # Sort the results by similarity, highest first (sorted() is stable)
    return dict(sorted(res.items(), key=lambda item: item[1], reverse=True))

In [None]:
def generate_recommendations_for_all():
    """Generate similarity-based course recommendations for every test user.

    Reads the notebook-level ``test_users_df`` (columns ``user`` and ``item``),
    ``course_df`` (column ``COURSE_ID``), ``id_idx_dict`` and ``sim_matrix``.

    :return: tuple ``(users, courses, sim_scores)`` of three parallel lists:
        the user ids, the recommended course ids per user, and the matching
        similarity scores per user
    """
    users = []
    courses = []
    sim_scores = []

    all_courses = set(course_df['COURSE_ID'])

    # Collect ALL enrolled courses per user from the raw enrolment rows.
    # Aggregating with groupby(...).max() first would leave one row per user,
    # i.e. a single "enrolled" course, and every other enrolled course would
    # be treated as a recommendation candidate.
    enrolled_by_user = test_users_df.groupby('user')['item'].apply(set)

    for user_id, enrolled_course_ids in enrolled_by_user.items():
        unselected_course_ids = all_courses.difference(enrolled_course_ids)

        # Score the candidates for this user
        recommendations = generate_recommendations_for_one_user(
            enrolled_course_ids, unselected_course_ids, id_idx_dict, sim_matrix
        )

        # Append results to the parallel lists
        users.append(user_id)
        courses.append(list(recommendations.keys()))
        sim_scores.append(list(recommendations.values()))

    return users, courses, sim_scores

# Example usage
users, recommended_courses, sim_scores = generate_recommendations_for_all()

## Define & Evaluate basic regression

In [None]:
# Baseline: ridge regression with a fixed regularisation strength
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Build the model and train it on the training split in one step
# (fit() returns the fitted estimator itself)
ridge_model = Ridge(alpha=0.2).fit(X_train, y_train)

# Hold-out performance: predict the test split and score it
y_pred = ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# 5-fold cross-validation on the training split. The scorer reports the
# NEGATED mean squared error, so flip the sign to get a positive MSE.
cv_scores = cross_val_score(ridge_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
average_mse = -cv_scores.mean()
print(f'Average Cross-Validated Mean Squared Error: {average_mse}')



## Regression with different hyperparameters and evaluate

In [None]:
# Tune Ridge, Lasso and ElasticNet with a 5-fold grid search, then compare
# their RMSE on the held-out test split.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np
import pandas as pd

# X (features) and y (target) must already exist; hold out 30% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Candidate hyperparameters for each model
ridge_params = {'alpha': [0.1, 1, 10]}
lasso_params = {'alpha': [0.1, 1, 10]}
elasticnet_params = {'alpha': [0.1, 1, 10], 'l1_ratio': [0.1, 0.5, 0.9]}

# Selection criterion for the grid search: mean squared error, registered with
# greater_is_better=False so that a lower error ranks as a better score.
# (RMSE is only computed afterwards, on the test split.)
scorer = make_scorer(mean_squared_error, greater_is_better=False)


def _grid_search(estimator, param_grid):
    """Run a 5-fold grid search for `estimator` on the training split and return the fitted search."""
    search = GridSearchCV(estimator, param_grid, scoring=scorer, cv=5)
    search.fit(X_train, y_train)
    return search


def _test_rmse(search):
    """Return the RMSE of the fitted search's predictions on the test split."""
    return np.sqrt(mean_squared_error(y_test, search.predict(X_test)))


# Ridge Regression
ridge_model = Ridge()
ridge_grid = _grid_search(ridge_model, ridge_params)

# Lasso Regression
lasso_model = Lasso()
lasso_grid = _grid_search(lasso_model, lasso_params)

# ElasticNet Regression
elasticnet_model = ElasticNet()
elasticnet_grid = _grid_search(elasticnet_model, elasticnet_params)

# Report the winning hyperparameters and the test-set RMSE for each model
print("Ridge Best Hyperparameters: ", ridge_grid.best_params_)
print("Ridge RMSE on Test Set: ", _test_rmse(ridge_grid))

print("Lasso Best Hyperparameters: ", lasso_grid.best_params_)
print("Lasso RMSE on Test Set: ", _test_rmse(lasso_grid))

print("ElasticNet Best Hyperparameters: ", elasticnet_grid.best_params_)
print("ElasticNet RMSE on Test Set: ", _test_rmse(elasticnet_grid))

## Define & Evaluate Classification Models
### Note: Accuracy metrics switched for RMSE; however, labels remain "Accuracy"

In [None]:
# Split the data into training and testing sets
# NOTE(review): `X`, `y` and the seed `rs` are not defined in this cell —
# confirm an earlier cell defines them.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rs)


def _fit_and_report(model, label):
    """Fit `model` on the training split, predict the test split and print its RMSE.

    :param model: an unfitted scikit-learn classifier (fitted in place)
    :param str label: model name used in the printed line
    :return: tuple ``(predictions, rmse)``
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # The reported metric is RMSE (the printed label still says "Accuracy").
    # np.sqrt is used, as in the regression cells; a bare `sqrt` is not
    # imported anywhere in this notebook.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print(f"{label} Accuracy: ", rmse)
    return predictions, rmse


# Logistic Regression
logreg_model = LogisticRegression()
logreg_predictions, logreg_accuracy = _fit_and_report(logreg_model, "Logistic Regression")

# Decision Tree
tree_model = DecisionTreeClassifier(max_depth=5, random_state=rs)
tree_predictions, tree_accuracy = _fit_and_report(tree_model, "Decision Tree")

# Support Vector Machine (SVM)
svm_model = SVC()
svm_predictions, svm_accuracy = _fit_and_report(svm_model, "SVM")

# Random Forest
rf_model = RandomForestClassifier(max_depth=5, random_state=rs)
rf_predictions, rf_accuracy = _fit_and_report(rf_model, "Random Forest")

# Bagging
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional slot works with both.
bagging_model = BaggingClassifier(DecisionTreeClassifier(), n_estimators=10, random_state=rs)
bagging_predictions, bagging_accuracy = _fit_and_report(bagging_model, "Bagging")

# Boosting (AdaBoost) with decision stumps as the base learner
adaboost_model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=50, random_state=rs)
adaboost_predictions, adaboost_accuracy = _fit_and_report(adaboost_model, "AdaBoost")

In [None]:
### The main evaluation metrics could be accuracy, recall, precision, F score, and AUC.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix

# The models below must already be fitted (see the classification cell above),
# and X_test / y_test must be the matching held-out split.
models = [logreg_model, tree_model, svm_model, rf_model, bagging_model, adaboost_model]

for model in models:
    model_name = type(model).__name__

    # Hard class predictions for accuracy / precision / recall / F1
    predictions = model.predict(X_test)

    # Accuracy
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy for {model_name}: {accuracy:.4f}")

    # Precision, Recall, F1 Score
    # NOTE(review): average='binary' assumes a two-class target — confirm.
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, predictions, average='binary')
    print(f"Precision for {model_name}: {precision:.4f}")
    print(f"Recall for {model_name}: {recall:.4f}")
    print(f"F1 Score for {model_name}: {f1:.4f}")

    # AUC Score: needs a continuous score per sample. Not every classifier
    # offers predict_proba (SVC() built without probability=True does not),
    # so fall back to decision_function, which roc_auc_score also accepts.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = model.decision_function(X_test)
    auc = roc_auc_score(y_test, scores)
    print(f"AUC for {model_name}: {auc:.4f}")

    print("\n")

## Create & evaluate using KNN (Use numpy, pandas, and sklearn)

In [None]:
# Pivot the long-format ratings into one row per user and one column per item.
# Missing (user, item) pairs become 0, and 'user' is turned back into a column.
ratings = pd.DataFrame(
    rating_df
    .pivot(index='user', columns='item', values='rating')
    .fillna(0)
    .reset_index()
    .rename_axis(index=None, columns=None)
)

# User ids live in the first column; every remaining column is one item
user_ids = ratings['user']
rating_columns = ratings.columns[1:]

# Plain array of ratings: users as rows, items as columns
user_item_matrix = ratings[rating_columns].values

# Hold out 20% of the USERS (rows), keeping the ids aligned with the rows
train_matrix, test_matrix, train_user_ids, test_user_ids = train_test_split(
    user_item_matrix, user_ids, test_size=0.2, random_state=42
)

# Pairwise cosine similarity between the training users
user_similarity_matrix = cosine_similarity(train_matrix)

# Label rows and columns with the training user ids so lookups can use ids
user_similarity_df = pd.DataFrame(user_similarity_matrix, index=train_user_ids, columns=train_user_ids)


## - For each user, find its k nearest neighbors in the sim matrix
def find_k_nearest_neighbors2(user_similarity_matrix, user_id, k):
    """Return the k users most similar to `user_id`.

    :param user_similarity_matrix: DataFrame of pairwise similarities whose
        index and columns are both labelled with user ids
    :param user_id: the user to look up
    :param int k: number of neighbours to return
    :return: Series of similarities indexed by neighbour id, most similar
        first and excluding `user_id` itself; an empty float Series when
        `user_id` is not in the matrix index
    """
    # Unknown user: nothing to rank
    if user_id not in user_similarity_matrix.index:
        return pd.Series(dtype=float)

    # Rank everyone by similarity, remove the user's own entry, keep the top k
    ranked = user_similarity_matrix.loc[user_id].sort_values(ascending=False)
    return ranked.drop(user_id).head(k)

# Estimate each test rating as the similarity-weighted average of the ratings
# given to that item by the user's k nearest neighbours, then score with RMSE.
# NOTE(review): `testset` (iterated as (user, item, rating) triples) and `k`
# are not defined in this cell — confirm an earlier cell defines them.

# Training ratings labelled by user id (rows) and item id (columns), so a
# neighbour's rating can be looked up by id rather than by array position
train_ratings_df = pd.DataFrame(train_matrix, index=train_user_ids, columns=rating_columns)

# Define the estimated ratings list
estimated_ratings = []

# Loop through each user-item pair in the test dataset
for user_id, item_id, actual_rating in testset:
    # Find the k nearest neighbors for the user (empty if the user is not
    # among the training users)
    user_neighbors = find_k_nearest_neighbors2(user_similarity_df, user_id, k)

    # Fallback used when there are no neighbours, the item is unknown, or no
    # neighbour has rated the item. You can adjust this value.
    estimated_rating = 0

    if len(user_neighbors) > 0 and item_id in train_ratings_df.columns:
        numerator = 0
        denominator = 0
        # .items() yields (neighbour user id, similarity) pairs
        for neighbor_id, similarity in user_neighbors.items():
            neighbor_rating = train_ratings_df.at[neighbor_id, item_id]
            # 0 is the placeholder written by fillna(0) for "not rated", so
            # such neighbours do not contribute to the weighted average
            if neighbor_rating != 0:
                numerator += similarity * neighbor_rating
                denominator += similarity
        if denominator != 0:
            estimated_rating = numerator / denominator

    # Append the estimated rating to the list
    estimated_ratings.append((user_id, item_id, actual_rating, estimated_rating))

# Convert the estimated ratings list to a Pandas DataFrame for further analysis
estimated_ratings_df = pd.DataFrame(estimated_ratings, columns=['User', 'Item', 'Actual_Rating', 'Estimated_Rating'])

# Evaluate with RMSE
squared_errors = (estimated_ratings_df['Actual_Rating'] - estimated_ratings_df['Estimated_Rating']) ** 2

# Stored as `mse`, not `mean_squared_error`, so the scikit-learn function of
# that name stays usable in other cells
mse = np.mean(squared_errors)
rmse = np.sqrt(mse)

# The 'rmse' variable now contains the RMSE value
print(f'RMSE: {rmse}')

## Implement Customized ANN

In [None]:
## Update RecommenderNet() class
class ImprovedRecommenderNet(keras.Model):
    """Embedding-based recommender with dense layers on top of the user-item score.

    Each user and item gets an embedding vector and a scalar bias. The per-row
    dot product of the two embeddings plus both biases is passed through two
    hidden dense layers and a final sigmoid unit.
    """

    def __init__(self, num_users, num_items, embedding_size=32, num_hidden_units=64, activation='relu', **kwargs):
        """
        Constructor
        :param int num_users: number of users
        :param int num_items: number of items
        :param int embedding_size: the size of embedding vector
        :param int num_hidden_units: the number of units in hidden layers
        :param str activation: activation function for hidden layers
        :param kwargs: forwarded to ``keras.Model``
        """
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_size = embedding_size
        self.num_hidden_units = num_hidden_units
        self.activation = activation

        # Define user and item embedding layers
        self.user_embedding_layer = layers.Embedding(
            input_dim=num_users,
            output_dim=embedding_size,
            name='user_embedding_layer',
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.item_embedding_layer = layers.Embedding(
            input_dim=num_items,
            output_dim=embedding_size,
            name='item_embedding_layer',
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )

        # Define user and item bias layers (one scalar per user / item)
        self.user_bias = layers.Embedding(
            input_dim=num_users,
            output_dim=1,
            name="user_bias")
        self.item_bias = layers.Embedding(
            input_dim=num_items,
            output_dim=1,
            name="item_bias")

        # Additional hidden layers
        self.hidden_layers = [layers.Dense(num_hidden_units, activation=activation) for _ in range(2)]

        # Output layer: a single sigmoid unit, so predictions lie in (0, 1)
        self.output_layer = layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        """
        Forward pass
        :param inputs: 2-D integer tensor; column 0 holds the user index and
            column 1 the item index
        :return: one sigmoid-activated score per input row
        """
        user_ids = inputs[:, 0]
        item_ids = inputs[:, 1]

        user_vector = self.user_embedding_layer(user_ids)
        user_bias = self.user_bias(user_ids)
        item_vector = self.item_embedding_layer(item_ids)
        item_bias = self.item_bias(item_ids)

        # Per-row dot product of the user and item embeddings.
        # tf.tensordot(user_vector, item_vector, 2) would contract the batch
        # axis as well and return ONE scalar for the whole batch, so every
        # row's score would depend on all other rows in the batch.
        dot_user_item = tf.reduce_sum(user_vector * item_vector, axis=1, keepdims=True)

        # Add user and item biases
        x = dot_user_item + user_bias + item_bias

        # Pass through hidden layers
        for layer in self.hidden_layers:
            x = layer(x)

        # Final output layer
        return self.output_layer(x)
## compile and fit the updated model
# NOTE(review): num_users, num_items and the x_train / y_train / x_val / y_val
# splits are not defined in this cell — confirm an earlier cell defines them.
embedding_size = 32

# Instantiate the class defined above, so the hidden layers are actually
# part of the model being trained
model2 = ImprovedRecommenderNet(num_users, num_items, embedding_size)

# NOTE(review): the model ends in a sigmoid unit, so MSE against the targets
# presumably expects ratings scaled to [0, 1] — confirm.
model2.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=keras.optimizers.Adam(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()])

history2 = model2.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=10, batch_size=64, verbose=1)

model2.save("ann_model2", save_format="tf")

# Access validation loss from the history object (one value per epoch)
val_loss = history2.history['val_loss']

# Plot the validation loss
plt.plot(val_loss, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()