Nickhil Tekwani CS6220 Hw6

PROBLEM 1: Recommender System using Collaborative Filtering

In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load data
col_names = ["user_id", "item_id", "rating", "timestamp"]
ratings = pd.read_csv("/content/u.data", sep="\t", names=col_names)

# Train-test split
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

# Create user-item matrix
user_item_matrix = train.pivot(index="user_id", columns="item_id", values="rating")

# User-User Similarity Matrix
similarity_matrix = user_item_matrix.corr()
def predict_rating(user_id, item_id):
    """Predicts a user's rating for an item."""

    # Check if the user or item doesn't exist in the matrices
    if user_id not in user_item_matrix.index or item_id not in user_item_matrix.columns:
        print(f"User: {user_id}, Item: {item_id} not found in the matrices.")
        return user_item_matrix.values.mean()  # Fallback to the global average rating

    # Otherwise, compute prediction using the user-item and similarity matrix
    # You may use dot product of user's ratings and item's similarity scores
    # and divide by the sum of absolute similarities.

    # Ensure the similarity scores for the item are available
    if item_id in similarity_matrix.columns:
        item_similarity = similarity_matrix[item_id]
        user_ratings = user_item_matrix.loc[user_id]

        # Compute the dot product
        dot_product = user_ratings.dot(item_similarity)

        # Calculate the predicted rating
        predicted_rating_value = dot_product / item_similarity.abs().sum()

        return predicted_rating_value

    # Fallback to the user's average rating if the above computation fails
    return user_item_matrix.loc[user_id].mean()



missing_items_user_item = set(test['item_id'].unique()) - set(user_item_matrix.columns)
missing_items_similarity = set(test['item_id'].unique()) - set(similarity_matrix.columns)

print(f"Items in test but not in user_item_matrix: {len(missing_items_user_item)}")
print(f"Items in test but not in similarity_matrix: {len(missing_items_similarity)}")



# Predict ratings on the test set
test["predicted_rating"] = test.apply(lambda x: predict_rating(x["user_id"], x["item_id"]), axis=1)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(test["rating"], test["predicted_rating"]))
print(f"Root Mean Squared Error (RMSE): {rmse}")


Items in test but not in user_item_matrix: 29
Items in test but not in similarity_matrix: 29
User: 655, Item: 1640 not found in the matrices.
User: 425, Item: 1596 not found in the matrices.
User: 60, Item: 1122 not found in the matrices.
User: 206, Item: 1433 not found in the matrices.
User: 854, Item: 1677 not found in the matrices.
User: 181, Item: 1320 not found in the matrices.
User: 405, Item: 1565 not found in the matrices.
User: 405, Item: 1546 not found in the matrices.
User: 450, Item: 1603 not found in the matrices.
User: 279, Item: 1493 not found in the matrices.
User: 782, Item: 1669 not found in the matrices.
User: 405, Item: 1551 not found in the matrices.
User: 682, Item: 1655 not found in the matrices.
User: 181, Item: 1352 not found in the matrices.
User: 655, Item: 1648 not found in the matrices.
User: 234, Item: 1460 not found in the matrices.
User: 445, Item: 1601 not found in the matrices.
User: 437, Item: 1599 not found in the matrices.
User: 13, Item: 814 not fo

ValueError: ignored

PROBLEM 3A: Social Community Detection


In [None]:
import igraph as ig

def girvan_newman(graph, iterations=None):
    """
    Implement the Girvan-Newman algorithm to detect communities.
    """
    # Deep copy of the graph to ensure original remains unchanged
    g = graph.copy()

    # Keeps track of the graphs at each split
    sub_graphs = [g]

    # Either iterate until graph is fully dissected or for a set number of iterations
    i = 0
    while len(sub_graphs) == i + 1:
        if iterations and i >= iterations:
            break

        # Compute the edge betweenness scores
        betweenness = g.edge_betweenness()

        # Find the edge with the highest betweenness score
        max_betweenness = max(betweenness)
        max_index = betweenness.index(max_betweenness)

        # Remove the edge with the highest betweenness
        g.delete_edges(max_index)

        # Get the subgraphs (i.e., the connected components) of the graph
        sub_graphs = g.decompose()

        i += 1

    # Return the resulting subgraphs as communities
    return sub_graphs

def read_edgelist(filename):
    """Read an edge list from a file and return an igraph.Graph."""
    with open(filename, 'r') as f:
        edges = [tuple(map(int, line.strip().split(','))) for line in f]
    return ig.Graph(edges, directed=False)


# Use the custom function to read the graph
g = read_edgelist('/content/soc-Flickr-ASU.edges')

# Read the node labels
with open('/content/soc-Flickr-ASU.node_labels', 'r') as f:
    labels_data = [line.strip().split(',') for line in f]
    node_ids = [int(data[0]) for data in labels_data]
    labels = [data[1] for data in labels_data]

# Assign a 'name' attribute for each vertex based on its ID
g.vs["name"] = [v.index for v in g.vs]

# Map labels to the graph nodes
for idx, label in zip(node_ids, labels):
    try:
        node = g.vs.find(name=idx)  # Use integer idx directly without converting to string
        node["label"] = label
    except ValueError:
        # The node does not exist in the graph, so we ignore it
        pass


# Call the Girvan-Newman function
communities = girvan_newman(g, 10)

# Print the communities
for community in communities:
    print(community.vs["label"])
