In [1]:
import pandas as pd
from scipy.sparse import csr_matrix

# Sample interaction data
# Each row corresponds to an interaction event.
# In a real-world scenario, this data might come from your API calls or database.
data = [
    {"developer": "alice", "project": "projectA", "commits": 5, "pull_requests": 2, "stars": 1},
    {"developer": "alice", "project": "projectB", "commits": 2, "pull_requests": 0, "stars": 1},
    {"developer": "alice", "project": "projectC", "commits": 2, "pull_requests": 1, "stars": 3},
    {"developer": "bob", "project": "projectA", "commits": 3, "pull_requests": 1, "stars": 0},
    {"developer": "charlie", "project": "projectC", "commits": 1, "pull_requests": 0, "stars": 0},
    {"developer": "ask", "project": "projectA", "commits": 2, "pull_requests": 1, "stars": 3}
]

# Convert to DataFrame
df = pd.DataFrame(data)

# Define weights for each interaction type
w_commits = 1.0
w_prs = 2.0
w_stars = 0.5

# Calculate an interaction score for each record
df["interaction"] = (
    w_commits * df["commits"] +
    w_prs * df["pull_requests"] +
    w_stars * df["stars"]
)

# Create the user-item matrix: rows = developers, columns = projects, values = interaction score
user_item_matrix = df.pivot_table(index='developer', columns='project', values='interaction', fill_value=0)

print(user_item_matrix)

project    projectA  projectB  projectC
developer                              
alice           9.5       2.5       5.5
ask             5.5       0.0       0.0
bob             5.0       0.0       0.0
charlie         0.0       0.0       1.0


In [2]:
copy_df = pd.DataFrame(user_item_matrix.to_dict())
copy_df.shape
copy_df.columns[0]


'projectA'

In [66]:
for i in range(copy_df.shape[0]):
    l = []
    for k in copy_df.columns:
        if copy_df[k].iloc[i] != 0:
            l.append(copy_df[k].iloc[i])
    mean = sum(l)/len(l)
    for k in copy_df.columns:
        if copy_df[k].iloc[i] != 0:
            copy_df[k][i] = copy_df[k].iloc[i] - mean
    print(mean)

5.833333333333333
5.5
5.0
1.0


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  copy_df[k][i] = copy_df[k].iloc[i] - mean
  copy_df[k][i] = copy_df[k].iloc[i] - mean
  copy_df[k][i] = copy_df[k].iloc[i] - mean
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the

In [67]:
copy_df

Unnamed: 0,projectA,projectB,projectC
alice,3.666667,-3.333333,-0.333333
ask,0.0,0.0,0.0
bob,0.0,0.0,0.0
charlie,0.0,0.0,0.0


In [10]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity


#for

# Sample interaction data
data = [
    {"developer": "alice", "project": "projectA", "commits": 5, "pull_requests": 2, "stars": 1},
    {"developer": "alice", "project": "projectB", "commits": 2, "pull_requests": 0, "stars": 1},
    {"developer": "alice", "project": "projectC", "commits": 2, "pull_requests": 1, "stars": 3},
    {"developer": "bob", "project": "projectA", "commits": 3, "pull_requests": 1, "stars": 0},
    {"developer": "charlie", "project": "projectC", "commits": 1, "pull_requests": 0, "stars": 0},
    {"developer": "ask", "project": "projectA", "commits": 2, "pull_requests": 1, "stars": 3}
]

# Convert to DataFrame
df = pd.DataFrame(data)

# Define weights
w_commits, w_prs, w_stars = 2.0, 0.5, 4.0

# Compute interaction scores
df["interaction"] = (
    w_commits * df["commits"] +
    w_prs * df["pull_requests"] +
    w_stars * df["stars"]
)

# Create user-item matrix
user_item_matrix = df.pivot_table(index='developer', columns='project', values='interaction', fill_value=0)

# Convert to sparse matrix
sparse_matrix = csr_matrix(user_item_matrix.values)

# Compute cosine similarity between developers
similarities = cosine_similarity(sparse_matrix)

# Convert to DataFrame for readability
similarity_df = pd.DataFrame(similarities, index=user_item_matrix.index, columns=user_item_matrix.index)

# Function to recommend projects
def recommend_projects(target_developer, user_item_matrix, similarity_df, top_n=2):
    if target_developer not in user_item_matrix.index:
        return f"Developer {target_developer} not found."
    
    # Get similar developers
    similar_devs = similarity_df[target_developer].drop(target_developer).sort_values(ascending=False)

    # Get projects the target developer has already interacted with
    seen_projects = set(user_item_matrix.loc[target_developer][user_item_matrix.loc[target_developer] > 0].index)

    # Get projects from similar developers
    recommendations = {}
    for similar_dev, similarity_score in similar_devs.items():
        similar_projects = user_item_matrix.loc[similar_dev][user_item_matrix.loc[similar_dev] > 0].index
        for project in similar_projects:
            if project not in seen_projects:
                recommendations[project] = recommendations.get(project, 0) + similarity_score

    # Sort recommendations by strength and return top N
    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)
    return [project for project, score in sorted_recommendations[:top_n]]

# Example: Recommend projects for Alice
print("Recommended projects for bob:", recommend_projects("bob", user_item_matrix, similarity_df))


Recommended projects for bob: ['projectB', 'projectC']
