In [20]:
import sys
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [41]:
#loading the data set
try:
    df = pd.read_csv("talent.csv")
except Exception as e:
    print("Error reading CSV file:", e)
    sys.exit(1)

df.info()
sys.stdout.flush()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5392 entries, 0 to 5391
Columns: 110 entries, id to in_youtubers
dtypes: float64(2), int64(104), object(4)
memory usage: 4.5+ MB


In [42]:

binary_cols = [col for col in df.columns if col.startswith("in_")]
df[binary_cols] = df[binary_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
df['name'] = df['name'].astype(str).str.strip()

# handling duplicates
duplicates = df[df['name'].duplicated(keep=False)]
if not duplicates.empty:
    print("Warning: Duplicate names found:")
    print(duplicates[['name']])
    df = df.drop_duplicates(subset='name', keep='first')
    print("Dropped duplicates, keeping first occurrence.")

# checking non-binary values
non_binary = [col for col in binary_cols if not df[col].isin([0, 1, np.nan]).all()]
if non_binary:
    print("Warning: Non-binary values found in columns:", non_binary)

# Check sparsity and category distribution
sparsity = (df[binary_cols] == 0).mean().mean()
sys.stdout.flush()

               name
989   Renee Olstead
2326   Brooke Berry
2600           Kyle
2793           Kyle
4116             JP
4175             JP
4182  Amber Diamond
4383  Renee Olstead
4443  Amber Diamond
4969   Brooke Berry
Dropped duplicates, keeping first occurrence.


In [44]:
# creating a new DataFrame X that includes the 'stars' column (a quality score)
# along with a set of binary columns (e.g., features that are either True/False)
X = df[['stars'] + binary_cols].copy()

# normalizing the 'stars' column:
# subtracting the mean and dividing by the standard deviation.
# We add a small number (1e-6) to the standard deviation so we never divide by zero.
X['stars'] = (X['stars'] - X['stars'].mean()) / (X['stars'].std() + 1e-6)

# Calculating moderate weights for the binary columns:
# 1. For each binary column, sum its values (to see how common each feature is).
# 2. Take the reciprocal (1/count) so that features appearing rarely get higher weight.
# 3. Use np.log1p to apply a logarithm that smooths out the weights.
# 4. Clip the weights between 0 and 10 to avoid extreme values.
weights = np.log1p(1 / (df[binary_cols].sum() + 1e-6)).clip(0, 10)

# Add a default weight of 1 for the 'stars' column.
# We combine this with the previously calculated weights for the binary columns.
weights = pd.concat([pd.Series(1.0, index=['stars']), weights])

# Multiply each column in our feature matrix X by its corresponding weight.
# This creates a weighted feature matrix where more important features have a bigger effect.
X_weighted = X.mul(weights)

# Compute the cosine similarity between every pair of rows in the weighted feature matrix.
# The result is a similarity matrix indicating how similar each pair is.
similarity_matrix = cosine_similarity(X_weighted)

# Make sure all similarity values are in the range [0, 1].
# This might be needed in case some small numerical issues cause values to slightly exceed these boundaries.
similarity_matrix = np.clip(similarity_matrix, 0, 1)

# Convert the similarity matrix (a NumPy array) back into a DataFrame.
# Use the names from df['name'] as both the row and column labels.
similarity_df = pd.DataFrame(similarity_matrix, index=df['name'], columns=df['name'])

# Check if any names from the original data are missing in the similarity DataFrame.
missing = set(df['name']) - set(similarity_df.index)
if missing:
    # If names are missing, print a warning message.
    print(f"Warning: Missing names in similarity_df: {missing}")

# Ensure that all printed output is sent to the command prompt immediately.
sys.stdout.flush()


In [45]:


def recommend_similar_items(item_name, sim_df, df, binary_cols, top_n=5):
    # Check if the specified item exists in the similarity DataFrame.
    # If not, raise an error and show some example names.
    if item_name not in sim_df.index:
        raise KeyError(f"'{item_name}' not in similarity DataFrame index. Try: {list(sim_df.index[:5])}")
    
    # Retrieve the similarity scores for the given item.
    # This row contains the similarity between the given item and every other item.
    sim_scores = sim_df.loc[item_name]
    
    # In some cases there might be multiple entries for the same item
    # (for example, duplicate rows). If so, warn the user and use the first entry.
    if isinstance(sim_scores, pd.DataFrame):
        print(f"Warning: Multiple entries for '{item_name}'. Using first row.")
        sim_scores = sim_scores.iloc[0]
    
    # Remove the similarity score for the item with itself.
    # This prevents the function from recommending the same item as its own similar item.
    sim_scores = sim_scores.drop(item_name, errors='ignore')
    
    # Sort the remaining scores in descending order to find the items most similar to the given item.
    # Then, select the top_n items with the highest similarity.
    top_items = sim_scores.sort_values(ascending=False).head(top_n)
    
    # If the highest similarity score is extremely low (close to zero),
    # assume that no similar items were found.
    # In that case, fall back to recommending the most popular items based on the 'stars' rating.
    if top_items.max() < 1e-6:
        top_items = df.nlargest(top_n, 'stars')['name']
        # Create a series of zeros (indicating similarity scores) for the fallback items.
        top_items = pd.Series([0.0] * top_n, index=top_items)
    
    # Return the final series of recommended items and their similarity scores.
    return top_items


def recommend_for_user(user_interests, X, item_names, df, top_n=5):
    # Check if each user interest corresponds to a valid column in the feature matrix X.
    # Build a list of interests that are not found in X.
    invalid_interests = [i for i in user_interests if i not in X.columns]
    
    # If there are any invalid interests, warn the user and remove them.
    if invalid_interests:
        print(f"Warning: Invalid interests {invalid_interests}. Ignoring.")
        user_interests = [i for i in user_interests if i in X.columns]
    
    # If after filtering there are no valid interests left,
    # return the most popular items based on their 'stars' ratings.
    if not user_interests:
        print("No valid interests. Returning popular items.")
        return df.nlargest(top_n, 'stars')['name'].to_series()
    
    # Create a binary vector representing the user's interests.
    # For each column in X, place a 1 if the user is interested in that feature, otherwise 0.
    user_vector = np.array([1 if col in user_interests else 0 for col in X.columns])
    
    # Calculate cosine similarity between each item in X and the user's interest vector.
    # This produces a score for each item, indicating how closely it matches the user's interests.
    sim_scores = cosine_similarity(X, [user_vector]).flatten()
    
    # Convert the similarity scores into a pandas Series for easier handling, 
    # using item_names as the index so each score corresponds to an item.
    sim_series = pd.Series(sim_scores, index=item_names)
    
    # If any values in the similarity scores are NaN, replace them with zero.
    sim_series = sim_series.fillna(0)
    
    # Count how many of the user's interests apply to each item by summing the valid features.
    # This provides an extra measure of how well each item matches the user's preferences.
    interest_counts = df[user_interests].sum(axis=1)
    
    # Enhance the similarity scores by slightly increasing the value based on the number of matching interests.
    # The more interests an item matches, the higher its score becomes (with a 10% boost per interest match).
    sim_series = sim_series * (1 + 0.1 * interest_counts)
    
    # Sort the boosted scores in descending order and select the top_n items.
    top_items = sim_series.sort_values(ascending=False).head(top_n)
    
    # If the maximum similarity after boosting is still extremely low,
    # it's likely that none of the items match the user's interests well.
    # In that case, fall back to returning the most popular items.
    if top_items.max() < 1e-6:
        print("No matching items. Returning popular items.")
        top_items = df.nlargest(top_n, 'stars')['name'].to_series()
    
    # Return the final list of recommended items with their scores.
    return top_items


In [46]:

def precision_at_k(item_name, sim_df, true_df, binary_cols, k=5):
    """
    Compute the precision at k for a given item.
    Precision at k measures the fraction of recommended items (out of k)
    that share at least one common category with the target item.
    """
    try:
        # Generate the top k recommended items for the given item_name.
        recs = recommend_similar_items(item_name, sim_df, true_df, binary_cols, k)
        
        # Find the set of true categories (binary features with value 1)
        # for the target item. This uses the true_df DataFrame.
        target_item_rows = true_df[true_df['name'] == item_name]
        # Filter binary columns where the value is 1 and retrieve the column names
        true_cats = set(
            target_item_rows[binary_cols].columns[
                target_item_rows[binary_cols].iloc[0] == 1
            ]
        )
        
        # If the target item has no true categories marked, precision is 0.
        if not true_cats:
            return 0
        
        # Count how many of the recommended items share at least one category
        # with the target item's true categories.
        relevant_count = 0
        for rec_name in recs.index:
            rec_item_rows = true_df[true_df['name'] == rec_name]
            # Retrieve the set of active categories for the recommended item.
            rec_cats = set(
                rec_item_rows[binary_cols].columns[
                    rec_item_rows[binary_cols].iloc[0] == 1
                ]
            )
            # Check if there is any overlap between the target and recommended categories.
            if true_cats.intersection(rec_cats):
                relevant_count += 1
        
        # Calculate precision as the fraction of recommended items that are relevant.
        return relevant_count / k
    
    except KeyError:
        # If the item is not found in the similarity DataFrame, return 0.
        return 0
        
def compute_coverage(sim_df, df, binary_cols, top_n=5):
    """
    Compute the coverage of the recommendation system.
    Coverage is defined as the proportion of items in the dataset
    that appear in any recommendation list.
    """
    all_recs = set()
    
    # Iterate over each item name in the dataset
    for name in df['name']:
        try:
            # Get the top_n recommendations for the current item.
            recs = recommend_similar_items(name, sim_df, df, binary_cols, top_n)
            # Add the recommended items to the overall set of recommendations.
            all_recs.update(recs.index)
        except KeyError:
            # If the item is not found in the similarity DataFrame, skip it.
            continue
    
    # Calculate the coverage ratio.
    # If there are items in the dataset, divide the number of unique recommendations
    # by the total number of items. Return 0 if the dataset is empty.
    return len(all_recs) / len(df) if len(df) > 0 else 0

def intra_list_similarity(rec_names, sim_df):
    """
    Calculate the average cosine similarity among all unique pairs of items
    in a recommendation list. This measures how similar the recommended items
    are to each other.
    """
    # If there are fewer than 2 recommendations, no pair exists, so return 0.
    if len(rec_names) < 2:
        return 0
    
    # Compute similarity scores for each unique pair in the recommendation list.
    sim_scores = [
        sim_df.loc[n1, n2] 
        for i, n1 in enumerate(rec_names) 
        for n2 in rec_names[i+1:]
    ]
    
    # Return the mean of the similarity scores if any exist; otherwise, return 0.
    return np.mean(sim_scores) if sim_scores else 0


In [49]:


# Choose an item to use as a query for recommendations.
# Here, we select the first valid name from the 'name' column of our DataFrame.
item_of_interest = df['name'].iloc[0]  # First valid name

try:
    # Get top 5 similar items for the chosen item using our recommendation function.
    recs = recommend_similar_items(item_of_interest, similarity_df, df, binary_cols, top_n=5)
    
    # Print a header and the list of recommended items for the selected item.
    print(f"\nRecommendations for '{item_of_interest}':")
    print(recs)
    
    # For the recommended items, sum up the binary category flags to understand
    # which categories/features are most common among them.
    print("Categories of recommended items:")
    print(df[df['name'].isin(recs.index)][binary_cols].sum().sort_values(ascending=False).head(10))
    
    # Calculate intra-list diversity as 1 minus the average cosine similarity among recommendations.
    # This metric helps assess how varied (or diverse) the recommendations are.
    diversity = 1 - intra_list_similarity(recs.index, similarity_df)
    print(f"Intra-list diversity: {diversity:.3f}")
    
except KeyError as e:
    # If the item_of_interest is not found in the similarity DataFrame, catch the error and print it.
    print("KeyError:", e)

# Flush the output buffer to ensure all prints are displayed immediately.
sys.stdout.flush()


# Define a list of user interests using the corresponding binary feature labels.
user_interests = ["in_90_day_fiance", "in_artists"]

try:
    # Generate personalized recommendations based on the user's interests.
    # The function returns the top 5 items that best match the given interests.
    user_recs = recommend_for_user(user_interests, X, df['name'], df, top_n=5)
    
    # Display the recommendations for the given interests.
    print(f"\nRecommendations for interests {user_interests}:")
    print(user_recs)
    
    # Also show the interest flags for the recommended items to verify they align
    # with the user's interests.
    print("Interest flags for recommended items:")
    print(df[df['name'].isin(user_recs.index)][user_interests])
    
except Exception as e:
    # Catch any error during the recommendation process and print an error message.
    print("Error in user recommendations:", e)


# Identify items in the dataset that have all of the specified user interest flags enabled.
both_interests = df[df[user_interests].eq(1).all(axis=1)]['name']
print(f"\nItems with both {user_interests}: {len(both_interests)}")
if not both_interests.empty:
    # If any items match, display the first few.
    print(both_interests.head())


# Evaluate the recommendation system's precision using the full similarity DataFrame.
# We compute precision@k for a 20% random sample of items to speed up the process.
precision_scores = []
for name in df['name'].sample(frac=0.2, random_state=42):
    # Calculate precision@5 for each sampled item.
    precision = precision_at_k(name, similarity_df, df, binary_cols, k=5)
    if precision is not None:
        precision_scores.append(precision)
        
# Compute the average precision over all sampled items.
avg_precision = np.mean(precision_scores) if precision_scores else 0
print(f"\nAverage Precision@5: {avg_precision:.3f}")

# Compute coverage, which measures the proportion of items that appear in any recommendation list.
coverage = compute_coverage(similarity_df, df, binary_cols, top_n=5)
print(f"Coverage: {coverage:.3f}")


# comparing the average stars and reactions of the recommended items
# to the overall dataset averages if those columns are available.
if 'stars' in df.columns and 'reactions' in df.columns:
    try:
        # Get the names of the items recommended based on user interests.
        rec_names = user_recs.index
        
        # Calculate average stars and reactions for these recommended items.
        avg_stars = df[df['name'].isin(rec_names)]['stars'].mean()
        avg_reactions = df[df['name'].isin(rec_names)]['reactions'].mean()
        
        # Also compute the average stars and reactions for the entire dataset.
        dataset_stars = df['stars'].mean()
        dataset_reactions = df['reactions'].mean()
        
        # Print the comparison to see if the recommendations skew toward higher quality or popularity.
        print(f"Average stars of recommendations: {avg_stars:.2f} (dataset avg: {dataset_stars:.2f})")
        print(f"Average reactions of recommendations: {avg_reactions:.2f} (dataset avg: {dataset_reactions:.2f})")
        
    except:
        # If any error occurs while calculating these metrics, print an error message.
        print("Error computing stars/reactions")

# Flush all pending print outputs to the console.
sys.stdout.flush()



Recommendations for 'Perez Hilton':
name
Kristen Ledlow     0.999996
Darren Rovell      0.999994
Matt Hasselbeck    0.999984
Bob Menery         0.999978
Brian Balthazar    0.999957
Name: Perez Hilton, dtype: float64
Categories of recommended items:
in_commentators     5
in_featured         4
in_athletes         1
in_comedians        1
in_football         1
in_reality_tv       1
in_90_day_fiance    0
in_artists          0
in_animals          0
in_american_idol    0
dtype: int64
Intra-list diversity: 0.000

Recommendations for interests ['in_90_day_fiance', 'in_artists']:
0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
dtype: float64
Interest flags for recommended items:
Empty DataFrame
Columns: [in_90_day_fiance, in_artists]
Index: []

Items with both ['in_90_day_fiance', 'in_artists']: 0

Average Precision@5: 0.999
Coverage: 0.681
Average stars of recommendations: nan (dataset avg: 3.69)
Average reactions of recommendations: nan (dataset avg: 10.07)
