In [20]:
import sys
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [22]:
#loading the data set
try:
    df = pd.read_csv("talent.csv")
except Exception as e:
    print("Error reading CSV file:", e)
    sys.exit(1)

df.info()
sys.stdout.flush()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5392 entries, 0 to 5391
Columns: 110 entries, id to in_youtubers
dtypes: float64(2), int64(104), object(4)
memory usage: 4.5+ MB


In [24]:

binary_cols = [col for col in df.columns if col.startswith("in_")]
df[binary_cols] = df[binary_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
df['name'] = df['name'].astype(str).str.strip()

# handling duplicates
duplicates = df[df['name'].duplicated(keep=False)]
if not duplicates.empty:
    print("Warning: Duplicate names found:")
    print(duplicates[['name']])
    df = df.drop_duplicates(subset='name', keep='first')
    print("Dropped duplicates, keeping first occurrence.")

# checking non-binary values
non_binary = [col for col in binary_cols if not df[col].isin([0, 1, np.nan]).all()]
if non_binary:
    print("Warning: Non-binary values found in columns:", non_binary)

# Check sparsity and category distribution
sparsity = (df[binary_cols] == 0).mean().mean()
sys.stdout.flush()

In [40]:
# creating a new DataFrame X that includes the 'stars' column (a quality score)
# along with a set of binary columns (e.g., features that are either True/False)
X = df[['stars'] + binary_cols].copy()

# Normalize the 'stars' column:
# Subtract the mean and divide by the standard deviation.
# We add a small number (1e-6) to the standard deviation so we never divide by zero.
X['stars'] = (X['stars'] - X['stars'].mean()) / (X['stars'].std() + 1e-6)

# Calculate moderate weights for the binary columns:
# 1. For each binary column, sum its values (to see how common each feature is).
# 2. Take the reciprocal (1/count) so that features appearing rarely get higher weight.
# 3. Use np.log1p to apply a logarithm that smooths out the weights.
# 4. Clip the weights between 0 and 10 to avoid extreme values.
weights = np.log1p(1 / (df[binary_cols].sum() + 1e-6)).clip(0, 10)

# Add a default weight of 1 for the 'stars' column.
# We combine this with the previously calculated weights for the binary columns.
weights = pd.concat([pd.Series(1.0, index=['stars']), weights])

# Multiply each column in our feature matrix X by its corresponding weight.
# This creates a weighted feature matrix where more important features have a bigger effect.
X_weighted = X.mul(weights)

# Compute the cosine similarity between every pair of rows in the weighted feature matrix.
# The result is a similarity matrix indicating how similar each pair is.
similarity_matrix = cosine_similarity(X_weighted)

# Make sure all similarity values are in the range [0, 1].
# This might be needed in case some small numerical issues cause values to slightly exceed these boundaries.
similarity_matrix = np.clip(similarity_matrix, 0, 1)

# Convert the similarity matrix (a NumPy array) back into a DataFrame.
# Use the names from df['name'] as both the row and column labels.
similarity_df = pd.DataFrame(similarity_matrix, index=df['name'], columns=df['name'])

# Check if any names from the original data are missing in the similarity DataFrame.
missing = set(df['name']) - set(similarity_df.index)
if missing:
    # If names are missing, print a warning message.
    print(f"Warning: Missing names in similarity_df: {missing}")

# Ensure that all printed output is sent to the command prompt immediately.
sys.stdout.flush()


In [37]:
# -------------------------
# 4. Define Recommendation Functions
# -------------------------
def recommend_similar_items(item_name, sim_df, df, binary_cols, top_n=5):
    if item_name not in sim_df.index:
        raise KeyError(f"'{item_name}' not in similarity DataFrame index. Try: {list(sim_df.index[:5])}")
    sim_scores = sim_df.loc[item_name]
    if isinstance(sim_scores, pd.DataFrame):
        print(f"Warning: Multiple entries for '{item_name}'. Using first row.")
        sim_scores = sim_scores.iloc[0]
    sim_scores = sim_scores.drop(item_name, errors='ignore')
    top_items = sim_scores.sort_values(ascending=False).head(top_n)
    # NEW: Fallback to popular items if all zeros
    if top_items.max() < 1e-6:
        top_items = df.nlargest(top_n, 'stars')['name']
        top_items = pd.Series([0.0] * top_n, index=top_items)
    return top_items


def recommend_for_user(user_interests, X, item_names, df, top_n=5):
    invalid_interests = [i for i in user_interests if i not in X.columns]
    if invalid_interests:
        print(f"Warning: Invalid interests {invalid_interests}. Ignoring.")
        user_interests = [i for i in user_interests if i in X.columns]
    if not user_interests:
        print("No valid interests. Returning popular items.")
        return df.nlargest(top_n, 'stars')['name'].to_series()
    user_vector = np.array([1 if col in user_interests else 0 for col in X.columns])
    sim_scores = cosine_similarity(X, [user_vector]).flatten()
    sim_series = pd.Series(sim_scores, index=item_names)
    # NEW: Handle NaN and boost multi-interest matches
    sim_series = sim_series.fillna(0)
    interest_counts = df[user_interests].sum(axis=1)
    sim_series = sim_series * (1 + 0.1 * interest_counts)
    top_items = sim_series.sort_values(ascending=False).head(top_n)
    if top_items.max() < 1e-6:
        print("No matching items. Returning popular items.")
        top_items = df.nlargest(top_n, 'stars')['name'].to_series()
    return top_items

In [38]:
# -------------------------
# 5. Evaluation Functions
# -------------------------
def precision_at_k(item_name, sim_df, true_df, binary_cols, k=5):
    try:
        recs = recommend_similar_items(item_name, sim_df, true_df, binary_cols, k)
        true_cats = set(true_df[true_df['name'] == item_name][binary_cols].columns[
            true_df[true_df['name'] == item_name][binary_cols].iloc[0] == 1])
        if not true_cats:
            return 0
        relevant_count = 0
        for rec_name in recs.index:
            rec_cats = set(true_df[true_df['name'] == rec_name][binary_cols].columns[
                true_df[true_df['name'] == rec_name][binary_cols].iloc[0] == 1])
            if true_cats.intersection(rec_cats):
                relevant_count += 1
        return relevant_count / k
    except KeyError:
        return 0
        
def compute_coverage(sim_df, df, binary_cols, top_n=5):
    all_recs = set()
    for name in df['name']:
        try:
            recs = recommend_similar_items(name, sim_df, df, binary_cols, top_n)
            all_recs.update(recs.index)
        except KeyError:
            continue
    return len(all_recs) / len(df) if len(df) > 0 else 0

def intra_list_similarity(rec_names, sim_df):
    if len(rec_names) < 2:
        return 0
    sim_scores = [sim_df.loc[n1, n2] for i, n1 in enumerate(rec_names) 
                  for n2 in rec_names[i+1:]]
    return np.mean(sim_scores) if sim_scores else 0


In [39]:
# -------------------------
# 6. Example Usage and Evaluation
# -------------------------
# Select valid item (NEW: From df['name'])
item_of_interest = df['name'].iloc[0]  # First valid name
try:
    recs = recommend_similar_items(item_of_interest, similarity_df, df, binary_cols, top_n=5)
    print(f"\nRecommendations for '{item_of_interest}':")
    print(recs)
    print("Categories of recommended items:")
    print(df[df['name'].isin(recs.index)][binary_cols].sum().sort_values(ascending=False).head(10))
    diversity = 1 - intra_list_similarity(recs.index, similarity_df)
    print(f"Intra-list diversity: {diversity:.3f}")
except KeyError as e:
    print("KeyError:", e)
sys.stdout.flush()


user_interests = ["in_90_day_fiance", "in_artists"]
try:
    user_recs = recommend_for_user(user_interests, X, df['name'], df, top_n=5)
    print(f"\nRecommendations for interests {user_interests}:")
    print(user_recs)
    print("Interest flags for recommended items:")
    print(df[df['name'].isin(user_recs.index)][user_interests])
except Exception as e:
    print("Error in user recommendations:", e)


both_interests = df[df[user_interests].eq(1).all(axis=1)]['name']
print(f"\nItems with both {user_interests}: {len(both_interests)}")
if not both_interests.empty:
    print(both_interests.head())

# Use full similarity_df for evaluation (NEW: Avoid train/test issues)
precision_scores = []
for name in df['name'].sample(frac=0.2, random_state=42):  # Subsample for speed
    precision = precision_at_k(name, similarity_df, df, binary_cols, k=5)
    if precision is not None:
        precision_scores.append(precision)
avg_precision = np.mean(precision_scores) if precision_scores else 0
print(f"\nAverage Precision@5: {avg_precision:.3f}")

coverage = compute_coverage(similarity_df, df, binary_cols, top_n=5)
print(f"Coverage: {coverage:.3f}")

if 'stars' in df.columns and 'reactions' in df.columns:
    try:
        rec_names = user_recs.index
        avg_stars = df[df['name'].isin(rec_names)]['stars'].mean()
        avg_reactions = df[df['name'].isin(rec_names)]['reactions'].mean()
        dataset_stars = df['stars'].mean()
        dataset_reactions = df['reactions'].mean()
        print(f"Average stars of recommendations: {avg_stars:.2f} (dataset avg: {dataset_stars:.2f})")
        print(f"Average reactions of recommendations: {avg_reactions:.2f} (dataset avg: {dataset_reactions:.2f})")
    except:
        print("Error computing stars/reactions")
sys.stdout.flush()



Recommendations for 'Perez Hilton':
name
Kristen Ledlow     0.999996
Darren Rovell      0.999994
Matt Hasselbeck    0.999984
Bob Menery         0.999978
Brian Balthazar    0.999957
Name: Perez Hilton, dtype: float64
Categories of recommended items:
in_commentators     5
in_featured         4
in_athletes         1
in_comedians        1
in_football         1
in_reality_tv       1
in_90_day_fiance    0
in_artists          0
in_animals          0
in_american_idol    0
dtype: int64
Intra-list diversity: 0.000

Recommendations for interests ['in_90_day_fiance', 'in_artists']:
0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
dtype: float64
Interest flags for recommended items:
Empty DataFrame
Columns: [in_90_day_fiance, in_artists]
Index: []

Items with both ['in_90_day_fiance', 'in_artists']: 0

Average Precision@5: 0.999
Coverage: 0.681
Average stars of recommendations: nan (dataset avg: 3.69)
Average reactions of recommendations: nan (dataset avg: 10.07)


In [16]:
da_columns = [col for col in talent.columns if col.startswith("in_")]


In [17]:
# Ensure they are numeric (e.g., 0 or 1)
talent[da_columns] = talent[da_columns].apply(pd.to_numeric, errors='coerce').fillna(0)

In [18]:
X = talent[da_columns].copy()

In [19]:
from sklearn.metrics.pairwise import cosine_similarity