In [2]:
# Mount Google Drive at /content/drive (Colab runtime) so files stored on Drive
# can be opened through the normal filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd drive/MyDrive/


/content/drive/MyDrive


In [5]:
!pip install -q sentence-transformers pandas numpy

import pandas as pd
import numpy as np
import json
from sentence_transformers import SentenceTransformer



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
# Load brand campaigns.
# The files are opened as UTF-8 explicitly so that non-ASCII text decodes the
# same way regardless of the runtime's locale default.
with open('/content/drive/MyDrive/indian_synthetic_brands.json', 'r', encoding='utf-8') as f:
    brands_json = json.load(f)
brands_df = pd.DataFrame(brands_json)

# Load influencer profiles
with open('/content/drive/MyDrive/indian_synthetic_influencer_profiles.json', 'r', encoding='utf-8') as f:
    infl_json = json.load(f)
influencers_df = pd.DataFrame(infl_json)


In [7]:
def _is_missing(value):
    """Return True for None and float NaN (how pandas fills keys absent from a record)."""
    return value is None or (isinstance(value, float) and value != value)


def _scalar_field(row, key, as_text=False):
    """Fetch ``row[key]``; missing values become '' and, if ``as_text``, others are str()-ed."""
    value = row.get(key, '')
    if _is_missing(value):
        return ''
    return str(value) if as_text else value


def _joined_field(row, key, sep):
    """Fetch a list-like ``row[key]`` and join its items with ``sep``.

    Missing values yield ''. A plain string is returned unchanged instead of
    being split into characters, and a non-iterable value is str()-ed.
    """
    value = row.get(key, [])
    if _is_missing(value):
        return ''
    if isinstance(value, str):
        return value
    try:
        items = iter(value)
    except TypeError:
        return str(value)
    return sep.join(str(item) for item in items if not _is_missing(item))


def brand_text(row):
    """Build the text that is embedded for one brand campaign.

    Args:
        row: Mapping-like object with a ``.get(key, default)`` method
            (a DataFrame row or a plain dict).

    Returns:
        The non-empty fields joined with ' | ', in the order title,
        description, category, platforms (space-separated), budget,
        targetAudience, requirements, metadata. Fields that are absent,
        None or NaN are omitted rather than rendered as 'None' / 'nan'.
    """
    fields = [
        _scalar_field(row, 'title'),
        _scalar_field(row, 'description'),
        _scalar_field(row, 'category'),
        _joined_field(row, 'platforms', ' '),
        _scalar_field(row, 'budget', as_text=True),
        _scalar_field(row, 'targetAudience', as_text=True),
        _scalar_field(row, 'requirements', as_text=True),
        _scalar_field(row, 'metadata', as_text=True)
    ]
    # Falsy entries (empty strings) are dropped so no blank segments appear.
    return ' | '.join(str(f) for f in fields if f)

def influencer_text(row):
    """Build the text that is embedded for one influencer profile.

    Args:
        row: Mapping-like object with a ``.get(key, default)`` method
            (a DataFrame row or a plain dict).

    Returns:
        The non-empty fields joined with ' | ', in the order name, bio,
        categories, platforms, location, languages, age, gender,
        brandCollabs, experienceLevel. List fields are comma-separated.
        Fields that are absent, None or NaN are omitted rather than
        rendered as 'None' / 'nan'.
    """
    fields = [
        _scalar_field(row, 'name'),
        _scalar_field(row, 'bio'),
        _joined_field(row, 'categories', ', '),
        _joined_field(row, 'platforms', ', '),
        _scalar_field(row, 'location', as_text=True),
        _joined_field(row, 'languages', ', '),
        _scalar_field(row, 'age', as_text=True),
        _scalar_field(row, 'gender', as_text=True),
        _joined_field(row, 'brandCollabs', ', '),
        _scalar_field(row, 'experienceLevel', as_text=True)
    ]
    # Falsy entries (empty strings) are dropped so no blank segments appear.
    return ' | '.join(str(f) for f in fields if f)


In [9]:
# One sentence-embedding model is used for both brands and influencers so that
# their vectors are directly comparable.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Brands: store the per-row text in the frame, then encode it to float32 vectors.
brands_df['embedding_text'] = brands_df.apply(brand_text, axis=1)
brand_embeddings = np.array(
    model.encode(
        brands_df['embedding_text'].tolist(),
        show_progress_bar=True,
    )
).astype(np.float32)

# Influencers: same two steps with the influencer text builder.
influencers_df['embedding_text'] = influencers_df.apply(influencer_text, axis=1)
influencer_embeddings = np.array(
    model.encode(
        influencers_df['embedding_text'].tolist(),
        show_progress_bar=True,
    )
).astype(np.float32)


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [10]:
def match_brands_for_influencer(
        influencer_idx,
        influencer_embeddings,
        brand_embeddings,
        brands_df,
        top_k=5
    ):
    """
    Rank brands against one influencer by cosine similarity of their embeddings.

    Args:
        influencer_idx: Row position of the influencer in ``influencer_embeddings``.
        influencer_embeddings: Array of influencer vectors, indexed by row.
        brand_embeddings: 2-D array of brand vectors; row i is looked up as
            positional row i of ``brands_df``.
        brands_df: DataFrame of brands.
        top_k: Number of best-scoring brands to return.

    Returns:
        The ``top_k`` most similar rows of ``brands_df``, best first, with a
        fresh 0..n-1 index.
    """
    query = influencer_embeddings[influencer_idx]

    # The small constant keeps the division defined when a vector has zero norm.
    denominators = np.linalg.norm(brand_embeddings, axis=1) * np.linalg.norm(query) + 1e-10
    scores = np.dot(brand_embeddings, query) / denominators

    # argsort is ascending, so reverse it to get the highest scores first.
    ranked = np.argsort(scores)[::-1]
    best = ranked[:top_k]
    return brands_df.iloc[best].reset_index(drop=True)


In [11]:
# Row position of the influencer to match (or use a search/filter to pick a specific one).
influencer_idx = 0
matched_brands = match_brands_for_influencer(
    influencer_idx,
    influencer_embeddings,
    brand_embeddings,
    brands_df,
    top_k=5,  # how many matches you want
)

# Columns to include in the printed summary (adjust for your dataset).
display_cols = ['title', 'description', 'category', 'budget', 'targetAudience']
print(f"\nTop Brand Matches for Influencer '{influencers_df.iloc[influencer_idx]['name']}'")
print(matched_brands.loc[:, display_cols])



Top Brand Matches for Influencer 'Simran Kaur'
                         title                             description  \
0                  Morris-Sims        Secured demand-driven moratorium   
1              Moreno and Sons               Operative mobile approach   
2     Camacho, Wells and Keith  Phased exuding artificial intelligence   
3      Kim, Gonzales and Mills                  Diverse high-level hub   
4  Johnston, Griffin and Myers        Visionary systematic parallelism   

  category                                          budget  \
0   Travel  {'min': 1647, 'max': 23242, 'currency': 'INR'}   
1  Fitness  {'min': 2583, 'max': 45033, 'currency': 'INR'}   
2   Gaming  {'min': 3307, 'max': 34719, 'currency': 'INR'}   
3  Fitness   {'min': 732, 'max': 10324, 'currency': 'INR'}   
4   Beauty   {'min': 753, 'max': 33268, 'currency': 'INR'}   

                                      targetAudience  
0  {'ageRange': {'min': 22, 'max': 55}, 'gender':...  
1  {'ageRange': {'min': 