In [None]:
import pandas as pd
from datetime import datetime
from collections import defaultdict
import numpy as np

# Read the structured Telegram export; the "date" column is parsed as datetimes
df = pd.read_csv(
    "../data/structured_telegram_data.csv",
    parse_dates=["date"],
)

# Optional: coerce the vendor handle and message text columns to str in one pass
df = df.astype({"channel_username": str, "cleaned_text": str})

# Dummy NER model output function (you'll replace this)
def fake_ner_extraction(text):
    """Return candidate prices found in ``text``.

    Placeholder for a real NER model. A candidate is a standalone run of
    2 to 5 digits. Digit runs longer than five characters (for example
    phone numbers) are skipped entirely instead of being chopped into
    several bogus "prices".

    Args:
        text (str): Message text to scan.

    Returns:
        list of int: Matched numbers, in order of appearance. Empty when
        nothing matches.
    """
    import re

    # The lookarounds keep \d{2,5} from matching inside a longer digit run.
    # The 5-digit cap already guarantees every value is below 100_000, so no
    # additional upper-bound filter is needed.
    matches = re.findall(r'(?<!\d)\d{2,5}(?!\d)', text)
    return [int(m) for m in matches]

# Create storage for vendor scores
vendor_scores = []

# Weights of the lending-score formula.
# You can tweak these based on what you value most.
VIEWS_WEIGHT = 0.5
POST_FREQ_WEIGHT = 10  # scaled for visibility
PRICE_WEIGHT = 0.01  # small weight
PRICE_CAP = 10_000  # average price is clamped to this before weighting

# "views" is an optional column, so check for it once up front.
has_views = "views" in df.columns

# Group by vendor (channel)
for vendor, group in df.groupby("channel_username"):
    group = group.sort_values("date")

    # --- Activity: Posting Frequency (posts per week) ---
    # Inclusive day span between the first and last post, counted as at
    # least one full week so short-lived channels are not inflated.
    date_range_days = (group["date"].max() - group["date"].min()).days + 1
    weeks = max(1, date_range_days / 7)
    post_freq = len(group) / weeks

    # --- Engagement: Views (if available) ---
    avg_views = np.nan
    top_post = None
    if has_views:
        views = group["views"]
        avg_views = views.mean()
        # Only look up the top post when at least one view count is present:
        # idxmax() has no valid row label to return for an all-missing column.
        if views.notna().any():
            top_post = group.loc[views.idxmax()]
    top_product_info = {
        "text": top_post["cleaned_text"] if top_post is not None else None,
        "views": top_post["views"] if top_post is not None else None,
    }
    has_avg_views = not np.isnan(avg_views)

    # --- NER Extraction: Average Price ---
    all_prices = []
    for text in group["cleaned_text"]:
        all_prices.extend(fake_ner_extraction(text))

    # None means "no price detected"; 0.0 is a legitimate average.
    avg_price = np.mean(all_prices) if all_prices else None

    # --- Score ---
    score = 0
    if has_avg_views:
        score += avg_views * VIEWS_WEIGHT
    score += post_freq * POST_FREQ_WEIGHT
    if avg_price is not None:
        score += min(avg_price, PRICE_CAP) * PRICE_WEIGHT

    vendor_scores.append({
        "vendor": vendor,
        "posts": len(group),
        "post_freq_per_week": round(post_freq, 2),
        "avg_views": round(avg_views, 2) if has_avg_views else None,
        "top_post_text": top_product_info["text"],
        "top_post_views": top_product_info["views"],
        "avg_price": round(avg_price, 2) if avg_price is not None else None,
        "lending_score": round(score, 2),
    })

# Convert to DataFrame for analysis
score_df = pd.DataFrame(vendor_scores)

# Sort by score, best candidates first
score_df = score_df.sort_values("lending_score", ascending=False)

# Save or display
score_df.to_csv("../data/vendor_lending_scores.csv", index=False)
print(score_df.head())


                    vendor  posts  post_freq_per_week avg_views top_post_text  \
0       @Shageronlinestore   4102               18.74      None          None   
1            @ZemenExpress   4840               17.97      None          None   
4            @nevacomputer   2851                8.26      None          None   
2  @ethio_brand_collection   3245                7.90      None          None   
3       @meneshayeofficial    877                6.34      None          None   

  top_post_views  avg_price  lending_score  
0           None   12668.45         287.43  
1           None   17794.47         279.73  
4           None   11170.71         182.57  
2           None   17418.91         179.04  
3           None   15820.36         163.42  
