In [2]:
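# Prerequisite: the project repository must be available under /content,
# because later cells read data from /content/Amharic-E-commerce-Data-Extractor/.
# On a fresh runtime, uncomment the next line to clone it first.
# !git clone https://github.com/milki93/Amharic-E-commerce-Data-Extractor.git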

In [3]:
# Mount Google Drive at /content/drive (Colab-only API). The NER checkpoint
# loaded further down is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Library Imports

In [5]:
import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoConfig
import re
import warnings
from datetime import datetime

# Keep cell output readable: ignore every FutureWarning, and ignore UserWarning
# only when it is raised from a module whose name starts with 'transformers'.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, module='transformers')

### NER Model Loading and Configuration

In [6]:
# Labeled CoNLL file that is scanned below to recover the NER tag set.
# The absolute path assumes the project repository sits under /content.
conll_file_path = "/content/Amharic-E-commerce-Data-Extractor/data/labeled_data.conll"

def parse_conll_file_for_labels(file_path):
    """Collect the distinct labels that occur in a two-column CoNLL file.

    Only lines that split on whitespace into exactly two fields are used; the
    second field is taken as the label. Blank lines and lines with any other
    number of fields are skipped.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file to scan.

    Returns:
        The labels in sorted order, except that 'O' (when present) is moved
        to the front of the list.
    """
    seen_labels = set()
    with open(file_path, 'r', encoding='utf-8') as conll_handle:
        for raw_line in conll_handle:
            fields = raw_line.strip().split()
            if len(fields) != 2:
                continue
            seen_labels.add(fields[1])

    ordered_labels = sorted(seen_labels)
    if 'O' in seen_labels:
        # Put the "outside" tag first and keep the rest in sorted order.
        ordered_labels = ['O'] + [tag for tag in ordered_labels if tag != 'O']
    return ordered_labels

# Tag vocabulary recovered from the labeled data, plus both lookup directions.
# NOTE(review): these mappings replace whatever the checkpoint's own config
# stores, so the order produced here presumably has to match the order used
# at training time -- confirm against the training notebook.
label_list = parse_conll_file_for_labels(conll_file_path)
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

best_model_path = "/content/drive/MyDrive/my_ner_models/bert_base_multilingual_cased/checkpoint-30"

print(f"\nLoading best model from: {best_model_path}")
config_ner = AutoConfig.from_pretrained(
    best_model_path,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
model_ner = AutoModelForTokenClassification.from_pretrained(best_model_path, config=config_ner)
tokenizer_ner = AutoTokenizer.from_pretrained(best_model_path)

# Run on the GPU when one is visible, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_ner.eval()
model_ner.to(device)

# aggregation_strategy="simple" makes the pipeline return grouped entities
# (dicts carrying 'entity_group' and 'word') instead of per-token tags.
ner_pipeline = pipeline(
    "ner",
    model=model_ner,
    tokenizer=tokenizer_ner,
    aggregation_strategy="simple",
    device=device,
)


Loading best model from: /content/drive/MyDrive/my_ner_models/bert_base_multilingual_cased/checkpoint-30


Device set to use cpu


### Data Loading and Preprocessing

In [8]:
csv_file_path = "/content/Amharic-E-commerce-Data-Extractor/data/telegram_data.csv"

# Load the scraped posts and normalise them in a single chain:
#   1. rename channel/message_id to vendor_id/post_id,
#   2. parse timestamps (unparseable values become NaT) and drop those rows,
#   3. coerce views to int (non-numeric values count as 0) and text to str.
df_posts = (
    pd.read_csv(csv_file_path)
    .rename(columns={'channel': 'vendor_id', 'message_id': 'post_id'})
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], errors='coerce'))
    .dropna(subset=['timestamp'])
    .assign(
        views=lambda frame: pd.to_numeric(frame['views'], errors='coerce').fillna(0).astype(int),
        text=lambda frame: frame['text'].astype(str),
    )
)


### Vendor Metric Calculation

In [9]:
def extract_entities_from_post(text, ner_pipeline):
    """Run NER on one post and pull out product names and numeric prices.

    Args:
        text: The post text handed to ``ner_pipeline``.
        ner_pipeline: Callable returning an iterable of dicts that each carry
            an 'entity_group' and a 'word' entry.

    Returns:
        A ``(products, prices)`` tuple: ``products`` holds the 'word' of every
        entity whose group contains 'Product'; ``prices`` holds one float per
        entity whose group contains 'PRICE' and whose text yields a parseable
        number. Price entities without a parseable number are skipped.
    """
    # Currency markers and separators stripped from a price entity, in order.
    noise_tokens = ('ብር', 'birr', 'ETB', ' ', '፦')

    product_names = []
    price_values = []
    for entity in ner_pipeline(text):
        group = entity['entity_group']
        if 'Product' in group:
            product_names.append(entity['word'])
            continue
        if 'PRICE' not in group:
            continue

        cleaned = entity['word']
        for token in noise_tokens:
            cleaned = cleaned.replace(token, '')
        cleaned = cleaned.strip()

        found = re.search(r'\d[\d,\.]*', cleaned)
        if found is None:
            continue
        try:
            # Commas are treated as thousands separators.
            price_values.append(float(found.group(0).replace(',', '')))
        except ValueError:
            # e.g. "1.2.3" matches the pattern but is not a valid float.
            continue
    return product_names, price_values

def calculate_vendor_metrics(vendor_posts, ner_pipeline):
    """Aggregate activity, engagement and price metrics for one vendor.

    Every post is passed through ``extract_entities_from_post`` exactly once;
    the entities found in the most-viewed post are reused for the "top
    performing post" fields instead of running NER on that post a second time.

    Args:
        vendor_posts: DataFrame holding the posts of a single vendor. The
            'timestamp', 'views' and 'text' columns are read.
        ner_pipeline: NER callable forwarded to ``extract_entities_from_post``.

    Returns:
        dict with the keys 'Posting Frequency (posts/week)',
        'Average Views per Post', 'Top Performing Post Product',
        'Top Performing Post Price', 'Average Price Point (ETB)',
        'Number of Posts', 'Total Views', 'Total Products Listed' and
        'Total Prices Extracted'. For an empty frame all numeric fields are 0
        and the two top-post fields are 'N/A'. The top-post fields also stay
        'N/A' when no product / price was extracted from the top post.
    """
    if vendor_posts.empty:
        return {
            'Posting Frequency (posts/week)': 0,
            'Average Views per Post': 0,
            'Top Performing Post Product': 'N/A',
            'Top Performing Post Price': 'N/A',
            'Average Price Point (ETB)': 0,
            'Number of Posts': 0,
            'Total Views': 0,
            'Total Products Listed': 0,
            'Total Prices Extracted': 0
        }

    post_count = len(vendor_posts)
    if post_count > 1:
        min_date = vendor_posts['timestamp'].min()
        max_date = vendor_posts['timestamp'].max()
        # Whole days only; a span shorter than one day counts as 0.
        time_span_days = (max_date - min_date).days
        if time_span_days > 0:
            posting_frequency_per_week = (post_count / time_span_days) * 7
        else:
            # All posts fall within one day: scale that day's count to a week.
            posting_frequency_per_week = post_count * 7.0
    else:
        # A single post has no time span to measure; report one post per week.
        posting_frequency_per_week = 1.0

    avg_views_per_post = vendor_posts['views'].mean()
    # Index label of the most-viewed post (first one on ties).
    top_post_index = vendor_posts['views'].idxmax()

    all_prices = []
    all_products = []
    top_post_product = 'N/A'
    top_post_price = 'N/A'

    for row_index, row in vendor_posts.iterrows():
        products, prices = extract_entities_from_post(row['text'], ner_pipeline)
        all_products.extend(products)
        all_prices.extend(prices)
        if row_index == top_post_index:
            # Reuse the entities just extracted for this row rather than
            # invoking the (expensive) NER pipeline again on the same text.
            if products:
                top_post_product = ", ".join(products)
            if prices:
                top_post_price = np.mean(prices)

    avg_price_point = np.mean(all_prices) if all_prices else 0

    return {
        'Posting Frequency (posts/week)': posting_frequency_per_week,
        'Average Views per Post': avg_views_per_post,
        'Top Performing Post Product': top_post_product,
        'Top Performing Post Price': top_post_price,
        'Average Price Point (ETB)': avg_price_point,
        'Number of Posts': post_count,
        'Total Views': vendor_posts['views'].sum(),
        'Total Products Listed': len(all_products),
        'Total Prices Extracted': len(all_prices)
    }

### Lending Score Calculation

In [10]:
def calculate_lending_score(metrics, weight_views=0.5, weight_frequency=0.3,
                            weight_price_point=0.2, price_scaling_factor=0.01):
    """Combine a vendor's metrics into a single weighted lending score.

    The score is the sum of three terms:
    average views * ``weight_views``, posting frequency * ``weight_frequency``
    and average price * ``weight_price_point`` * ``price_scaling_factor``.
    The inputs are used as-is (no normalisation), so with the default weights
    the term built from average views can dominate when view counts are large.

    Args:
        metrics: Mapping providing 'Average Views per Post',
            'Posting Frequency (posts/week)' and 'Average Price Point (ETB)'.
        weight_views: Weight applied to the average views per post.
        weight_frequency: Weight applied to the posting frequency.
        weight_price_point: Weight applied to the average price point.
        price_scaling_factor: Extra multiplier applied to the price term
            before it is added to the score.

    Returns:
        The weighted score as a number.
    """
    views_term = metrics['Average Views per Post'] * weight_views
    frequency_term = metrics['Posting Frequency (posts/week)'] * weight_frequency
    price_term = metrics['Average Price Point (ETB)'] * weight_price_point * price_scaling_factor
    return views_term + frequency_term + price_term

### Scorecard Generation

In [11]:
# One summary record per vendor; turned into a DataFrame after the loop.
vendor_scorecard_data = []

unique_vendors = df_posts['vendor_id'].unique()
print(f"\nProcessing metrics for {len(unique_vendors)} unique vendors...")

for vendor_id in unique_vendors:
    is_vendor_row = df_posts['vendor_id'] == vendor_id
    vendor_posts = df_posts.loc[is_vendor_row].copy()
    metrics = calculate_vendor_metrics(vendor_posts, ner_pipeline)
    lending_score = calculate_lending_score(metrics)

    # The top-post price is either the 'N/A' placeholder or a number to round.
    top_post_price = metrics['Top Performing Post Price']
    if top_post_price != 'N/A':
        top_post_price = round(top_post_price, 2)

    vendor_scorecard_data.append({
        'Vendor ID': vendor_id,
        'Avg. Views/Post': round(metrics['Average Views per Post'], 2),
        'Posts/Week': round(metrics['Posting Frequency (posts/week)'], 2),
        'Avg. Price (ETB)': round(metrics['Average Price Point (ETB)'], 2),
        'Top Post Product': metrics['Top Performing Post Product'],
        'Top Post Price (ETB)': top_post_price,
        'Total Posts': metrics['Number of Posts'],
        'Total Views': metrics['Total Views'],
        'Total Products Listed': metrics['Total Products Listed'],
        'Total Prices Extracted': metrics['Total Prices Extracted'],
        'Lending Score': round(lending_score, 2),
    })

# Highest lending score first, with a fresh 0..n-1 index.
vendor_scorecard_df = (
    pd.DataFrame(vendor_scorecard_data)
    .sort_values(by='Lending Score', ascending=False)
    .reset_index(drop=True)
)

print("\n--- FinTech Vendor Scorecard Summary ---")
print(vendor_scorecard_df.to_markdown(index=False))



Processing metrics for 5 unique vendors...

--- FinTech Vendor Scorecard Summary ---
| Vendor ID               |   Avg. Views/Post |   Posts/Week |   Avg. Price (ETB) | Top Post Product   | Top Post Price (ETB)   |   Total Posts |   Total Views |   Total Products Listed |   Total Prices Extracted |   Lending Score |
|:------------------------|------------------:|-------------:|-------------------:|:-------------------|:-----------------------|--------------:|--------------:|------------------------:|-------------------------:|----------------:|
| @ethio_brand_collection |          27131.8  |         8.64 |            1422.77 | infinity flow      | 3500.0                 |           100 |       2713179 |                     196 |                       65 |        13571.3  |
| @modernshoppingcenter   |           8477.77 |        87.5  |               0    | N/A                | N/A                    |           100 |        847777 |                     151 |                        0 | 

In [12]:
# Save the sorted scorecard as vendor_scorecard.csv in the current working
# directory; index=False keeps the row index out of the file.
vendor_scorecard_df.to_csv("vendor_scorecard.csv", index=False)

In [13]:
!cp "/content/drive/My Drive/Colab Notebooks/fintech_vendor_scorecard.ipynb" /content/Amharic-E-commerce-Data-Extractor/notebooks

cp: cannot stat '/content/drive/My Drive/Colab Notebooks/fintech_vendor_scorecard.ipynb': No such file or directory
