In [22]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import re
import random
from typing import List, Dict, Union

class VegetableMatcher:
    def __init__(self, data_path: str):
        """
        Initialize the VegetableMatcher with dataset path.

        Args:
            data_path (str): Path to the CSV file containing vegetable data
        """
        self.data_path = data_path
        self.data = None
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = None
        self.combined_texts = None
        self.load_and_preprocess_data()

    def load_and_preprocess_data(self):
        """
        Load and preprocess the vegetable data.
        """
        # Load the data
        self.data = pd.read_csv(self.data_path)

        # Clean quantity column (remove 'kg' and convert to numeric)
        self.data['Quantity Available (kg)'] = self.data['Quantity Available (kg)'].str.replace(' kg', '').astype(float)

        # Create combined text feature for matching
        self.data['combined_text'] = self.data['Vegetable Name'] + ' ' + self.data['Vegetable Type']
        self.combined_texts = self.data['combined_text'].tolist()

        # Vectorize the combined text
        self.tfidf_matrix = self.vectorizer.fit_transform(self.combined_texts)

    def preprocess_query(self, query: str) -> str:
        """
        Preprocess user query to match the same preprocessing done on the dataset.

        Args:
            query (str): User's raw query

        Returns:
            str: Preprocessed query
        """
        # Convert to lowercase
        query = query.lower()
        # Remove special characters
        query = re.sub(r'[^a-zA-Z0-9\s]', '', query)
        return query

    def find_matches(self, query: str, top_n: int = 3, min_quantity: float = None, location: str = None) -> List[Dict]:
        """
        Find the best matches for a user query.

        Args:
            query (str): User's query (e.g., "small onion")
            top_n (int): Number of top matches to return
            min_quantity (float): Minimum quantity available filter (in kg)
            location (str): Filter by farmer location

        Returns:
            List[Dict]: List of matching vegetable entries with details
        """
        # Preprocess query
        processed_query = self.preprocess_query(query)

        # Vectorize query
        query_vec = self.vectorizer.transform([processed_query])

        # Calculate cosine similarities
        similarities = cosine_similarity(query_vec, self.tfidf_matrix).flatten()

        # Get indices of top matches
        top_indices = similarities.argsort()[-top_n:][::-1]

        # Prepare results
        results = []
        for idx in top_indices:
            match = self.data.iloc[idx].to_dict()
            match['similarity_score'] = float(similarities[idx])

            # Apply filters if provided
            if min_quantity is not None and match['Quantity Available (kg)'] < min_quantity:
                continue
            if location is not None and location.lower() not in match['Farmer Location'].lower():
                continue

            results.append(match)

        return results[:top_n]  # Ensure we return at most top_n results after filtering

    def evaluate(self, test_cases: List[Dict] = None) -> Dict:
        """
        Evaluate the matcher's performance on test cases.

        Args:
            test_cases (List[Dict]): List of test cases with 'query' and 'expected_match'

        Returns:
            Dict: Evaluation metrics
        """
        if test_cases is None:
            test_cases = self.generate_test_cases()

        y_true = []
        y_pred = []

        for case in test_cases:
            query = case['query']
            expected = case['expected_match']

            # Get top match
            matches = self.find_matches(query, top_n=1)
            if not matches:
                predicted = None
            else:
                predicted = matches[0]['combined_text']

            y_true.append(expected)
            y_pred.append(predicted)

        # Calculate accuracy (exact match)
        accuracy = np.mean([1 if true == pred else 0 for true, pred in zip(y_true, y_pred)])

        # Generate classification report
        unique_labels = list(set(y_true + y_pred))
        report = classification_report(y_true, y_pred, labels=unique_labels, output_dict=True, zero_division=0)

        # Generate confusion matrix
        cm = confusion_matrix(y_true, y_pred, labels=unique_labels)

        return {
            'accuracy': accuracy,
            'report': report,
            'confusion_matrix': cm,
            'labels': unique_labels
        }

    def generate_test_cases(self, n: int = 20) -> List[Dict]:
        """
        Generate test cases by sampling from the dataset.

        Args:
            n (int): Number of test cases to generate

        Returns:
            List[Dict]: Generated test cases
        """
        test_cases = []
        sampled = self.data.sample(n)

        for _, row in sampled.iterrows():
            # Create variations of the query
            combined = row['combined_text']
            parts = combined.split()

            # Create different query variations
            if len(parts) >= 2:
                # Variation 1: Just the type (e.g., "Small Onion" -> "small onion")
                query1 = combined.lower()
                # Variation 2: Partial (e.g., "Small Onion" -> "small")
                query2 = parts[0].lower()
                # Variation 3: Reordered (e.g., "Small Onion" -> "onion small")
                query3 = ' '.join([parts[1].lower(), parts[0].lower()])

                test_cases.append({'query': query1, 'expected_match': combined})
                test_cases.append({'query': query2, 'expected_match': combined})
                test_cases.append({'query': query3, 'expected_match': combined})

        return test_cases

def print_evaluation_results(results: Dict):
    """
    Print evaluation results in a readable format.

    Args:
        results (Dict): Evaluation results from evaluate() method
    """
    print(f"Accuracy: {results['accuracy']:.2%}")
    print("\nClassification Report:")
    print(pd.DataFrame(results['report']).transpose())

    print("\nConfusion Matrix:")
    cm_df = pd.DataFrame(results['confusion_matrix'],
                         index=results['labels'],
                         columns=results['labels'])
    print(cm_df)

def print_matches(matches: List[Dict]):
    """
    Print matching results in a readable format.

    Args:
        matches (List[Dict]): List of matches from find_matches()
    """
    if not matches:
        print("No matches found.")
        return

    print("\nBest Matches:")
    for i, match in enumerate(matches, 1):
        print(f"\nMatch #{i}:")
        print(f"Vegetable: {match['Vegetable Name']} ({match['Vegetable Type']})")
        print(f"Farmer: {match['Farmer Name']} from {match['Farmer Location']}")
        print(f"Contact: {match['Contact Number']}")
        print(f"Quantity Available: {match['Quantity Available (kg)']} kg")
        print(f"Similarity Score: {match['similarity_score']:.2f}")

def main():
    # Initialize the matcher
    matcher = VegetableMatcher('smart_matching_vegetables_updated.csv')

    # Evaluate the matcher
    print("Evaluating the matcher...")
    evaluation_results = matcher.evaluate()
    print_evaluation_results(evaluation_results)

    # Test with some example queries
    test_queries = [
        "small onion",
        "crispy beetroot",
        "desi cauliflower",
        "green bottle gourd",
        "mature tomato 40 kg",
        "spicy brinjal in rampur"
    ]

    for query in test_queries:
        print(f"\n{'='*50}")
        print(f"Query: '{query}'")

        # Extract quantity and location from query if present
        quantity = None
        location = None

        # Check for quantity pattern (e.g., "10 kg", "5kg")
        quantity_match = re.search(r'(\d+)\s*kg', query, re.IGNORECASE)
        if quantity_match:
            quantity = float(quantity_match.group(1))
            query = re.sub(r'\d+\s*kg', '', query, flags=re.IGNORECASE).strip()

        # Check for location pattern (e.g., "in pune", "from rampur")
        location_match = re.search(r'(in|from)\s+([a-zA-Z]+)', query, re.IGNORECASE)
        if location_match:
            location = location_match.group(2)
            query = re.sub(r'(in|from)\s+[a-zA-Z]+', '', query, flags=re.IGNORECASE).strip()

        # Find matches
        matches = matcher.find_matches(query, top_n=3, min_quantity=quantity, location=location)
        print_matches(matches)
    return matches

if __name__ == "__main__":
    main()

Evaluating the matcher...
Accuracy: 70.00%

Classification Report:
                                      precision    recall  f1-score  support
Corn Sweet Corn                        0.000000  0.000000  0.000000      0.0
Beetroot Hybrid Beetroot               1.000000  0.666667  0.800000      3.0
Broccoli Green Broccoli                0.000000  0.000000  0.000000      0.0
Bitter Gourd Mature Bitter Gourd       0.000000  0.000000  0.000000      0.0
Cauliflower Desi Cauliflower           1.000000  0.666667  0.800000      3.0
Lettuce Spicy Lettuce                  0.000000  0.000000  0.000000      0.0
Chili Spicy Chili                      1.000000  1.000000  1.000000      3.0
Brinjal Spicy Brinjal                  1.000000  1.000000  1.000000      3.0
Bottle Gourd Sweet Bottle Gourd        0.000000  0.000000  0.000000      0.0
Mustard Greens Tender Mustard Greens   0.000000  0.000000  0.000000      0.0
Bell Pepper Red Bell Pepper            1.000000  0.333333  0.500000      3.0
Potato Sp

In [23]:
matches = main()

import os
import json

# Ensure the output directory exists
os.makedirs("output", exist_ok=True)

# Prepare data
output_data = [
    {
        "vegetable": item['Vegetable Name'], 
        "type": item['Vegetable Type'], 
        "match": item['combined_text'], 
        "score": item['similarity_score']
    } for item in matches
]

# Save to JSON
with open("output/smart_match_results.json", "w") as f:
    json.dump(output_data, f, indent=4)

print("✅ JSON file successfully saved to 'output/smart_match_results.json'.")


Evaluating the matcher...
Accuracy: 76.67%

Classification Report:
                                      precision    recall  f1-score    support
Potato Small Potato                    1.000000  0.666667  0.800000   3.000000
Potato Black Potato                    1.000000  0.666667  0.800000   3.000000
Potato Desi Potato                     1.000000  0.666667  0.800000   3.000000
Cauliflower Black Cauliflower          1.000000  0.666667  0.800000   3.000000
Ginger Sweet Ginger                    0.000000  0.000000  0.000000   0.000000
Cabbage Wild Cabbage                   1.000000  1.000000  1.000000   3.000000
Bottle Gourd Sweet Bottle Gourd        0.000000  0.000000  0.000000   0.000000
Mustard Greens Tender Mustard Greens   0.000000  0.000000  0.000000   0.000000
Potato Spicy Potato                    0.428571  1.000000  0.600000   3.000000
Garlic White Garlic                    1.000000  1.000000  1.000000   3.000000
Radish Wild Radish                     0.000000  0.000000  0.000

In [27]:
import json

# Your actual match result data from your model
results = [
    {"vegetable": "Tomato", "match": "Retailer1", "score": 0.85},
    {"vegetable": "Potato", "match": "Retailer2", "score": 0.90}
]

# Save to the correct path inside your XAMPP folder
with open("C:/xampp/htdocs/MarkETLinK/output/smart_match_results.json", "w") as f:
    json.dump(results, f, indent=2)
