## 1. Import Libraries and Initialize

In [2]:
%pip install groq

Collecting groq
  Downloading groq-0.33.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.33.0-py3-none-any.whl (135 kB)
Installing collected packages: groq
Successfully installed groq-0.33.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import os
from groq import Groq
from dotenv import load_dotenv
import time
from tqdm import tqdm

# Load environment variables
load_dotenv()

print("‚úÖ Libraries imported successfully")

‚úÖ Libraries imported successfully


## 2. Initialize Groq API with 70B Model

In [2]:
# Initialize Groq client with 70B model
api_key = os.getenv("GROQ_API_KEY")

if not api_key:
    raise ValueError("GROQ_API_KEY not found! Please add it to your .env file")

client = Groq(api_key=api_key)
model_name = "llama-3.3-70b-versatile"  # Using the powerful 70B model

print(f"‚úÖ Groq API initialized with model: {model_name}")

‚úÖ Groq API initialized with model: llama-3.3-70b-versatile


## 3. Load Neighborhood Data

In [3]:
# Load the CSV data
csv_path = 'istanbul_mahalle_complete_data.csv'
df = pd.read_csv(csv_path)

print(f"‚úÖ Loaded {len(df)} neighborhoods")
print(f"üìä Columns: {df.columns.tolist()}")
print(f"\nüìã Sample data:")
df.head(3)

‚úÖ Loaded 164 neighborhoods
üìä Columns: ['Mahalle', 'ƒ∞l√ße', 'Enlem', 'Boylam', 'INDEX_YASAM_KALITESI', 'INDEX_YURUNEBILIRLIK', 'KULTUREL_AKTIVITE_INDEX', 'restaurant', 'library', 'school', 'park', 'atm', 'cafe', 'pharmacy', 'hospital', 'mosque', 'bus_station', 'train_station', 'transit_station', 'Toplam Ge√ßerli Oy', 'Toplam Ge√ßersiz Oy', 'CHP', 'AK PARTƒ∞', 'SAADET', 'VATAN PARTƒ∞Sƒ∞', 'N√ºfus', 'mahalle_uavt', '1980_oncesi', '1980-2000_arasi', '2000_sonrasi', '1-4 kat_arasi', '5-9 kat_arasi', '9-19 kat_arasi', 'mahalle_koy_uavt', 'cok_agir_hasarli_bina_sayisi', 'agir_hasarli_bina_sayisi', 'orta_hasarli_bina_sayisi', 'hafif_hasarli_bina_sayisi', 'can_kaybi_sayisi', 'agir_yarali_sayisi', 'hastanede_tedavi_sayisi', 'hafif_yarali_sayisi', 'dogalgaz_boru_hasari', 'icme_suyu_boru_hasari', 'atik_su_boru_hasari', 'gecici_barinma', 'Avg_Rent_Per_SqM', 'Green_Index', 'Society_Welfare_Index']

üìã Sample data:


Unnamed: 0,Mahalle,ƒ∞l√ße,Enlem,Boylam,INDEX_YASAM_KALITESI,INDEX_YURUNEBILIRLIK,KULTUREL_AKTIVITE_INDEX,restaurant,library,school,...,agir_yarali_sayisi,hastanede_tedavi_sayisi,hafif_yarali_sayisi,dogalgaz_boru_hasari,icme_suyu_boru_hasari,atik_su_boru_hasari,gecici_barinma,Avg_Rent_Per_SqM,Green_Index,Society_Welfare_Index
0,Balmumcu,Be≈üikta≈ü,41.059527,29.015073,,,,0,0,13,...,,,,,,,,560,0.93,1.0
1,Bebek,Be≈üikta≈ü,41.07897,29.043979,,,,11,1,4,...,,,,,,,,560,0.93,1.0
2,K√ºlt√ºr,Be≈üikta≈ü,41.072961,29.032796,,,,13,0,2,...,,,,,,,,560,0.93,1.0


In [6]:
df.columns

Index(['Mahalle', 'ƒ∞l√ße', 'Enlem', 'Boylam', 'INDEX_YASAM_KALITESI',
       'INDEX_YURUNEBILIRLIK', 'KULTUREL_AKTIVITE_INDEX', 'restaurant',
       'library', 'school', 'park', 'atm', 'cafe', 'pharmacy', 'hospital',
       'mosque', 'bus_station', 'train_station', 'transit_station',
       'Toplam Ge√ßerli Oy', 'Toplam Ge√ßersiz Oy', 'CHP', 'AK PARTƒ∞', 'SAADET',
       'VATAN PARTƒ∞Sƒ∞', 'N√ºfus', 'mahalle_uavt', '1980_oncesi',
       '1980-2000_arasi', '2000_sonrasi', '1-4 kat_arasi', '5-9 kat_arasi',
       '9-19 kat_arasi', 'mahalle_koy_uavt', 'cok_agir_hasarli_bina_sayisi',
       'agir_hasarli_bina_sayisi', 'orta_hasarli_bina_sayisi',
       'hafif_hasarli_bina_sayisi', 'can_kaybi_sayisi', 'agir_yarali_sayisi',
       'hastanede_tedavi_sayisi', 'hafif_yarali_sayisi',
       'dogalgaz_boru_hasari', 'icme_suyu_boru_hasari', 'atik_su_boru_hasari',
       'gecici_barinma', 'Avg_Rent_Per_SqM', 'Green_Index',
       'Society_Welfare_Index'],
      dtype='object')

## 4. Create Description Generator Function

In [4]:
def generate_mahalle_description(row, client, model_name):
    """
    Generate a rich description for a neighborhood using Groq API
    
    Args:
        row: DataFrame row with neighborhood data
        client: Groq client instance
        model_name: Model to use for generation
        
    Returns:
        Tuple: (description, keywords)
    """
    
    # Extract key features
    mahalle = row['Mahalle']
    ilce = row['ƒ∞l√ße']
    
    # Amenities
    restaurants = row.get('restaurant', 0)
    schools = row.get('school', 0)
    parks = row.get('park', 0)
    cafes = row.get('cafe', 0)
    hospitals = row.get('hospital', 0)
    mosques = row.get('mosque', 0)
    
    # Indices
    green_index = row.get('Green_Index', 0)
    welfare_index = row.get('Society_Welfare_Index', 0)
    rent_per_sqm = row.get('Avg_Rent_Per_SqM', 0)
    
    # Population
    population = row.get('N√ºfus', 0)
    
    # Create prompt for LLM
    prompt = f"""Generate a detailed, information-rich description for this Istanbul neighborhood optimized for semantic search.

Neighborhood: {mahalle}, {ilce}

Key Statistics:
- Quality of Life Index: {row.get('INDEX_YASAM_KALITESI', 0):.2f}
- Walkability Index: {row.get('INDEX_YURUNEBILIRLIK', 0):.2f}
- Cultural Activity Index: {row.get('KULTUREL_AKTIVITE_INDEX', 0):.2f}
- Green Index: {green_index:.2f}
- Welfare Index: {welfare_index:.2f}
- Population: {population:,.0f}
- Rent: {rent_per_sqm:.0f} TL/sqm

Amenities:
- {restaurants} restaurants, {cafes} cafes
- {schools} schools, {row.get('library', 0)} libraries
- {parks} parks (green spaces)
- {hospitals} hospitals, {row.get('pharmacy', 0)} pharmacies
- {mosques} mosques
- Transit: {row.get('bus_station', 0)} bus stations, {row.get('train_station', 0)} train stations

Building Age Distribution:
- Pre-1980: {row.get('1980_oncesi', 0)} buildings
- 1980-2000: {row.get('1980-2000_arasi', 0)} buildings
- Post-2000: {row.get('2000_sonrasi', 0)} buildings

Write a comprehensive 4-5 sentence description that:
1. Characterizes the neighborhood's atmosphere and lifestyle
2. Mentions who would enjoy living here (families, young professionals, retirees, students)
3. Highlights accessibility and transportation
4. Notes distinctive features (historic, modern, green, cultural, commercial)
5. Includes comparative language (e.g., "quieter than," "more affordable than," "as vibrant as")

Use natural language that users might search for. Be specific and objective.

Description:"""

    try:
        # STEP 1: Generate description
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert Istanbul real estate writer who creates engaging neighborhood descriptions."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            model=model_name,
            temperature=0.5,
            max_tokens=512
        )
        
        description = chat_completion.choices[0].message.content.strip()
        
        # STEP 2: Generate keywords based on the description
        keywords_prompt = f"""Based on this neighborhood data, generate 8-10 search keywords/phrases that users might use to find this neighborhood.

Neighborhood: {mahalle}, {ilce}
Description: {description}

Include keywords about:
- Lifestyle (family-friendly, vibrant nightlife, quiet, etc.)
- Demographics (young professionals, students, families)
- Character (historic, modern, green, commercial, residential)
- Accessibility (metro access, walkable, etc.)
- Price range (affordable, mid-range, upscale)

Format: comma-separated keywords only, no extra text.

Keywords:"""

        keywords_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert at creating searchable keywords for neighborhoods."
                },
                {
                    "role": "user",
                    "content": keywords_prompt
                }
            ],
            model=model_name,
            temperature=0.5,  # Less creative for keywords
            max_tokens=150
        )
        
        keywords = keywords_completion.choices[0].message.content.strip()
        
        return description, keywords
        
    except Exception as e:
        print(f"‚ùå Error generating description for {mahalle}: {e}")
        return f"{mahalle} is a neighborhood in {ilce}, Istanbul.", "neighborhood, Istanbul"

print("‚úÖ Description generator function created")

‚úÖ Description generator function created


## 5. Test with One Neighborhood First

In [12]:
# Test with the first neighborhood
test_row = df.iloc[0]
print(f"üß™ Testing with: {test_row['Mahalle']}, {test_row['ƒ∞l√ße']}")
print("=" * 70)

test_description = generate_mahalle_description(test_row, client, model_name)
print(f"\nüìù Generated Description:\n{test_description}")
print("=" * 70)

üß™ Testing with: Balmumcu, Be≈üikta≈ü

üìù Generated Description:
("Balmumcu, located in the Be≈üikta≈ü district of Istanbul, is a tranquil and family-friendly neighborhood that offers a high quality of life, as evidenced by its perfect Welfare Index score of 1.00. This charming area is ideal for families and retirees who value a peaceful atmosphere, abundant green spaces, and easy access to essential amenities, including hospitals, pharmacies, and schools. With its two parks and numerous green areas, Balmumcu boasts a impressive Green Index score of 0.93, making it a quieter and more natural oasis compared to other Istanbul neighborhoods. While it may not be as vibrant as some of Istanbul's more commercial areas, Balmumcu is still conveniently connected to the city via its bus station, providing a relatively affordable and laid-back lifestyle, with rent prices averaging 560 TL/sqm, which is more affordable than many other neighborhoods in the city. Overall, Balmumcu's unique blend 

In [None]:
import chromadb
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
import shutil

# Check if chroma_db exists
if not os.path.exists("./chroma_db"):
    print("‚ùå ChromaDB not found!")
    print("? Please run: python vector_db_creation.py")
    raise FileNotFoundError("chroma_db directory not found")

print("üîç Attempting to connect to ChromaDB...")

try:
    # Try to connect with a fresh client
    chroma_client = chromadb.PersistentClient(path="./chroma_db")
    
    # Try to list collections - this is where the error occurs
    collections = chroma_client.list_collections()
    
    print(f"‚úÖ Connected to ChromaDB successfully!")
    print(f"üìä Found {len(collections)} collection(s)")
    
    if collections:
        collection = collections[0]
        print(f"‚úÖ Using collection: '{collection.name}'")
        print(f"üìä Total documents: {collection.count()}")
    else:
        print("‚ö†Ô∏è  No collections found in database")
        print("üí° Please run: python vector_db_creation.py")
        raise ValueError("No collections in ChromaDB")
        
except KeyError as e:
    print(f"‚ùå ChromaDB corruption detected: {e}")
    print("\n? Your ChromaDB appears to be corrupted (likely version mismatch)")
    print("\nüí° SOLUTION:")
    print("   1. Delete the chroma_db folder")
    print("   2. Run: python vector_db_creation.py")
    print("   3. Then come back and run this cell again")
    print("\nüìù To delete, run in terminal:")
    print("   Remove-Item -Recurse -Force chroma_db")
    raise
    
except Exception as e:
    print(f"‚ùå Unexpected error: {type(e).__name__}: {e}")
    raise

print("=" * 70)

### üîß Fix Corrupted ChromaDB (Run this if you got the KeyError above)

In [None]:
# ONLY RUN THIS IF YOU GOT THE KeyError ABOVE!
# This will delete and recreate your vector database

import shutil
import os

# Backup old database (just in case)
if os.path.exists("./chroma_db"):
    print("üóëÔ∏è  Removing corrupted ChromaDB...")
    shutil.rmtree("./chroma_db")
    print("‚úÖ Old database removed")
else:
    print("‚ÑπÔ∏è  No existing database found")

# Now recreate it
print("\nüî® Recreating vector database...")
print("üí° Running vector_db_creation.py...")

# Run the creation script
import subprocess
result = subprocess.run(
    ["python", "vector_db_creation.py"],
    capture_output=True,
    text=True
)

print(result.stdout)
if result.returncode == 0:
    print("‚úÖ Vector database recreated successfully!")
    print("üí° Now go back and run the cell above again")
else:
    print(f"‚ùå Error: {result.stderr}")
    raise RuntimeError("Failed to recreate vector database")

In [None]:
# Get all embeddings from ChromaDB
results = collection.get(
    include=['embeddings', 'metadatas']
)

embeddings = np.array(results['embeddings'])
metadatas = results['metadatas']

print(f"‚úÖ Retrieved {len(embeddings)} embeddings")
print(f"üìä Embedding dimension: {embeddings.shape[1]}")
print("=" * 70)

In [None]:
# Calculate pairwise cosine similarity
print("üîç Calculating pairwise similarity matrix...")
sim_matrix = cosine_similarity(embeddings)

print(f"‚úÖ Similarity matrix shape: {sim_matrix.shape}")
print("=" * 70)

In [None]:
# Calculate average similarity (excluding diagonal - self-similarity)
n = len(embeddings)
avg_similarity = (sim_matrix.sum() - n) / (n * (n - 1))

print("üìä EMBEDDING DIVERSITY ANALYSIS")
print("=" * 70)
print(f"Average pairwise similarity: {avg_similarity:.3f}")
print()
print("üìà Interpretation:")
if avg_similarity < 0.3:
    print("   ‚úÖ EXCELLENT - Very diverse embeddings, neighborhoods are highly distinct")
elif avg_similarity < 0.5:
    print("   ‚úÖ GOOD - Diverse embeddings, neighborhoods are distinguishable")
elif avg_similarity < 0.7:
    print("   ‚ö†Ô∏è  MODERATE - Some overlap, but still usable")
else:
    print("   ‚ùå BAD - Too similar, neighborhoods are not well-differentiated")

print("=" * 70)

In [None]:
# Additional statistics
print("üìä DETAILED STATISTICS")
print("=" * 70)

# Get upper triangle (excluding diagonal)
upper_triangle = sim_matrix[np.triu_indices_from(sim_matrix, k=1)]

print(f"Min similarity: {upper_triangle.min():.3f}")
print(f"Max similarity: {upper_triangle.max():.3f}")
print(f"Median similarity: {np.median(upper_triangle):.3f}")
print(f"Std deviation: {upper_triangle.std():.3f}")
print()

# Distribution
print("üìä Similarity Distribution:")
print(f"   < 0.3 (Very different): {(upper_triangle < 0.3).sum():,} pairs ({(upper_triangle < 0.3).sum()/len(upper_triangle)*100:.1f}%)")
print(f"   0.3-0.5 (Different): {((upper_triangle >= 0.3) & (upper_triangle < 0.5)).sum():,} pairs ({((upper_triangle >= 0.3) & (upper_triangle < 0.5)).sum()/len(upper_triangle)*100:.1f}%)")
print(f"   0.5-0.7 (Similar): {((upper_triangle >= 0.5) & (upper_triangle < 0.7)).sum():,} pairs ({((upper_triangle >= 0.5) & (upper_triangle < 0.7)).sum()/len(upper_triangle)*100:.1f}%)")
print(f"   > 0.7 (Very similar): {(upper_triangle >= 0.7).sum():,} pairs ({(upper_triangle >= 0.7).sum()/len(upper_triangle)*100:.1f}%)")

print("=" * 70)

In [None]:
# Find most similar pairs (potential duplicates or very similar neighborhoods)
print("üîç TOP 10 MOST SIMILAR NEIGHBORHOOD PAIRS")
print("=" * 70)

# Get indices of top similarities (excluding diagonal)
np.fill_diagonal(sim_matrix, -1)  # Exclude self-similarity
top_indices = np.argsort(sim_matrix.flatten())[-10:][::-1]
top_pairs = np.unravel_index(top_indices, sim_matrix.shape)

for i, (idx1, idx2) in enumerate(zip(top_pairs[0], top_pairs[1]), 1):
    similarity = sim_matrix[idx1, idx2]
    mahalle1 = metadatas[idx1].get('mahalle', 'Unknown')
    ilce1 = metadatas[idx1].get('ilce', 'Unknown')
    mahalle2 = metadatas[idx2].get('mahalle', 'Unknown')
    ilce2 = metadatas[idx2].get('ilce', 'Unknown')
    
    print(f"{i}. {mahalle1}, {ilce1} ‚Üî {mahalle2}, {ilce2}")
    print(f"   Similarity: {similarity:.3f}")
    print()

print("=" * 70)

In [None]:
# Find most diverse pairs (very different neighborhoods)
print("üîç TOP 10 MOST DIVERSE NEIGHBORHOOD PAIRS")
print("=" * 70)

# Get indices of lowest similarities
bottom_indices = np.argsort(sim_matrix.flatten())[:10]
bottom_pairs = np.unravel_index(bottom_indices, sim_matrix.shape)

for i, (idx1, idx2) in enumerate(zip(bottom_pairs[0], bottom_pairs[1]), 1):
    # Skip if it's the diagonal we marked as -1
    if sim_matrix[idx1, idx2] < 0:
        continue
        
    similarity = sim_matrix[idx1, idx2]
    mahalle1 = metadatas[idx1].get('mahalle', 'Unknown')
    ilce1 = metadatas[idx1].get('ilce', 'Unknown')
    mahalle2 = metadatas[idx2].get('mahalle', 'Unknown')
    ilce2 = metadatas[idx2].get('ilce', 'Unknown')
    
    print(f"{i}. {mahalle1}, {ilce1} ‚Üî {mahalle2}, {ilce2}")
    print(f"   Similarity: {similarity:.3f} (very different!)")
    print()

print("=" * 70)

In [None]:
# Visualize similarity distribution
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 5))

# Histogram
plt.subplot(1, 2, 1)
plt.hist(upper_triangle, bins=50, edgecolor='black', alpha=0.7)
plt.axvline(avg_similarity, color='red', linestyle='--', linewidth=2, label=f'Mean: {avg_similarity:.3f}')
plt.xlabel('Cosine Similarity')
plt.ylabel('Frequency')
plt.title('Distribution of Pairwise Similarities')
plt.legend()
plt.grid(True, alpha=0.3)

# Box plot
plt.subplot(1, 2, 2)
plt.boxplot(upper_triangle, vert=True)
plt.ylabel('Cosine Similarity')
plt.title('Similarity Statistics')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("‚úÖ Visualization complete!")

## 6. Generate Descriptions for All Neighborhoods

‚ö†Ô∏è **Note:** This will make API calls for all neighborhoods. With 164 neighborhoods, this will take some time.

Groq API rate limits:
- Free tier: ~30 requests per minute
- So for 164 neighborhoods: ~5-10 minutes

In [5]:
# Generate descriptions AND keywords for all neighborhoods
descriptions = []
keywords_list = []  # Add this!
failed_count = 0

print(f"üöÄ Generating descriptions for {len(df)} neighborhoods...")
print("=" * 70)

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Generating"):
    try:
        # Unpack both return values
        description, keywords = generate_mahalle_description(row, client, model_name)
        descriptions.append(description)
        keywords_list.append(keywords)  # Store keywords too!
        
        # Rate limiting - be nice to the API
        if (idx + 1) % 20 == 0:  # Every 20 requests
            print(f"\n‚úÖ Processed {idx + 1}/{len(df)} neighborhoods. Pausing 2 seconds...")
            time.sleep(2)
        else:
            time.sleep(0.3)  # Increase delay (you're making 2 API calls now)
            
    except Exception as e:
        print(f"\n‚ùå Failed for row {idx}: {e}")
        descriptions.append(f"{row['Mahalle']} is a neighborhood in {row['ƒ∞l√ße']}, Istanbul.")
        keywords_list.append("neighborhood, Istanbul")  # Default keywords
        failed_count += 1
        time.sleep(1)  # Wait longer after error

print("=" * 70)
print(f"\n‚úÖ Generation complete!")
print(f"üìä Successfully generated: {len(descriptions) - failed_count}/{len(df)}")
print(f"‚ùå Failed: {failed_count}")

# Add BOTH columns to DataFrame
df['Description'] = descriptions
df['Keywords'] = keywords_list  # Add keywords column!

print("‚úÖ Descriptions and keywords added to DataFrame")
print(f"üìä DataFrame shape: {df.shape}")
print(f"\nüìã Sample descriptions:")
print("=" * 70)

# Show a few examples with keywords
for i in range(min(3, len(df))):
    print(f"\nüèòÔ∏è {df.iloc[i]['Mahalle']}, {df.iloc[i]['ƒ∞l√ße']}")
    print(f"üìù Description: {df.iloc[i]['Description'][:200]}...")  # First 200 chars
    print(f"üè∑Ô∏è Keywords: {df.iloc[i]['Keywords']}")
    print("-" * 70)

# SAVE TO CSV
output_file = 'neighborhoods_with_descriptions.csv'
df.to_csv(output_file, index=False, encoding='utf-8-sig')  # utf-8-sig for Turkish characters
print(f"\nüíæ Saved to: {output_file}")

# Optional: Save a backup with timestamp
from datetime import datetime
backup_file = f'neighborhoods_backup_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
df.to_csv(backup_file, index=False, encoding='utf-8-sig')
print(f"üíæ Backup saved to: {backup_file}")

üöÄ Generating descriptions for 164 neighborhoods...


Generating:  12%|‚ñà‚ñè        | 19/164 [00:32<04:09,  1.72s/it]


‚úÖ Processed 20/164 neighborhoods. Pausing 2 seconds...


Generating:  24%|‚ñà‚ñà‚ñç       | 39/164 [01:07<03:31,  1.69s/it]


‚úÖ Processed 40/164 neighborhoods. Pausing 2 seconds...


Generating:  36%|‚ñà‚ñà‚ñà‚ñå      | 59/164 [01:41<02:58,  1.70s/it]


‚úÖ Processed 60/164 neighborhoods. Pausing 2 seconds...


Generating:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 79/164 [02:19<02:19,  1.64s/it]


‚úÖ Processed 80/164 neighborhoods. Pausing 2 seconds...


Generating:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 99/164 [02:53<01:46,  1.65s/it]


‚úÖ Processed 100/164 neighborhoods. Pausing 2 seconds...


Generating:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 119/164 [03:30<01:16,  1.71s/it]


‚úÖ Processed 120/164 neighborhoods. Pausing 2 seconds...


Generating:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 139/164 [04:31<00:41,  1.67s/it]


‚úÖ Processed 140/164 neighborhoods. Pausing 2 seconds...


Generating:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 159/164 [05:14<00:13,  2.61s/it]


‚úÖ Processed 160/164 neighborhoods. Pausing 2 seconds...


Generating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 164/164 [05:41<00:00,  2.08s/it]


‚úÖ Generation complete!
üìä Successfully generated: 164/164
‚ùå Failed: 0
‚úÖ Descriptions and keywords added to DataFrame
üìä DataFrame shape: (164, 51)

üìã Sample descriptions:

üèòÔ∏è Balmumcu, Be≈üikta≈ü
üìù Description: Balmumcu, located in the Be≈üikta≈ü district of Istanbul, is a tranquil and family-friendly neighborhood that offers a high quality of life, as evidenced by its perfect Welfare Index score of 1.00. This...
üè∑Ô∏è Keywords: family-friendly neighborhoods in Istanbul, quiet neighborhoods in Be≈üikta≈ü, affordable areas in Istanbul, green spaces in Balmumcu, residential areas in Be≈üikta≈ü, tranquil lifestyle in Istanbul, family-friendly areas in Turkey, affordable housing in Be≈üikta≈ü, peaceful neighborhoods in Istanbul, mid-range rent in Istanbul
----------------------------------------------------------------------

üèòÔ∏è Bebek, Be≈üikta≈ü
üìù Description: Bebek, located in the Be≈üikta≈ü district, is a charming and upscale neighborhood that offers a un




## 7. Add Descriptions to DataFrame

In [None]:
# Add descriptions as a new column
df['Description'] = descriptions

print("‚úÖ Descriptions added to DataFrame")
print(f"üìä DataFrame shape: {df.shape}")
print(f"\nüìã Sample descriptions:")
print("=" * 70)

# Show a few examples
for i in range(min(3, len(df))):
    print(f"\nüèòÔ∏è {df.iloc[i]['Mahalle']}, {df.iloc[i]['ƒ∞l√ße']}")
    print(f"üìù {df.iloc[i]['Description']}")

    print("-" * 70)

## 8. Save to CSV

In [None]:
# Save the enriched data
output_path = 'istanbul_mahalle_complete_data_with_descriptions_70B.csv'
df.to_csv(output_path, index=False, encoding='utf-8-sig')

print(f"‚úÖ Data saved to: {output_path}")
print(f"üìä Total rows: {len(df)}")
print(f"üìä Total columns: {len(df.columns)}")
print(f"\nüéâ All done! You can now use this enriched dataset with rich descriptions.")

## 9. (Optional) View Statistics

In [None]:
# Statistics about the descriptions
print("üìä Description Statistics")
print("=" * 70)

# Description lengths
df['Description_Length'] = df['Description'].str.len()
print(f"Average description length: {df['Description_Length'].mean():.0f} characters")
print(f"Shortest description: {df['Description_Length'].min()} characters")
print(f"Longest description: {df['Description_Length'].max()} characters")

# Words per description
df['Description_Words'] = df['Description'].str.split().str.len()
print(f"\nAverage words per description: {df['Description_Words'].mean():.0f} words")
print(f"Shortest: {df['Description_Words'].min()} words")
print(f"Longest: {df['Description_Words'].max()} words")

print("\n" + "=" * 70)
print("üéØ Ready to use for vector database creation!")