In [10]:
import random
from pathlib import Path

import pandas as pd
from faker import Faker
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [11]:
# Shared Faker generator, constructed with no arguments. generate_bangalore_dataset
# reads this module-level instance to produce each record's id and name.
fake = Faker()

In [None]:
def generate_bangalore_dataset(num_records=1000, seed=None):
    """
    Generate a synthetic dataset of individuals in Bangalore.

    Each record holds a Faker-generated ``id`` and ``name`` plus randomly drawn
    ``age`` (18 to 70 inclusive), ``gender``, ``status``, ``area`` and
    ``is_housed`` values; ``city`` is always ``"Bangalore"``.  Every area in
    ``preferred_areas`` appears five times in the sampling pool, so it is five
    times as likely to be drawn as an area from ``other_areas``.

    Args:
        num_records (int): The number of records to generate.
        seed (int, optional): When given, the random draws come from a private
            ``random.Random(seed)`` and the module-level ``fake`` instance is
            re-seeded with the same value, so the same seed reproduces the
            same records. When ``None`` (the default) the global ``random``
            state and the current state of ``fake`` are used unchanged.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the generated dataset,
        one row per record.
    """
    preferred_areas = [
        "Jayanagar", "Rajajinagar", "Koramangala", "Whitefield",
        "Indiranagar", "Malleshwaram"
    ]
    other_areas = [
        "Marathahalli", "HSR Layout", "BTM Layout", "Basavanagudi",
        "Banashankari", "Electronic City", "Yelahanka", "Hebbal",
        "JP Nagar", "KR Puram"
    ]
    # Repeating the preferred list weights those areas 5:1 in random.choice.
    bangalore_areas = preferred_areas * 5 + other_areas

    if seed is None:
        # Fall back to the module-level generator (original behaviour).
        rng = random
    else:
        rng = random.Random(seed)
        # NOTE: this re-seeds the shared ``fake`` instance, which also affects
        # any later use of it outside this function.
        fake.seed_instance(seed)

    records = [
        {
            "id": fake.uuid4(),
            "name": fake.name(),
            "age": rng.randint(18, 70),
            "gender": rng.choice(["Male", "Female", "Non-binary"]),
            "status": rng.choice(["Poor", "Homeless", "Both"]),
            "city": "Bangalore",
            "area": rng.choice(bangalore_areas),
            "is_housed": rng.choice([True, False]),
        }
        for _ in range(num_records)
    ]
    return pd.DataFrame(records)


In [None]:
# Seed both random sources immediately before generating, so that re-running
# this cell regenerates the same synthetic records instead of a fresh random
# draw each time (the train/test split further down is already pinned with
# random_state=42, which only helps if its input is reproducible too).
SEED = 42
random.seed(SEED)
fake.seed_instance(SEED)

num_records = 100000
dataset = generate_bangalore_dataset(num_records)

In [None]:


"""
Count how many times each unique area appears in the 'area' column of the dataset
area_counts = dataset['area'].value_counts().reset_index()

Rename the columns to make them more readable: 'area' for the area names and 'people_count' for the count of people in each area
area_counts.columns = ['area', 'people_count']
"""
# Per-area record tally as a two-column frame. Naming the index 'area' and
# resetting it with name='people_count' produces the columns
# ['area', 'people_count'] directly, without a separate renaming step.
area_counts = (
    dataset['area']
    .value_counts()
    .rename_axis('area')
    .reset_index(name='people_count')
)


In [15]:
# Attach each area's head-count to every record. A 'people_count' column left
# over from a previous run of this cell is dropped first; otherwise a re-run
# would merge a second copy and pandas would produce 'people_count_x' /
# 'people_count_y', breaking the cells below that read dataset['people_count'].
dataset = (
    dataset
    .drop(columns='people_count', errors='ignore')
    .merge(area_counts, on='area', how='left')
)

In [16]:
# Smallest and largest per-area head-counts, taken in a single aggregation.
# assign_points compares each record's people_count against these two values.
min_people, max_people = dataset['people_count'].agg(['min', 'max'])


In [None]:
"""
# Assign points based on the area’s people count:
# 3 points for most people, 1 point for least, 2 points for others
"""
def assign_points(row):
    """
    Score a record by how populous its area is.

    Args:
        row: A mapping-like record that supports ``row['people_count']``
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        int: 3 if the count equals the module-level ``max_people``, otherwise
        1 if it equals the module-level ``min_people``, otherwise 2.
    """
    count = row['people_count']
    # The maximum is checked first, so it wins if max_people == min_people.
    if count == max_people:
        return 3
    return 1 if count == min_people else 2

# Vectorised form of the assign_points rule. A row-wise DataFrame.apply builds
# a Series object for every one of the rows, which is needlessly slow here;
# boolean masks over the whole column give the same integer result.
people_counts = dataset['people_count']
area_points = pd.Series(2, index=dataset.index)  # default: 2 points
area_points[people_counts == min_people] = 1
# Written last so the maximum takes precedence when min_people == max_people,
# matching the branch order in assign_points.
area_points[people_counts == max_people] = 3
dataset['area_points'] = area_points


In [18]:
# Hold out 30% of the records as the test set. The fixed random_state keeps
# the partition stable across runs for a given input frame.
train_data, test_data = train_test_split(
    dataset,
    test_size=0.3,
    random_state=42,
)


In [19]:
# Write both splits to disk. The target directories are created first because
# to_csv does not create missing parent folders and would raise on a fresh
# checkout where ../data/train or ../data/test do not exist yet.
train_path = Path("../data/train/bangalore_train.csv")
test_path = Path("../data/test/bangalore_test.csv")
for split_frame, csv_path in ((train_data, train_path), (test_data, test_path)):
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    split_frame.to_csv(csv_path, index=False)