In [7]:
import datetime
import os
import random

import pandas as pd
from google.cloud import storage
from google.oauth2 import service_account

# Path to your service account key.
# The GOOGLE_APPLICATION_CREDENTIALS environment variable takes precedence so the
# notebook can run on another machine without editing this cell; the original
# local path is kept as the fallback.
key_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    r"D:\GCP Project Udemy\Projects Notes for Practise and GIT\linen-age-447106-e3-ab2fc4a09711.json",
)

# Use the credentials for authentication
credentials = service_account.Credentials.from_service_account_file(key_path)

# Initialize the storage client with the credentials.
# The project is taken from the key file itself instead of the
# 'your-project-id' placeholder, which does not name a real project.
client = storage.Client(credentials=credentials, project=credentials.project_id)

# Value pools used when generating synthetic records
property_types = [
    "Apartment", "House", "Condo", "Townhouse",
    "Villa", "Penthouse", "Studio", "Duplex",
]
locations = [
    "Mumbai", "Delhi", "Bangalore", "Hyderabad", "Chennai",
    "Kolkata", "Pune", "Ahmedabad", "Jaipur", "Lucknow",
]

# Generate Real Estate Properties Data
def generate_real_estate_data(num_records, seed=None, location_pool=None, type_pool=None):
    """Build a DataFrame of synthetic real-estate listings.

    Args:
        num_records: Number of rows to generate. Values <= 0 give an empty
            frame that still carries the expected columns.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used, so the same seed always yields the same rows. When omitted,
            the module-level ``random`` generator is used (original behavior).
        location_pool: Optional sequence of locations to sample from.
            Defaults to the notebook-level ``locations`` list.
        type_pool: Optional sequence of property types to sample from.
            Defaults to the notebook-level ``property_types`` list.

    Returns:
        pandas.DataFrame with columns Property_ID, Location, Type, Price,
        Size_SqFt, Bedrooms, Bathrooms, Year_Built. Property IDs are
        sequential: P-00001, P-00002, ...

    Raises:
        IndexError: If a pool is empty and at least one row is requested
            (raised by ``choice``).
    """
    rng = random if seed is None else random.Random(seed)
    # Fall back to the notebook-level lists only when no pool was supplied.
    location_pool = locations if location_pool is None else location_pool
    type_pool = property_types if type_pool is None else type_pool

    columns = ["Property_ID", "Location", "Type", "Price", "Size_SqFt", "Bedrooms", "Bathrooms", "Year_Built"]

    rows = []
    for i in range(1, num_records + 1):
        # The draw order below is fixed (location, type, price, size, bedrooms,
        # bathrooms, year) so a given RNG state always maps to the same row.
        rows.append([
            f"P-{i:05d}",
            rng.choice(location_pool),
            rng.choice(type_pool),
            rng.randint(1000000, 50000000),  # Price in INR
            rng.randint(500, 5000),  # Size in square feet
            rng.randint(1, 6),  # Bedrooms
            rng.randint(1, 5),  # Bathrooms
            rng.randint(1970, 2023),  # Year built
        ])

    return pd.DataFrame(rows, columns=columns)

# Generate Property Owner Data
def generate_owner_data(real_estate_df):
    """Create one synthetic owner record per property in ``real_estate_df``.

    Only the ``Property_ID`` column of the input is read. Owner IDs are
    numbered sequentially (O-00001, O-00002, ...) in the order the property
    IDs appear. Names and contact numbers are drawn from the module-level
    ``random`` generator.

    Returns:
        pandas.DataFrame with columns Owner_ID, Property_ID, Owner_Name,
        Contact_Number.
    """
    given_names = ["Arjun", "Meera", "Ravi", "Priya", "Vikram", "Anjali", "Amit", "Radhika", "Suresh", "Neha"]
    family_names = ["Sharma", "Verma", "Iyer", "Kumar", "Patel", "Gupta", "Rao", "Singh", "Das", "Chauhan"]

    def _owner_row(seq, prop_id):
        # Draw the name parts first and the phone number last.
        full_name = f"{random.choice(given_names)} {random.choice(family_names)}"
        phone = f"+91{random.randint(7000000000, 9999999999)}"
        return [f"O-{seq:05d}", prop_id, full_name, phone]

    rows = [
        _owner_row(seq, prop_id)
        for seq, prop_id in enumerate(real_estate_df["Property_ID"], start=1)
    ]
    return pd.DataFrame(rows, columns=["Owner_ID", "Property_ID", "Owner_Name", "Contact_Number"])



# Generate datasets: 15 synthetic properties, then one owner row per property
real_estate_df = generate_real_estate_data(15)
owner_df = generate_owner_data(real_estate_df)

# Preview the first two property rows
real_estate_df.head(2)

Unnamed: 0,Property_ID,Location,Type,Price,Size_SqFt,Bedrooms,Bathrooms,Year_Built
0,P-00001,Chennai,Villa,29865958,2643,1,2,2004
1,P-00002,Hyderabad,House,42386305,1566,4,3,2007


In [8]:
# Preview the first two owner rows
owner_df.head(2)

Unnamed: 0,Owner_ID,Property_ID,Owner_Name,Contact_Number
0,O-00001,P-00001,Meera Iyer,918228668084
1,O-00002,P-00002,Anjali Iyer,917049457261


In [13]:
# Upload DataFrame to GCP bucket
def upload_to_gcp(bucket_name, folder_path, file_name, dataframe, storage_client=None):
    """Upload a DataFrame to a Cloud Storage bucket as newline-delimited JSON.

    Args:
        bucket_name: Name of the destination bucket.
        folder_path: Object-name prefix ("folder") inside the bucket. Leading
            and trailing slashes are ignored; an empty value writes the object
            at the bucket root.
        file_name: Name of the object within the folder.
        dataframe: DataFrame to serialise, one JSON record per line.
        storage_client: Optional client object exposing ``get_bucket``.
            Defaults to the notebook-level ``client``.

    Returns:
        None. A confirmation line is printed after the upload call returns.
    """
    storage_client = client if storage_client is None else storage_client

    # Join only the non-empty parts so an empty or slash-terminated folder does
    # not produce object names such as "/file.json" or "folder//file.json".
    folder = folder_path.strip("/")
    blob_name = f"{folder}/{file_name}" if folder else file_name

    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_string(dataframe.to_json(orient="records", lines=True), content_type="application/json")
    print(f"Uploaded {file_name} to {bucket_name}/{folder_path}")

# GCP bucket name
bucket_name = "project-bucket-for-pipeline"

# Date-stamped folders: one per calendar day (local time), formatted YYYY-MM-DD
today_date = datetime.date.today().isoformat()
real_estate_folder = f"real_estate_data/real_estate_files/{today_date}"
owner_folder = f"real_estate_data/owners_data_files/{today_date}"

# Upload the property listings to the date-stamped real-estate folder
upload_to_gcp(bucket_name, real_estate_folder, "real_estate_data.json", real_estate_df)


Uploaded real_estate_data.json to project-bucket-for-pipeline/real_estate_data/real_estate_files/2025-01-07


In [12]:
# Upload the owner records to the date-stamped owners folder.
# NOTE(review): relies on upload_to_gcp, bucket_name and owner_folder from the
# preceding cell — run that cell first on a fresh kernel.
upload_to_gcp(bucket_name, owner_folder, "owner_data.json", owner_df)

Uploaded owner_data.json to project-bucket-for-pipeline/real_estate_data/owners_data_files/2025-01-07
