In [2]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility.
# This seeds NumPy's global RNG; every np.random call below draws from it,
# so the order of those calls determines the generated data.
np.random.seed(42)

# Base attributes for synthetic data: each list is the set of values a
# categorical column is sampled from (uniformly, with replacement).
genders = ['Male', 'Female']
education_levels = ['School', 'Graduate', 'Postgraduate']
income_levels = ['Low', 'Middle', 'High']
urban_rural = ['Urban', 'Rural']
professions = ['Software Engineer', 'Doctor', 'Government Officer', 'Teacher', 'Business Owner', 'Farmer']

# (low, high) bounds, in lakhs, of the uniform distribution the dowry amount
# is drawn from for each profession. Keys must cover every entry in
# `professions`, otherwise the lookup below raises KeyError.
dowry_amount_ranges = {
    'Software Engineer': (10, 30),
    'Doctor': (15, 35),
    'Government Officer': (12, 25),
    'Teacher': (5, 15),
    'Business Owner': (8, 20),
    'Farmer': (3, 12)
}

# Number of synthetic rows
num_rows = 10000

# The output file name is derived from num_rows so that the file name and the
# confirmation message always agree with the number of rows actually written.
# With the default of 10000 this is 'synthetic_dowry_dataset_10000.csv', the
# name the model-training cell reads.
output_path = f"synthetic_dowry_dataset_{num_rows}.csv"

# Generating synthetic data: one independent uniform draw per column.
synthetic_data = pd.DataFrame({
    'Gender': np.random.choice(genders, num_rows),
    'Education_Level': np.random.choice(education_levels, num_rows),
    'Income_Level': np.random.choice(income_levels, num_rows),
    'Urban_Rural': np.random.choice(urban_rural, num_rows),
    'Profession': np.random.choice(professions, num_rows)
})

# Generating Dowry Amount based on Profession: the amount depends only on the
# profession's (low, high) range and is rounded to 2 decimal places.
synthetic_data['Dowry_Amount_Lakhs'] = synthetic_data['Profession'].apply(
    lambda x: round(np.random.uniform(*dowry_amount_ranges[x]), 2)
)

# Save to CSV (without the DataFrame index column)
synthetic_data.to_csv(output_path, index=False)

print(f"Synthetic dataset with {num_rows:,} rows saved as '{output_path}'")


Synthetic dataset with 10,000 rows saved as 'synthetic_dowry_dataset_10000.csv'


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. Read the synthetic dataset back from disk
df = pd.read_csv("synthetic_dowry_dataset_10000.csv")

# 2. Target is the dowry amount; every other column is a feature
y = df["Dowry_Amount_Lakhs"]
X = df.drop(columns="Dowry_Amount_Lakhs")

# 3. Hold out 20% of the rows (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Columns that are categorical and need encoding
categorical_cols = ['Gender', 'Education_Level', 'Income_Level', 'Urban_Rural', 'Profession']

# 5. Preprocessing: one-hot encode the categorical columns; categories not
#    seen during fit are ignored at predict time instead of raising
category_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer([
    ('cat', category_encoder, categorical_cols),
])

# 6. Chain the preprocessing with a Random Forest Regressor
forest_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', forest_regressor),
])

# 7. Fit the whole pipeline on the training portion
model.fit(X_train, y_train)

# 8. Predict function
def predict_dowry(gender, education, income, location, profession):
    """Predict the dowry amount for a single profile.

    The five arguments are placed in a one-row DataFrame whose column names
    are the feature columns the pipeline was trained on, and that frame is
    passed to the module-level ``model``.

    Args:
        gender: Value for the 'Gender' column.
        education: Value for the 'Education_Level' column.
        income: Value for the 'Income_Level' column.
        location: Value for the 'Urban_Rural' column.
        profession: Value for the 'Profession' column.

    Returns:
        The first (only) prediction returned by ``model.predict``, rounded
        to 2 decimal places.
    """
    feature_columns = {
        'Gender': [gender],
        'Education_Level': [education],
        'Income_Level': [income],
        'Urban_Rural': [location],
        'Profession': [profession],
    }
    single_row = pd.DataFrame(feature_columns)
    predicted_values = model.predict(single_row)
    return round(predicted_values[0], 2)

# 9. Example prediction for one hand-picked profile
predicted_amount = predict_dowry(
    gender='Male',
    education='Graduate',
    income='High',
    location='Urban',
    profession='Farmer',
)
print(f"Predicted Dowry Amount (Lakhs): ₹{predicted_amount}")


Predicted Dowry Amount (Lakhs): ₹7.59
