In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import folium

# Load the four input tables from CSV files (paths are relative to the
# current working directory).
demographics, geographics, competitors, pois = map(
    pd.read_csv,
    ('demographics.csv', 'geographics.csv', 'competitors.csv', 'pois.csv'),
)

# Combine everything into a single frame: demographics and geographics are
# joined on 'id' (pandas' default inner join); competitors and POIs are then
# left-joined on 'latitude', so every row of the left-hand frame is kept.
# NOTE(review): 'latitude' alone is matched by exact float equality of a
# single coordinate -- confirm this is the intended key. If competitors/pois
# also carry a 'longitude' column, pandas will suffix the clashing names and
# later data['longitude'] lookups would fail.
data = (
    demographics
    .merge(geographics, on='id')
    .merge(competitors, on='latitude', how='left')
    .merge(pois, on='latitude', how='left')
)

# Handle missing values
# Fill gaps in numeric columns with that column's mean. numeric_only=True is
# needed because pandas >= 2.0 no longer silently skips non-numeric columns
# in DataFrame.mean() and raises a TypeError on text columns instead.
# Non-numeric columns get no fill value and are left untouched.
data.fillna(data.mean(numeric_only=True), inplace=True)

# Scale the data
# Standardize 'age' and 'income' in place (zero mean, unit variance) and keep
# the fitted scaler around so the transform can be reused or inverted.
scaler = StandardScaler()
data[['age', 'income']] = scaler.fit_transform(data[['age', 'income']])

# Perform exploratory data analysis
# One bar chart of value counts per categorical column, each in its own
# 10x6 figure.
for column, title in (
    ('occupation', 'Occupation Distribution'),
    ('education_level', 'Education Level Distribution'),
):
    plt.figure(figsize=(10, 6))
    sns.countplot(x=column, data=data)
    plt.title(title)
    plt.show()

# Scatter of the raw coordinates to show where the rows are located.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='latitude', y='longitude', data=data)
plt.title('Geographic Distribution')
plt.show()

# Perform clustering
# Group rows into 5 clusters using location, age, and income.
# All four features are standardized into a temporary matrix first: k-means
# is distance-based, and the raw latitude/longitude values would otherwise sit
# on a different scale from the already-standardized age/income columns.
# The 'latitude'/'longitude' columns in `data` are left unchanged so they can
# still be plotted as real coordinates.
cluster_columns = ['latitude', 'longitude', 'age', 'income']
cluster_matrix = StandardScaler().fit_transform(data[cluster_columns])

# n_init is pinned explicitly because scikit-learn changed its default
# (10 -> 'auto' in 1.4); this keeps results stable across versions and avoids
# the FutureWarning. random_state makes the initialization reproducible.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
data['cluster'] = kmeans.fit_predict(cluster_matrix)

# Visualize the clusters
# Marker color per cluster label; any label not listed falls back to purple.
# (Cluster 1 was previously given the invalid color name 'ed'.)
cluster_colors = {0: 'blue', 1: 'red', 2: 'green', 3: 'yellow'}
fallback_color = 'purple'

# Base map centered on the fixed coordinates (37.7749, -122.4194).
m = folium.Map(location=[37.7749, -122.4194], zoom_start=12)

# Walk the three needed columns directly instead of using iterrows(), which
# builds a full Series for every row.
for lat, lon, label in zip(data['latitude'], data['longitude'], data['cluster']):
    folium.CircleMarker(
        [lat, lon],
        radius=3,
        color=cluster_colors.get(label, fallback_color),
    ).add_to(m)

# Bare expression: presumably here so a notebook renders the map as the
# cell's output.
m