**Housing Price Prediction (San Jose)**

In [2]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Generate sample data: Zillow sample data on houses in San Jose generated from ChatGPT
df_src = pd.read_csv("sanjose_prices_footage_800.csv")  # >100 rows, prompt-compliant

# Prompt 2: randomly generate location labels
rng = np.random.default_rng(42)
df_src["location"] = rng.choice(["Downtown", "Rural", "Suburb"], size=len(df_src))

# Rebuild the original 'data' dict your code expects
data = {
    "square_footage": df_src["square_footage"].astype(int).tolist(),
    "location": df_src["location"].tolist(),
    "price": df_src["price"].astype(int).tolist(),
}
df = pd.DataFrame(data)
# Features and target
X = df[['square_footage', 'location']]
y = df['price']
# Preprocessing: One-hot encode the location column
preprocessor = ColumnTransformer(
transformers=[
('location', OneHotEncoder(sparse_output=False), ['location'])
], remainder='passthrough')
# Create pipeline with preprocessing and model
model = Pipeline(steps=[
('preprocessor', preprocessor),
('regressor', LinearRegression())
])
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)
# Train model
model.fit(X_train, y_train)
# Make prediction for a new house: 2000 sq ft in Downtown
new_house = pd.DataFrame({'square_footage': [2000], 'location': ['Downtown']})
predicted_price = model.predict(new_house)
print(f"Predicted price for a 2000 sq ft house in Downtown: $ {predicted_price[0]:,.2f}")
# Display model coefficients
feature_names = (model.named_steps['preprocessor']
.named_transformers_['location']
.get_feature_names_out(['location'])).tolist() + ['square_footage']
coefficients = model.named_steps['regressor'].coef_
print("\nModel Coefficients:")
for feature, coef in zip(feature_names, coefficients):
  print(f"{feature}: {coef:.2f}")

Predicted price for a 2000 sq ft house in Downtown: $ 1,523,804.70

Model Coefficients:
location_Downtown: 5444.96
location_Rural: -6292.39
location_Suburb: 847.43
square_footage: 755.70


**Customer Churn Prediction**

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# 70 sets of customer data generated with ChatGPT
data = {
    'age': [24, 52, 31, 45, 63, 27, 38, 56, 41, 29,
            47, 33, 60, 22, 35, 49, 58, 26, 44, 32,
            54, 21, 37, 50, 28, 62, 39, 25, 43, 30,
            55, 48, 34, 61, 23, 36, 46, 57, 27, 40,
            53, 29, 45, 31, 59, 24, 38, 52, 26, 41,
            60, 33, 47, 28, 55, 22, 36, 49, 25, 44,
            32, 58, 27, 41, 35, 50, 30, 56, 23, 39],

    'monthly_usage_hours': [35, 72, 48, 60, 25, 88, 54, 31, 67, 45,
                            76, 52, 29, 90, 58, 41, 33, 79, 47, 62,
                            38, 83, 50, 36, 69, 27, 74, 40, 56, 63,
                            32, 86, 49, 34, 78, 43, 61, 30, 71, 55,
                            26, 82, 46, 65, 37, 91, 53, 28, 75, 44,
                            59, 33, 68, 41, 87, 35, 57, 80, 29, 66,
                            47, 73, 39, 84, 51, 37, 64, 92, 42, 70],

    'purchase_amount': [140, 320, 210, 260, 110, 390, 235, 160, 300, 200,
                        340, 225, 130, 420, 245, 180, 150, 360, 205, 275,
                        170, 380, 215, 165, 295, 120, 335, 190, 240, 285,
                        155, 400, 220, 175, 350, 195, 270, 145, 310, 230,
                        125, 365, 205, 280, 175, 440, 225, 135, 330, 210,
                        260, 160, 290, 185, 410, 150, 245, 370, 140, 300,
                        205, 340, 180, 395, 215, 170, 285, 430, 190, 320],

    'customer_service_calls': [5, 1, 4, 2, 7, 0, 3, 6, 2, 5,
                               1, 3, 8, 0, 4, 2, 5, 1, 6, 3,
                               7, 0, 4, 2, 5, 1, 3, 6, 2, 7,
                               0, 4, 1, 3, 5, 2, 6, 0, 4, 1,
                               8, 2, 5, 3, 4, 1, 7, 0, 6, 2,
                               3, 5, 4, 1, 8, 2, 6, 0, 5, 3,
                               2, 7, 1, 4, 3, 6, 0, 5, 2, 4],

    'region': ['North','South','East','West','South','North','West','East','South','North',
               'West','East','South','North','West','East','South','North','West','East',
               'South','North','West','East','South','North','West','East','South','North',
               'West','East','South','North','West','East','South','North','West','East',
               'South','North','West','East','South','North','West','East','South','North',
               'West','East','South','North','West','East','South','North','West','East',
               'South','North','West','East','South','North','West','East','South','North'],

    'churn': [1,0,0,0,1,0,1,0,0,0,
              0,0,1,0,1,0,1,0,1,0,
              1,0,1,0,1,0,1,0,1,0,
              1,0,0,0,1,0,1,0,1,0,
              1,0,1,0,0,0,1,0,1,0,
              0,0,1,0,1,0,1,0,1,0,
              0,0,1,0,1,0,1,0,0,0]
}
df = pd.DataFrame(data)
# Features and target
X = df[['age', 'monthly_usage_hours', 'purchase_amount', 'customer_service_calls',
'region']]
y = df['churn']
# Preprocessing: Scale numerical features and one-hot encode categorical features
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), ['age', 'monthly_usage_hours', 'purchase_amount',
'customer_service_calls']),
('cat', OneHotEncoder(sparse_output=False), ['region'])
])
# Create pipeline with preprocessing and model
model = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', LogisticRegression(random_state=42))
])
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)
# Train model
model.fit(X_train, y_train)
# Predict churn probability for a new customer
new_customer = pd.DataFrame({
'age': [35],
'monthly_usage_hours': [20],
'purchase_amount': [150],
'customer_service_calls': [5],
'region': ['West']
})
churn_probability = model.predict_proba(new_customer)[0][1] # Probability of churn
# Classify based on threshold (0.5)
threshold = 0.5
churn_prediction = 1 if churn_probability > threshold else 0
print(f"Churn Probability for new customer: {churn_probability:.2f}")
print(f"Churn Prediction (1 = churn, 0 = no churn): {churn_prediction}")
# Display model coefficients
feature_names = (model.named_steps['preprocessor']
.named_transformers_['cat']
.get_feature_names_out(['region'])).tolist() + ['age',
'monthly_usage_hours', 'purchase_amount', 'customer_service_calls']
coefficients = model.named_steps['classifier'].coef_[0]
print("\nModel Coefficients:")
for feature, coef in zip(feature_names, coefficients):
  print(f"{feature}: {coef:.2f}")

Churn Probability for new customer: 0.85
Churn Prediction (1 = churn, 0 = no churn): 1

Model Coefficients:
region_East: -0.29
region_North: 0.15
region_South: -0.26
region_West: 0.80
age: -1.50
monthly_usage_hours: -1.26
purchase_amount: 0.94
customer_service_calls: 1.83


**Customer Segmentation**

In [4]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# 70 sets of customer data generated with ChatGPT
data = {
    'annual_spending': [
      1525, 1728, 2058, 1532, 1738, 2074, 1725, 1506, 1848, 1844,
      1599, 1544, 1887, 1870, 1546, 1581, 1784, 1666, 1773, 1675,
      1667, 2070, 1557, 1910, 2080, 1235, 973, 1281, 1282, 1085,
      946, 978, 1205, 1205, 1397, 1248, 1293, 1050, 901, 1397,
      1159, 1330, 978, 1176, 1206, 957, 1022, 943, 1400, 965,
      1035, 1393, 1265, 1124, 1026, 221, 425, 434, 272, 697,
      784, 684, 296, 562, 255, 262, 311, 749, 387, 277,
      251, 295, 616, 610, 202, 492, 358, 259, 521, 714
    ],
    'purchase_frequency': [
      20, 16, 15, 15, 19, 16, 18, 16, 17, 15,
      17, 20, 15, 19, 20, 16, 18, 17, 20, 19,
      18, 16, 16, 17, 20, 11, 10, 12, 12, 9,
      14, 13, 8, 11, 12, 13, 13, 11, 13, 12,
      8, 13, 10, 14, 10, 10, 8, 13, 14, 13,
      12, 9, 10, 12, 9, 5, 1, 1, 5, 2,
      5, 7, 1, 4, 6, 4, 2, 4, 3, 4,
      6, 7, 4, 1, 4, 4, 2, 5, 1, 7
    ],
    'age': [
      33, 48, 43, 27, 44, 47, 43, 47, 29, 27,
      36, 39, 42, 31, 32, 28, 45, 36, 46, 48,
      37, 46, 26, 27, 35, 42, 34, 47, 43, 34,
      31, 35, 42, 46, 57, 33, 40, 35, 58, 54,
      57, 46, 54, 59, 45, 58, 37, 45, 47, 45,
      57, 59, 42, 44, 32, 53, 22, 20, 33, 52,
      48, 44, 60, 44, 59, 64, 30, 26, 47, 69,
      52, 33, 48, 28, 34, 62, 36, 65, 21, 51
    ],
    'region': [
      'North', 'South', 'North', 'West', 'South', 'North', 'West', 'East',
      'West', 'South', 'West', 'East', 'North', 'East', 'North', 'East',
      'West', 'East', 'South', 'North', 'South', 'East', 'East', 'East',
      'South', 'South', 'West', 'South', 'East', 'West', 'West', 'North',
      'West', 'West', 'East', 'North', 'East', 'North', 'West', 'East',
      'South', 'East', 'South', 'South', 'North', 'North', 'East', 'North',
      'North', 'South', 'South', 'West', 'South', 'East', 'North', 'East',
      'South', 'North', 'East', 'East', 'South', 'South', 'South', 'West',
      'West', 'North', 'East', 'South', 'West', 'South', 'North', 'North',
      'South', 'South', 'West', 'West', 'West', 'South', 'North', 'West'
    ]
}
df = pd.DataFrame(data)
# Preprocess data: Select numerical features and scale them
features = ['annual_spending', 'purchase_frequency', 'age']
X = df[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Determine optimal number of clusters using elbow method
inertia = []
K = range(1, 6)
for k in K:
  kmeans = KMeans(n_clusters=k, random_state=42, n_init=10) # Set n_init explicitly
  kmeans.fit(X_scaled)
  inertia.append(kmeans.inertia_)
# Plot elbow curve
plt.figure(figsize=(8, 5))
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.savefig('elbow_plot.png')
plt.close()
# Apply K-Means with optimal K (e.g., 3 based on elbow method)
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10) # Set n_init explicitly
df['cluster'] = kmeans.fit_predict(X_scaled)
# Analyze clusters
cluster_summary = df.groupby('cluster')[features].mean().round(2)
print("Cluster Characteristics:")
display(cluster_summary)
# Example of targeted strategies
for cluster in range(optimal_k):
  print(f"\nCluster {cluster} Strategy:")
  if cluster_summary.loc[cluster, 'annual_spending'] > 1000:
    print("High-spending customers: Offer exclusive promotions or loyalty rewards.")
  elif cluster_summary.loc[cluster, 'purchase_frequency'] > 10:
    print("Frequent buyers: Provide bulk discounts or subscription plans.")
  else:
    print("Low-engagement customers: Send personalized re-engagement campaigns.")
# Save cluster assignments to CSV
df.to_csv('customer_segments.csv', index=False)

Cluster Characteristics:


Unnamed: 0_level_0,annual_spending,purchase_frequency,age
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,591.64,4.57,30.0
1,652.55,6.5,55.45
2,1512.2,15.05,40.02



Cluster 0 Strategy:
Low-engagement customers: Send personalized re-engagement campaigns.

Cluster 1 Strategy:
Low-engagement customers: Send personalized re-engagement campaigns.

Cluster 2 Strategy:
High-spending customers: Offer exclusive promotions or loyalty rewards.
