In [None]:
import os

import numpy as np
import pandas as pd
import statsmodels.api as sm
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType

print("\nðŸš€ Starting Comprehensive Distributed Analysis (Master Metrics & Grouped Regression)")

# ====================================================================
# PART A: OVERALL KPI CALCULATION (PySpark MLlib & Save)
# ====================================================================

# 1. PySpark MLlib: linear regression of price on odometer, fitted after
#    z-score (3-sigma) outlier filtering.
# (Assumes df_clean already has the data after all previous steps)

# Mean and standard deviation of both columns, collected to the driver as a
# single Row so they can be used as plain numbers in the filter below.
stats_df = df_clean.agg(F.mean("price").alias("price_mean"),
                        F.stddev("price").alias("price_std"),
                        F.mean("odometer").alias("odometer_mean"),
                        F.stddev("odometer").alias("odometer_std")).collect()[0]

# Keep only rows whose price AND odometer lie strictly within
# mean +/- 3 * std of their respective column.
df_depr_filtered = df_clean.filter(
    (F.col("price") > stats_df["price_mean"] - 3 * stats_df["price_std"]) &
    (F.col("price") < stats_df["price_mean"] + 3 * stats_df["price_std"]) &
    (F.col("odometer") > stats_df["odometer_mean"] - 3 * stats_df["odometer_std"]) &
    (F.col("odometer") < stats_df["odometer_mean"] + 3 * stats_df["odometer_std"])
).select("price", "odometer")

# MLlib expects the predictors packed into a single vector column.
assembler = VectorAssembler(inputCols=['odometer'], outputCol="features")
df_lr_ready = assembler.transform(df_depr_filtered)

lr = LinearRegression(featuresCol="features", labelCol="price")
lr_model = lr.fit(df_lr_ready)

# Slope of price with respect to odometer (the only feature).
depreciation_per_mile = lr_model.coefficients[0]
# Reported as a loss, so the slope's sign is flipped: a falling price gives a
# positive loss. This matches the sign convention of
# Est_Loss_Per_10k_Miles_USD in the master depreciation table.
loss_per_10k_miles = -depreciation_per_mile * 10000

# 2. Build the key-metric DataFrame and save it
results_df = pd.DataFrame({
    'Metric': ['Overall Depreciation per 1 Mile', 'Overall Loss Per 10k Miles'],
    'Value': [depreciation_per_mile, loss_per_10k_miles],
    'Unit': ['USD', 'USD']
})

# pandas does not create missing parent directories; this is the first write
# into reports/, so make sure it exists.
os.makedirs('reports', exist_ok=True)
results_df.to_csv('reports/analysis_key_metrics.csv', index=False)
print("âœ… analysis_key_metrics.csv file saved (Overall KPI).")


# ====================================================================
# PART B: GROUPED REGRESSION (MASTER DEPRECIATION TABLE)
# ====================================================================

# 1. Output schema of the grouped-map pandas UDF.
#    Every field is nullable; the column names must match the frame the UDF
#    returns.
schema = (
    StructType()
    .add("manufacturer", StringType(), True)
    .add("model", StringType(), True)
    .add("Model_Count", LongType(), True)
    .add("Depreciation_per_Mile", DoubleType(), True)
    .add("Loss_Per_10k_Miles", DoubleType(), True)
    .add("R2_Score", DoubleType(), True)
)

# 2. Pandas UDF definition (one OLS fit per group)
@F.pandas_udf(schema, functionType=F.PandasUDFType.GROUPED_MAP)
def calculate_depreciation_udf(pandas_df):
    """Fit ``price ~ const + odometer`` by OLS for one (manufacturer, model) group.

    Args:
        pandas_df: All rows of a single group as a pandas DataFrame. Must
            contain the columns ``manufacturer``, ``model``, ``price`` and
            ``odometer``.

    Returns:
        A one-row DataFrame with the columns declared in ``schema``
        (``Model_Count`` is the number of rows actually used in the fit), or
        an empty DataFrame with those same columns when the group is skipped:
        fewer than 50 usable listings, no variation in odometer, or the fit
        itself fails.
    """
    # Empty result that still carries the declared column names.
    # NOTE(review): a column-less pd.DataFrame() appears to be rejected by
    # older Spark releases for a column-count mismatch -- confirm against the
    # deployed version.
    no_result = pd.DataFrame(columns=schema.fieldNames())

    # Drop rows with missing or infinite price/odometer so they neither break
    # the fit nor count towards the minimum group size.
    numeric = pandas_df[['price', 'odometer']].replace([np.inf, -np.inf], np.nan)
    usable = pandas_df[numeric.notna().all(axis=1)]

    # At least 50 listings are required for a reliable estimate.
    if len(usable) < 50:
        return no_result

    # With a constant odometer column add_constant() adds no intercept (it
    # treats the column itself as the constant), and the resulting "slope"
    # would be meaningless.
    if usable['odometer'].nunique() < 2:
        return no_result

    try:
        X = sm.add_constant(usable['odometer'])
        y = usable['price']
        model = sm.OLS(y, X).fit()
        depreciation_rate = float(model.params['odometer'])
        r_squared = float(model.rsquared)
    except Exception:
        # Best effort: one group that cannot be fitted must not fail the
        # whole distributed job, so it is simply left out of the output.
        return no_result

    # Build the row column-by-column so each column keeps a proper dtype
    # rather than the all-object frame a transposed Series would give.
    return pd.DataFrame({
        'manufacturer': [usable['manufacturer'].iloc[0]],
        'model': [usable['model'].iloc[0]],
        'Model_Count': [len(usable)],
        'Depreciation_per_Mile': [depreciation_rate],
        'Loss_Per_10k_Miles': [depreciation_rate * 10000],
        'R2_Score': [r_squared],
    })

# 3. Apply the grouped-map UDF once per (manufacturer, model) group
#    (distributed), then derive the reporting columns.
master_depreciation_results_spark = (
    df_clean
    .groupby("manufacturer", "model")
    .apply(calculate_depreciation_udf)
    # Turn the depreciation slope into a positive loss figure.
    .withColumn('Est_Loss_Per_10k_Miles_USD', F.col('Loss_Per_10k_Miles') * -1)
    # R^2 of the per-group fit, rounded to 3 decimals.
    .withColumn('Model_Reliability_R2', F.round(F.col('R2_Score'), 3))
)

# 4. Select the final columns and save the master table.
df_final_master = master_depreciation_results_spark.select(
    *[
        "manufacturer",
        "model",
        "Model_Count",
        "Est_Loss_Per_10k_Miles_USD",
        "Model_Reliability_R2",
    ]
)

# coalesce(1) funnels the result into a single partition before writing.
# NOTE(review): Spark's CSV writer normally treats the path as an output
# directory holding part files rather than a single file -- confirm this is
# what the Power BI import expects.
(
    df_final_master
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("reports/MASTER_DEPRECIATION_RATES.csv")
)

print("âœ… MASTER_DEPRECIATION_RATES.csv file successfully created (Power BI Master Table).")

In [None]:
# Bring the data for the overall depreciation fit into pandas in one go.
# NOTE: toPandas() collects both selected columns for every row onto the driver.
import os
import numpy as np
df_all_data = df_clean.select("price", "odometer").toPandas()

# --- CLEANING STEP (most important) ---
# Turn +/-inf into NaN, then drop every row missing price or odometer, so the
# regression only sees finite values.
df_all_data = df_all_data.replace([np.inf, -np.inf], np.nan)
df_all_data = df_all_data.dropna(subset=['price', 'odometer'])

# Run the OLS regression: price ~ const + odometer
X = sm.add_constant(df_all_data['odometer'])
y = df_all_data['price']

model_overall = sm.OLS(y, X).fit()

# Slope on odometer; the sign is flipped so a falling price is reported as a
# positive loss.
depreciation_per_mile = model_overall.params['odometer']
loss_per_10k_miles = depreciation_per_mile * 10000 * -1

# Build the final pandas DataFrame and save it.
# NOTE: this writes to the same path as the Part A key-metrics file and
# therefore replaces it with this single-row table.
results_df = pd.DataFrame({
    'Metric': ['Overall_Loss_Per_10k_Miles'],
    'Value': [loss_per_10k_miles],
    'Unit': ['USD']
})

# pandas does not create missing parent directories, so make sure reports/
# exists when this cell is run on its own.
os.makedirs('reports', exist_ok=True)
results_df.to_csv('reports/analysis_key_metrics.csv', index=False)
print("âœ… analysis_key_metrics.csv saved.")
