In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


In [3]:
# Initialize Spark Session
spark = SparkSession.builder.appName("PremiumPrediction").getOrCreate()


In [4]:
# Load Data
df = spark.read.csv("/content/gdrive/MyDrive/AC/output.csv", header=True, inferSchema=True)



In [5]:
df.show()

+---+----+------+-------------+--------------+--------------------+---------------+------------------+------------------+--------+-----------+---------------+-----------+------------+------------------+-----------------+--------------+------------------+-------------+--------------+
| id| Age|Gender|Annual Income|Marital Status|Number of Dependents|Education Level|        Occupation|      Health Score|Location|Policy Type|Previous Claims|Vehicle Age|Credit Score|Insurance Duration|Customer Feedback|Smoking Status|Exercise Frequency|Property Type|Premium Amount|
+---+----+------+-------------+--------------+--------------------+---------------+------------------+------------------+--------+-----------+---------------+-----------+------------+------------------+-----------------+--------------+------------------+-------------+--------------+
|  0|19.0|Female|      10049.0|       Married|                 1.0|     Bachelor's|     Self-Employed| 22.59876067181393|   Urban|          3|      

In [6]:
from pyspark.sql.functions import col, sum

# Total row count
total_rows = df.count()

# Calculate missing value percentage for each column
missing_values = []
for c in df.columns:
    missing_count = df.select(sum(col(c).isNull().cast("int"))).collect()[0][0]
    missing_percentage = (missing_count / total_rows) * 100
    missing_values.append((c, missing_percentage))

# Print the results as a list
missing_values_sorted = sorted(missing_values, key=lambda x: x[1], reverse=True)  # Sort by highest missing %
for col_name, perc in missing_values_sorted:
    print(f"Column: {col_name}, Missing: {perc:.2f}%")


Column: id, Missing: 0.00%
Column: Age, Missing: 0.00%
Column: Gender, Missing: 0.00%
Column: Annual Income, Missing: 0.00%
Column: Marital Status, Missing: 0.00%
Column: Number of Dependents, Missing: 0.00%
Column: Education Level, Missing: 0.00%
Column: Occupation, Missing: 0.00%
Column: Health Score, Missing: 0.00%
Column: Location, Missing: 0.00%
Column: Policy Type, Missing: 0.00%
Column: Previous Claims, Missing: 0.00%
Column: Vehicle Age, Missing: 0.00%
Column: Credit Score, Missing: 0.00%
Column: Insurance Duration, Missing: 0.00%
Column: Customer Feedback, Missing: 0.00%
Column: Smoking Status, Missing: 0.00%
Column: Exercise Frequency, Missing: 0.00%
Column: Property Type, Missing: 0.00%
Column: Premium Amount, Missing: 0.00%


In [7]:
# Drop rows with missing target values
df = df.dropna(subset=["Premium Amount"])

In [8]:
from pyspark.sql.functions import col, mean

# 2️⃣ Impute numeric columns with mean
numeric_cols = ["Credit Score", "Number of Dependents", "Health Score", "Annual Income", "Age"]
for col_name in numeric_cols:
    mean_value = df.select(mean(col(col_name))).collect()[0][0]
    df = df.fillna({col_name: mean_value})

# 1️⃣ Fill "Previous Claims" with 0
df = df.fillna({'Previous Claims': 0})


In [9]:
# Impute categorical columns with mode (most frequent value)
categorical_cols = ["Customer Feedback", "Marital Status"]
for col_name in categorical_cols:
    mode_value = df.groupBy(col_name).count().orderBy(col("count").desc()).first()[0]
    df = df.fillna({col_name: mode_value})

In [10]:
df = df.dropna(subset=["Occupation"])

In [11]:
from pyspark.sql.functions import col, sum

# Total row count
total_rows = df.count()

# Calculate missing value percentage for each column
missing_values = []
for c in df.columns:
    missing_count = df.select(sum(col(c).isNull().cast("int"))).collect()[0][0]
    missing_percentage = (missing_count / total_rows) * 100
    missing_values.append((c, missing_percentage))

# Print the results as a list
missing_values_sorted = sorted(missing_values, key=lambda x: x[1], reverse=True)  # Sort by highest missing %
for col_name, perc in missing_values_sorted:
    print(f"Column: {col_name}, Missing: {perc:.2f}%")

Column: id, Missing: 0.00%
Column: Age, Missing: 0.00%
Column: Gender, Missing: 0.00%
Column: Annual Income, Missing: 0.00%
Column: Marital Status, Missing: 0.00%
Column: Number of Dependents, Missing: 0.00%
Column: Education Level, Missing: 0.00%
Column: Occupation, Missing: 0.00%
Column: Health Score, Missing: 0.00%
Column: Location, Missing: 0.00%
Column: Policy Type, Missing: 0.00%
Column: Previous Claims, Missing: 0.00%
Column: Vehicle Age, Missing: 0.00%
Column: Credit Score, Missing: 0.00%
Column: Insurance Duration, Missing: 0.00%
Column: Customer Feedback, Missing: 0.00%
Column: Smoking Status, Missing: 0.00%
Column: Exercise Frequency, Missing: 0.00%
Column: Property Type, Missing: 0.00%
Column: Premium Amount, Missing: 0.00%


In [12]:
# Identify categorical and numerical features
categorical_cols = [col_name for col_name, dtype in df.dtypes if dtype == "string"]
numerical_cols = [col_name for col_name, dtype in df.dtypes if dtype in ["int", "double"] and col_name != "Premium Amount"]

In [13]:
# Identify categorical and numerical features
categorical_cols = [col_name for col_name, dtype in df.dtypes if dtype == "string"]
numerical_cols = [col_name for col_name, dtype in df.dtypes if dtype in ["int", "double"] and col_name != "Premium Amount"]


In [14]:
# Encode categorical features
for col_name in categorical_cols:
    indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_index", handleInvalid="keep")
    df = indexer.fit(df).transform(df)


In [15]:
# Assemble features
feature_cols = [f"{col}_index" for col in categorical_cols] + numerical_cols
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="skip")
df = vector_assembler.transform(df)

In [16]:
# Train-test split
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [17]:
# Verify if label column is numeric
train_data.select("Premium Amount").summary().show()


+-------+-----------------+
|summary|   Premium Amount|
+-------+-----------------+
|  count|           960101|
|   mean|1102.025160894531|
| stddev|864.4544253192232|
|    min|             20.0|
|    25%|            514.0|
|    50%|            872.0|
|    75%|           1508.0|
|    max|           4999.0|
+-------+-----------------+



In [18]:
# Train Model
lr = LinearRegression(featuresCol="features", labelCol="Premium Amount")
lr_model = lr.fit(train_data)

In [19]:
# Predictions
predictions = lr_model.transform(test_data)


In [20]:
from pyspark.sql.functions import col, abs as F_abs
from pyspark.sql import functions as F
# Evaluate Model
evaluator = RegressionEvaluator(labelCol="Premium Amount", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)

mae_evaluator = RegressionEvaluator(labelCol="Premium Amount", predictionCol="prediction", metricName="mae")
mae = mae_evaluator.evaluate(predictions)

r2_evaluator = RegressionEvaluator(labelCol="Premium Amount", predictionCol="prediction", metricName="r2")
r2 = r2_evaluator.evaluate(predictions)

# Compute Accuracy
def compute_accuracy(predictions):
    # Compute error as the absolute difference between predicted and actual premium amount
    predictions = predictions.withColumn("error", F.abs((F.col("prediction") - F.col("Premium Amount")) / F.col("Premium Amount")))

    # Calculate accuracy as the proportion of predictions with error <= 10%
    accuracy = predictions.filter(F.col("error") <= 0.1).count() / predictions.count()
    return accuracy

# Assuming 'predictions' is the DataFrame that contains the predicted and actual values
accuracy = compute_accuracy(predictions)


print(f"✅ Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"✅ Mean Absolute Error (MAE): {mae:.4f}")
print(f"✅ R-squared (R2): {r2:.4f}")
print(f"✅ Accuracy: {accuracy:.4f}")



✅ Root Mean Squared Error (RMSE): 865.3881
✅ Mean Absolute Error (MAE): 668.3908
✅ R-squared (R2): 0.0041
✅ Accuracy: 0.0925


In [21]:
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 9.25%
