In [None]:

# metrics 
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("RegressionModelEvaluation").getOrCreate()

# Load your data (placeholder path: point it at a real LIBSVM file before running)
data = spark.read.format("libsvm").load("path/to/your/data")

# Split the data into training and test sets.
# A fixed seed keeps the split (and therefore every metric below) reproducible
# across kernel restarts.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# Define the models
lr = LinearRegression(featuresCol='features', labelCol='label')
rf = RandomForestRegressor(featuresCol='features', labelCol='label')
gbt = GBTRegressor(featuresCol='features', labelCol='label')

# Train the models
lr_model = lr.fit(train_data)
rf_model = rf.fit(train_data)
gbt_model = gbt.fit(train_data)

# Make predictions on the held-out test set
lr_predictions = lr_model.transform(test_data)
rf_predictions = rf_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

# Initialize evaluators: one per metric, all comparing "prediction" against "label"
r2_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
rmse_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
mae_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="mae")
mse_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="mse")

# Evaluate the models: display name -> DataFrame of test-set predictions
models = {
    "Linear Regression": lr_predictions,
    "Random Forest Regressor": rf_predictions,
    "GBT Regressor": gbt_predictions
}

for name, predictions in models.items():
    # Each prediction DataFrame is scored by four evaluators; caching it avoids
    # re-running the model's transform once per metric.
    predictions.cache()
    r2 = r2_evaluator.evaluate(predictions)
    rmse = rmse_evaluator.evaluate(predictions)
    mae = mae_evaluator.evaluate(predictions)
    mse = mse_evaluator.evaluate(predictions)
    predictions.unpersist()
    print(f"{name} Evaluation Metrics:")
    print(f"R2: {r2}")
    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print("="*40)

# Stop Spark session
spark.stop()


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor

# Start (or reuse) the Spark session for this example
spark = SparkSession.builder.appName("GBTRegressorExample").getOrCreate()

# Toy dataset: three rows, with "label" as the regression target
rows = [
    (0, "red", "SUV", 12, 20.0, 60, 5),
    (1, "blue", "sedan", 9, 30.0, 70, 10),
    (2, "green", "truck", 15, 25.0, 80, 3),
]
column_names = ["id", "color", "type", "hour", "label", "milesperhour", "age"]
data = spark.createDataFrame(rows, column_names)

# One StringIndexer per categorical column, each writing to "<column>_index".
# "hour" holds integers but is indexed the same way as the string columns.
categorical_cols = ["color", "type", "hour"]
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index")
    for name in categorical_cols
]

# Combine the indexed categoricals and the numeric columns into one "features" vector
feature_inputs = [f"{name}_index" for name in categorical_cols] + ["milesperhour", "age"]
assembler = VectorAssembler(inputCols=feature_inputs, outputCol="features")

# Fit the preprocessing pipeline and apply it to the same data
pipeline = Pipeline(stages=[*indexers, assembler])
model = pipeline.fit(data)
transformed_data = model.transform(data)

# Fit the gradient-boosted tree regressor on the assembled features
gbt = GBTRegressor(featuresCol="features", labelCol="label")
gbt_model = gbt.fit(transformed_data)

# Optional: inspect the assembled feature vectors next to their labels
transformed_data.select("id", "features", "label").show()

# Release the Spark session
spark.stop()


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor

# Start (or reuse) the Spark session for this example
spark = SparkSession.builder.appName("RegressorExamples").getOrCreate()

# Toy dataset: three rows, with "label" as the regression target
rows = [
    (0, "red", "SUV", 12, 20.0, 60, 5),
    (1, "blue", "sedan", 9, 30.0, 70, 10),
    (2, "green", "truck", 15, 25.0, 80, 3),
]
column_names = ["id", "color", "type", "hour", "label", "milesperhour", "age"]
data = spark.createDataFrame(rows, column_names)

categorical_cols = ["color", "type", "hour"]
numeric_cols = ["milesperhour", "age"]
index_cols = [f"{name}_index" for name in categorical_cols]
vec_cols = [f"{name}_vec" for name in categorical_cols]

# One StringIndexer per categorical column ("hour" is indexed like the string columns)
indexers = [
    StringIndexer(inputCol=source, outputCol=target)
    for source, target in zip(categorical_cols, index_cols)
]

# One-hot encode the indices; only the linear-regression features use these vectors
encoder = OneHotEncoder(inputCols=index_cols, outputCols=vec_cols)

# Linear regression consumes the one-hot vectors plus the numeric columns
assembler_lr = VectorAssembler(inputCols=vec_cols + numeric_cols, outputCol="features")

# Tree-based models consume the raw category indices plus the numeric columns
assembler_tree = VectorAssembler(inputCols=index_cols + numeric_cols, outputCol="features")

# Preprocessing pipeline for linear regression: index -> one-hot -> assemble
pipeline_lr = Pipeline(stages=[*indexers, encoder, assembler_lr])
model_lr = pipeline_lr.fit(data)
transformed_data_lr = model_lr.transform(data)

# Preprocessing pipeline for tree models: index -> assemble
pipeline_tree = Pipeline(stages=[*indexers, assembler_tree])
model_tree = pipeline_tree.fit(data)
transformed_data_tree = model_tree.transform(data)

# Fit each regressor on the feature set built for it
lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(transformed_data_lr)

rf = RandomForestRegressor(featuresCol="features", labelCol="label")
rf_model = rf.fit(transformed_data_tree)

gbt = GBTRegressor(featuresCol="features", labelCol="label")
gbt_model = gbt.fit(transformed_data_tree)

# Optional: inspect both assembled feature sets (linear first, then tree)
for frame in (transformed_data_lr, transformed_data_tree):
    frame.select("id", "features", "label").show()

# Release the Spark session
spark.stop()


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize Spark Session
spark = SparkSession.builder.appName("RegressorExamples").getOrCreate()

# Create the DataFrame
data = spark.createDataFrame([
    (0, "red", "SUV", 12, 20.0, 60, 5),
    (1, "blue", "sedan", 9, 30.0, 70, 10),
    (2, "green", "truck", 15, 25.0, 80, 3),
    (3, "yellow", "SUV", 20, 22.0, 65, 6),
    (4, "white", "sedan", 5, 35.0, 75, 12),
    (5, "black", "truck", 10, 28.0, 85, 7)
], ["id", "color", "type", "hour", "label", "milesperhour", "age"])

# Split the data into training and test sets (80% training, 20% test)
# NOTE: with only six rows the test split is tiny; the metrics are illustrative only.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# String Indexing
# The indexers are fitted on train_data only, and every colour/hour in this data
# is unique, so test rows always contain categories the indexers never saw.
# handleInvalid="keep" maps those to an extra "unknown" index instead of raising
# "Unseen label" when the test set is transformed.
indexers = [
    StringIndexer(inputCol="color", outputCol="color_index", handleInvalid="keep"),
    StringIndexer(inputCol="type", outputCol="type_index", handleInvalid="keep"),
    StringIndexer(inputCol="hour", outputCol="hour_index", handleInvalid="keep")
]

# One-Hot Encoding for Linear Regression (not needed for tree-based models)
encoder = OneHotEncoder(
    inputCols=["color_index", "type_index", "hour_index"],
    outputCols=["color_vec", "type_vec", "hour_vec"]
)

# Assembling Features for Linear Regression
assembler_lr = VectorAssembler(
    inputCols=["color_vec", "type_vec", "hour_vec", "milesperhour", "age"],
    outputCol="features"
)

# Assembling Features for Tree-Based Models
assembler_tree = VectorAssembler(
    inputCols=["color_index", "type_index", "hour_index", "milesperhour", "age"],
    outputCol="features"
)

# Create and Fit the Pipeline for Linear Regression (fit on train only, apply to both splits)
pipeline_lr = Pipeline(stages=indexers + [encoder, assembler_lr])
model_lr = pipeline_lr.fit(train_data)
transformed_train_data_lr = model_lr.transform(train_data)
transformed_test_data_lr = model_lr.transform(test_data)

# Create and Fit the Pipeline for Tree-Based Models (fit on train only, apply to both splits)
pipeline_tree = Pipeline(stages=indexers + [assembler_tree])
model_tree = pipeline_tree.fit(train_data)
transformed_train_data_tree = model_tree.transform(train_data)
transformed_test_data_tree = model_tree.transform(test_data)

# Training the Linear Regression Model
lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(transformed_train_data_lr)

# Training the Random Forest Regressor
rf = RandomForestRegressor(featuresCol="features", labelCol="label")
rf_model = rf.fit(transformed_train_data_tree)

# Training the GBT Regressor
gbt = GBTRegressor(featuresCol="features", labelCol="label")
gbt_model = gbt.fit(transformed_train_data_tree)

# Evaluating the Linear Regression Model
lr_predictions = lr_model.transform(transformed_test_data_lr)
lr_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
lr_rmse = lr_evaluator.evaluate(lr_predictions)
print(f"Linear Regression RMSE: {lr_rmse}")

# Evaluating the Random Forest Regressor
rf_predictions = rf_model.transform(transformed_test_data_tree)
rf_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
rf_rmse = rf_evaluator.evaluate(rf_predictions)
print(f"Random Forest RMSE: {rf_rmse}")

# Evaluating the GBT Regressor
gbt_predictions = gbt_model.transform(transformed_test_data_tree)
gbt_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
gbt_rmse = gbt_evaluator.evaluate(gbt_predictions)
print(f"GBT Regressor RMSE: {gbt_rmse}")

# View Predictions (Optional)
# The "prediction" column exists only on the DataFrames returned by the fitted
# regressors, not on the feature-only transformed_test_data_* frames.
lr_predictions.select("id", "features", "label", "prediction").show()
rf_predictions.select("id", "features", "label", "prediction").show()
gbt_predictions.select("id", "features", "label", "prediction").show()

# Stop Spark Session
spark.stop()


In [None]:
#null drop

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize Spark Session
spark = SparkSession.builder.appName("RegressorExamples").getOrCreate()

# Create the DataFrame
data = spark.createDataFrame([
    (0, "red", "SUV", 12, 20.0, 60, 5, 1),
    (1, "blue", "sedan", 9, 30.0, 70, 10, 0),
    (2, "green", "truck", 15, 25.0, 80, 3, 1),
    (3, "yellow", "SUV", 20, 22.0, 65, 6, 0),
    (4, "white", "sedan", 5, 35.0, 75, 12, 1),
    (5, "black", "truck", 10, 28.0, 85, 7, 0),
    (6, None, "sedan", 8, None, 70, 9, 1),  # Example with null values
    (7, "blue", None, 5, 30.0, None, 4, 1)
], ["id", "color", "type", "hour", "label", "milesperhour", "age", "isnew"])

# Drop rows with any null values (rows 6 and 7 above)
data = data.dropna()

# Split the data into training and test sets (80% training, 20% test)
# NOTE: with only six rows left the test split is tiny; the metrics are illustrative only.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# String Indexing
# The indexers are fitted on train_data only, and every remaining colour/hour is
# unique, so test rows always contain categories the indexers never saw.
# handleInvalid="keep" maps those to an extra "unknown" index instead of raising
# "Unseen label" when the test set is transformed.
indexers = [
    StringIndexer(inputCol="color", outputCol="color_index", handleInvalid="keep"),
    StringIndexer(inputCol="type", outputCol="type_index", handleInvalid="keep"),
    StringIndexer(inputCol="hour", outputCol="hour_index", handleInvalid="keep")
]

# One-Hot Encoding for Linear Regression (not needed for tree-based models)
encoder = OneHotEncoder(
    inputCols=["color_index", "type_index", "hour_index"],
    outputCols=["color_vec", "type_vec", "hour_vec"]
)

# Assembling Features for Linear Regression
assembler_lr = VectorAssembler(
    inputCols=["color_vec", "type_vec", "hour_vec", "milesperhour", "age", "isnew"],
    outputCol="features"
)

# Assembling Features for Tree-Based Models
assembler_tree = VectorAssembler(
    inputCols=["color_index", "type_index", "hour_index", "milesperhour", "age", "isnew"],
    outputCol="features"
)

# Create and Fit the Pipeline for Linear Regression (fit on train only, apply to both splits)
pipeline_lr = Pipeline(stages=indexers + [encoder, assembler_lr])
model_lr = pipeline_lr.fit(train_data)
transformed_train_data_lr = model_lr.transform(train_data)
transformed_test_data_lr = model_lr.transform(test_data)

# Create and Fit the Pipeline for Tree-Based Models (fit on train only, apply to both splits)
pipeline_tree = Pipeline(stages=indexers + [assembler_tree])
model_tree = pipeline_tree.fit(train_data)
transformed_train_data_tree = model_tree.transform(train_data)
transformed_test_data_tree = model_tree.transform(test_data)

# Training the Linear Regression Model
lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(transformed_train_data_lr)

# Training the Random Forest Regressor
rf = RandomForestRegressor(featuresCol="features", labelCol="label")
rf_model = rf.fit(transformed_train_data_tree)

# Training the GBT Regressor
gbt = GBTRegressor(featuresCol="features", labelCol="label")
gbt_model = gbt.fit(transformed_train_data_tree)

# Evaluating the Linear Regression Model
lr_predictions = lr_model.transform(transformed_test_data_lr)
lr_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
lr_rmse = lr_evaluator.evaluate(lr_predictions)
print(f"Linear Regression RMSE: {lr_rmse}")

# Evaluating the Random Forest Regressor
rf_predictions = rf_model.transform(transformed_test_data_tree)
rf_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
rf_rmse = rf_evaluator.evaluate(rf_predictions)
print(f"Random Forest RMSE: {rf_rmse}")

# Evaluating the GBT Regressor
gbt_predictions = gbt_model.transform(transformed_test_data_tree)
gbt_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
gbt_rmse = gbt_evaluator.evaluate(gbt_predictions)
print(f"GBT Regressor RMSE: {gbt_rmse}")

# View Predictions (Optional)
# The "prediction" column exists only on the DataFrames returned by the fitted
# regressors, not on the feature-only transformed_test_data_* frames.
lr_predictions.select("id", "features", "label", "prediction").show()
rf_predictions.select("id", "features", "label", "prediction").show()
gbt_predictions.select("id", "features", "label", "prediction").show()

# Stop Spark Session
spark.stop()


In [None]:
# check

from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, lit
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize Spark Session
spark = SparkSession.builder.appName("RegressorExamples").getOrCreate()

# Create the DataFrame
data = spark.createDataFrame([
    (0, "red", "SUV", 12, 20.0, 60, 5, 1),
    (1, "blue", "sedan", 9, 30.0, 70, 10, 0),
    (2, "green", "truck", 15, 25.0, 80, 3, 1),
    (3, "yellow", "SUV", 20, 22.0, 65, 6, 0),
    (4, "white", "sedan", 5, 35.0, 75, 12, 1),
    (5, "black", "truck", 10, 28.0, 85, 7, 0),
    (6, None, "sedan", 8, None, 70, 9, 1),  # Example with null values
    (7, "blue", None, 5, 30.0, None, 4, 1)
], ["id", "color", "type", "hour", "label", "milesperhour", "age", "isnew"])

# Drop rows whose label or numeric inputs are null: those cannot be imputed with
# a placeholder. Null categoricals are kept here and filled with 'missing' below;
# dropping on every column first would leave that fill with nothing to do.
data = data.dropna(subset=["label", "hour", "milesperhour", "age", "isnew"])

# Debugging: Print schema and check for nulls in specific columns
data.printSchema()
data.select("color", "type", "hour").show()

# Fill nulls in categorical columns with 'missing'
data = data.withColumn("color", coalesce(data["color"], lit("missing")))
data = data.withColumn("type", coalesce(data["type"], lit("missing")))

# Split the data into training and test sets (80% training, 20% test)
# NOTE: with only a handful of rows the test split is tiny; the metrics are illustrative only.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# String Indexing
# The indexers are fitted on train_data only, so test rows can contain
# categories they never saw (every colour/hour here is unique).
# handleInvalid="keep" maps those to an extra "unknown" index instead of raising
# "Unseen label" when the test set is transformed.
indexers = [
    StringIndexer(inputCol="color", outputCol="color_index", handleInvalid="keep"),
    StringIndexer(inputCol="type", outputCol="type_index", handleInvalid="keep"),
    StringIndexer(inputCol="hour", outputCol="hour_index", handleInvalid="keep")
]

# One-Hot Encoding for Linear Regression (not needed for tree-based models)
encoder = OneHotEncoder(
    inputCols=["color_index", "type_index", "hour_index"],
    outputCols=["color_vec", "type_vec", "hour_vec"]
)

# Assembling Features for Linear Regression
assembler_lr = VectorAssembler(
    inputCols=["color_vec", "type_vec", "hour_vec", "milesperhour", "age", "isnew"],
    outputCol="features"
)

# Assembling Features for Tree-Based Models
assembler_tree = VectorAssembler(
    inputCols=["color_index", "type_index", "hour_index", "milesperhour", "age", "isnew"],
    outputCol="features"
)

# Create and Fit the Pipeline for Linear Regression (fit on train only, apply to both splits)
pipeline_lr = Pipeline(stages=indexers + [encoder, assembler_lr])
model_lr = pipeline_lr.fit(train_data)
transformed_train_data_lr = model_lr.transform(train_data)
transformed_test_data_lr = model_lr.transform(test_data)

# Create and Fit the Pipeline for Tree-Based Models (fit on train only, apply to both splits)
pipeline_tree = Pipeline(stages=indexers + [assembler_tree])
model_tree = pipeline_tree.fit(train_data)
transformed_train_data_tree = model_tree.transform(train_data)
transformed_test_data_tree = model_tree.transform(test_data)

# Training the Linear Regression Model
lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(transformed_train_data_lr)

# Training the Random Forest Regressor
rf = RandomForestRegressor(featuresCol="features", labelCol="label")
rf_model = rf.fit(transformed_train_data_tree)

# Training the GBT Regressor
gbt = GBTRegressor(featuresCol="features", labelCol="label")
gbt_model = gbt.fit(transformed_train_data_tree)

# Evaluating the Linear Regression Model
lr_predictions = lr_model.transform(transformed_test_data_lr)
lr_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
lr_rmse = lr_evaluator.evaluate(lr_predictions)
print(f"Linear Regression RMSE: {lr_rmse}")

# Evaluating the Random Forest Regressor
rf_predictions = rf_model.transform(transformed_test_data_tree)
rf_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
rf_rmse = rf_evaluator.evaluate(rf_predictions)
print(f"Random Forest RMSE: {rf_rmse}")

# Evaluating the GBT Regressor
gbt_predictions = gbt_model.transform(transformed_test_data_tree)
gbt_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
gbt_rmse = gbt_evaluator.evaluate(gbt_predictions)
print(f"GBT Regressor RMSE: {gbt_rmse}")

# View Predictions (Optional)
# The "prediction" column exists only on the DataFrames returned by the fitted
# regressors, not on the feature-only transformed_test_data_* frames.
lr_predictions.select("id", "features", "label", "prediction").show()
rf_predictions.select("id", "features", "label", "prediction").show()
gbt_predictions.select("id", "features", "label", "prediction").show()

# Stop Spark Session
spark.stop()


# histogram

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import matplotlib.pyplot as plt

# Initialize Spark Session
spark = SparkSession.builder.appName("HistogramExample").getOrCreate()

# This cell expects a DataFrame `df` with an 'error' column to exist already.
# NOTE(review): `df` is not defined in this cell; on a fresh kernel, load it
# first, e.g. by uncommenting and adapting the line below.
# df = spark.read.csv('path_to_your_data.csv', header=True, inferSchema=True)

# Convert the 'error' column to a list of values on the driver.
# Rows with a null (or NaN) error are dropped first: they have no position on
# the axis and plt.hist cannot bin them.
error_values = df.select(col('error')).dropna().rdd.flatMap(lambda x: x).collect()

# Create the histogram data using Matplotlib
plt.hist(error_values, bins=50, edgecolor='black')

# Add labels and title
plt.xlabel('Error')
plt.ylabel('Frequency')
plt.title('Histogram of Error')

# Show the plot
plt.show()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, floor
import matplotlib.pyplot as plt

# Initialize Spark Session
spark = SparkSession.builder.appName("HistogramExample").getOrCreate()

# Example DataFrame creation (replace with your actual DataFrame)
data = [
    (1, None, 3.0, float('nan'), 'abc', True),
    (2, 2, None, 4.0, None, False),
    (None, 3, 3.5, None, 'def', None),
    (4, 4, 4.5, 5.0, 'ghi', True)
]

columns = ['int_col', 'float_col', 'double_col', 'nan_col', 'string_col', 'bool_col']

df = spark.createDataFrame(data, columns)

# Select the column to create the histogram for (e.g., 'float_col')
column_to_plot = 'float_col'

# Number of bins for the histogram
num_bins = 10

# Keep only values that can be binned. Null rows would otherwise form a `None`
# bin, and NaN (which Spark orders above every other value) would become the max.
values_df = df.select(col(column_to_plot)).dropna()

# Range of the remaining values
min_val = values_df.agg({column_to_plot: 'min'}).collect()[0][0]
max_val = values_df.agg({column_to_plot: 'max'}).collect()[0][0]
if min_val is None:
    raise ValueError(f"Column '{column_to_plot}' has no non-null values to plot")

bin_width = (max_val - min_val) / num_bins
if bin_width == 0:
    # All values are identical: use a unit-width bin so the division below is defined.
    bin_width = 1.0

# Calculate the bin edges (num_bins + 1 edges delimit num_bins bins)
bins = [min_val + i * bin_width for i in range(num_bins + 1)]

# Bin the data: index of the bin each value falls into, counted from min_val
binned_df = values_df.withColumn('bin', floor((col(column_to_plot) - min_val) / bin_width))

# Count the number of entries in each bin (only non-empty bins produce a row)
histogram_df = binned_df.groupBy('bin').agg(count('*').alias('count')).orderBy('bin')

# Collect the histogram data
histogram_data = histogram_df.collect()

# Extract bin labels (left edges) and counts.
# Counts start at zero for every bin so empty bins keep their slot; otherwise the
# counts list would be shorter than bin_labels and plt.bar would reject it.
bin_labels = bins[:-1]
counts = [0] * num_bins
for row in histogram_data:
    # The maximum value computes to index num_bins; fold it into the last bin so
    # that bin is closed on the right.
    bin_index = min(int(row['bin']), num_bins - 1)
    counts[bin_index] += row['count']

# Plot the histogram using Matplotlib
plt.bar(bin_labels, counts, width=bin_width, edgecolor='black', align='edge')

# Add labels and title
plt.xlabel(column_to_plot)
plt.ylabel('Frequency')
plt.title(f'Histogram of {column_to_plot}')

# Show the plot
plt.show()


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, floor, when
import matplotlib.pyplot as plt

# Initialize Spark Session
spark = SparkSession.builder.appName("HistogramWithOutliersExample").getOrCreate()

# Example DataFrame creation (replace with your actual DataFrame)
data = [
    (1, None, 3.0, float('nan'), 'abc', True),
    (2, 2, None, 4.0, None, False),
    (None, 3, 3.5, None, 'def', None),
    (4, 4, 4.5, 50.0, 'ghi', True)
]

columns = ['int_col', 'float_col', 'double_col', 'nan_col', 'string_col', 'bool_col']

df = spark.createDataFrame(data, columns)

# Select the column to create the histogram for (e.g., 'double_col')
column_to_plot = 'double_col'

# Number of bins for the histogram
num_bins = 10

# Keep only values that can be binned. Null rows would otherwise produce a `None`
# bin index, and NaN (which Spark orders above every other value) would become the max.
values_df = df.select(col(column_to_plot)).dropna()

# Calculate the minimum and maximum values
min_val = values_df.agg({column_to_plot: 'min'}).collect()[0][0]
max_val = values_df.agg({column_to_plot: 'max'}).collect()[0][0]
if min_val is None:
    raise ValueError(f"Column '{column_to_plot}' has no non-null values to plot")

# Define the threshold for outliers: values in the top 10% of the observed
# range (min..max) are grouped into a single "Outliers" bar.
threshold = max_val - (max_val - min_val) * 0.1

# Calculate the bin width of the regular bins, which span min_val..threshold
bin_width = (threshold - min_val) / num_bins
if bin_width == 0:
    # All values are identical: use a unit-width bin so the division below is defined.
    bin_width = 1.0

# Create bins and group outliers.
# Regular values get indices 0..num_bins-1; index num_bins is reserved for outliers.
# A value exactly at the threshold computes to num_bins, so it is clamped into the
# last regular bin rather than being counted as an outlier.
regular_bin = floor((col(column_to_plot) - min_val) / bin_width)
binned_df = values_df.withColumn(
    'bin',
    when(col(column_to_plot) > threshold, num_bins)
    .when(regular_bin >= num_bins, num_bins - 1)
    .otherwise(regular_bin)
)

# Count the number of entries in each bin (only non-empty bins produce a row)
histogram_df = binned_df.groupBy('bin').agg(count('*').alias('count')).orderBy('bin')

# Collect the histogram data
histogram_data = histogram_df.collect()

# Extract bin labels and counts; counts start at zero so empty bins keep their slot
bin_labels = [min_val + i * bin_width for i in range(num_bins)] + ['Outliers']
counts = [0] * (num_bins + 1)
for row in histogram_data:
    bin_index = row['bin']
    counts[bin_index] = row['count']

# Plot the histogram using Matplotlib.
# The outlier bar is placed at its real position (threshold..max_val): passing the
# string 'Outliers' as an x position on this numeric axis is rejected by matplotlib.
plt.bar(bin_labels[:-1], counts[:-1], width=bin_width, edgecolor='black', align='edge', label='Data')
plt.bar(threshold, counts[-1], width=max_val - threshold, edgecolor='black', align='edge', label=bin_labels[-1])

# Add labels and title
plt.xlabel(column_to_plot)
plt.ylabel('Frequency')
plt.title(f'Histogram of {column_to_plot}')
plt.legend()

# Show the plot
plt.show()


In [1]:
# error: shape mismatch, fix?


from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, floor, when, lit
import matplotlib.pyplot as plt

# Initialize Spark Session
spark = SparkSession.builder.appName("HistogramWithOutliersExample").getOrCreate()

# Example DataFrame creation (replace with your actual DataFrame)
data = [
    (1, None, 3.0, float('nan'), 'abc', True),
    (2, 2, None, 4.0, None, False),
    (None, 3, 3.5, None, 'def', None),
    (4, 4, 4.5, 50.0, 'ghi', True)
]

columns = ['int_col', 'float_col', 'double_col', 'nan_col', 'string_col', 'bool_col']

df = spark.createDataFrame(data, columns)

# Select the column to create the histogram for (e.g., 'double_col')
column_to_plot = 'double_col'

# Number of bins for the histogram
num_bins = 10

# Keep only values that can be binned. Null rows would otherwise produce a `None`
# bin index, and NaN (which Spark orders above every other value) would become the max.
values_df = df.select(col(column_to_plot)).dropna()

# Calculate the minimum and maximum values
min_val = values_df.agg({column_to_plot: 'min'}).collect()[0][0]
max_val = values_df.agg({column_to_plot: 'max'}).collect()[0][0]
if min_val is None:
    raise ValueError(f"Column '{column_to_plot}' has no non-null values to plot")

# Define the threshold for outliers: values in the top 10% of the observed
# range (min..max) are grouped into a single "Outliers" bar.
threshold = max_val - (max_val - min_val) * 0.1

# Calculate the bin width of the regular bins, which span min_val..threshold
bin_width = (threshold - min_val) / num_bins
if bin_width == 0:
    # All values are identical: use a unit-width bin so the division below is defined.
    bin_width = 1.0

# Create bins and group outliers.
# Regular values get indices 0..num_bins-1; index num_bins is reserved for outliers.
# A value exactly at the threshold computes to num_bins, so it is clamped into the
# last regular bin rather than being counted as an outlier.
regular_bin = floor((col(column_to_plot) - min_val) / bin_width)
binned_df = values_df.withColumn(
    'bin',
    when(col(column_to_plot) > threshold, num_bins)
    .when(regular_bin >= num_bins, num_bins - 1)
    .otherwise(regular_bin)
)

# Count the number of entries in each bin (only non-empty bins produce a row)
histogram_df = binned_df.groupBy('bin').agg(count('*').alias('count')).orderBy('bin')

# Collect the histogram data
histogram_data = histogram_df.collect()

# Extract bin labels and counts; counts start at zero so empty bins keep their slot
bin_labels = [min_val + i * bin_width for i in range(num_bins)] + ['Outliers']
counts = [0] * (num_bins + 1)
for row in histogram_data:
    # 'bin' is always an integer index here: null rows were removed above and the
    # outlier bin is encoded as num_bins, never as a string.
    counts[int(row['bin'])] = row['count']

# Plot the histogram using Matplotlib.
# Bars sit on integer slots 0..num_bins so the 'Outliers' bar can share the axis
# with the numeric bins. Each slot is one unit wide, so the bar width is 1 and
# not bin_width, which is in data units.
plt.bar(range(num_bins), counts[:-1], width=1, edgecolor='black', align='edge', label='Data')
plt.bar(num_bins, counts[-1], width=1, edgecolor='black', align='edge', label='Outliers')

# Add labels and title; each tick marks the left edge of its bar
tick_labels = [f"{edge:.2f}" for edge in bin_labels[:-1]] + [bin_labels[-1]]
plt.xticks(range(num_bins + 1), tick_labels, rotation=45)
plt.xlabel(column_to_plot)
plt.ylabel('Frequency')
plt.title(f'Histogram of {column_to_plot}')
plt.legend()

# Show the plot
plt.tight_layout()
plt.show()
