## Electricity Clustering

In [0]:
from pyspark.sql.functions import col, when, count, sum, avg

In [0]:
%run ../utils/model_utils

In [0]:
# Load every column of the silver-layer electricity consumption table.
df = spark.table("silver.electricity_consumption")

In [0]:
# Preview the data as loaded, before any preprocessing is applied.
display(df)

In [0]:
# data_preprocessing is not defined in this notebook -- presumably it comes
# from the `%run ../utils/model_utils` cell. It hands back the processed
# DataFrame together with the list of numerical column names.
# NOTE(review): confirm whether 0.5 means "0.5 %" or "50 %" of nulls.
df, numerical_cols = data_preprocessing(df, null_threshold_percentage=0.5)

In [0]:
# Inspect the DataFrame returned by data_preprocessing.
display(df)

In [0]:
# Drop the two cities that are excluded from clustering. The comparison is an
# exact, case-sensitive match on CityName; rows with a null CityName are
# dropped as well, because the predicate evaluates to null for them.
# NOTE(review): the reason for excluding these cities is not recorded here.
excluded_cities = ('Port Blair', 'DHARAMSHALA')
df = df.filter(~col('CityName').isin(*excluded_cities))
display(df)

In [0]:
# Cluster on every numerical column returned by data_preprocessing.
feature_cols = numerical_cols
# KmeanCluster is not defined in this notebook -- presumably provided by
# ../utils/model_utils. Later cells rely on its result having a "Clusters" column.
cluster_df = KmeanCluster(df, feature_cols)

In [0]:
# score_and_pca is not defined in this notebook -- presumably provided by
# ../utils/model_utils. Judging by the names it returns the silhouette score
# of the clustering and a PCA projection used for plotting -- TODO confirm.
silhouette_score, pca_df = score_and_pca(cluster_df)

In [0]:
# Show the silhouette score as an HTML heading, formatted to two decimals.
displayHTML(f"<h1>Silhouette Score: {silhouette_score:.2f}</h1>")

In [0]:
# plot_clusters_pca_result is not defined in this notebook -- presumably
# provided by ../utils/model_utils; it receives the PCA output from score_and_pca.
plot_clusters_pca_result(pca_df)

In [0]:
# Inspect the clustered rows, including the assigned "Clusters" column.
display(cluster_df)

#### Cluster-wise City Count

In [0]:
# Number of rows that fall into each cluster.
# (assumes one row per city, as the "City Count" label implies -- TODO confirm)
rows_per_cluster = count("*").alias("City Count")
grouped_data = cluster_df.groupBy("Clusters").agg(rows_per_cluster)
display(grouped_data)

In [0]:
# Average total electricity consumption for each cluster.
grouped_data = (
    cluster_df.groupBy("Clusters")
    .agg(
        avg("ConsumptionofElectricityinlakhunitsTotalConsumption").alias(
            "AverageElectricityConsumption"
        )
    )
    # groupBy gives no ordering guarantee; sort so the bars appear in the
    # same cluster order on every run.
    .orderBy("Clusters")
)

# The aggregate already consists of exactly the two columns needed for the
# chart (one row per cluster), so it is collected to pandas as-is.
pandas_df = grouped_data.toPandas()

# Bar chart with one bar per cluster. Only a single series is plotted, so
# stacked=True has nothing to stack; it is kept to leave the rendering untouched.
# NOTE(review): `plt` is not imported in this notebook -- presumably
# matplotlib.pyplot made available by `%run ../utils/model_utils`; confirm.
ax = pandas_df.set_index('Clusters').plot(kind='bar', stacked=True, figsize=(10, 6))
plt.xlabel('Clusters')
plt.ylabel('Average Consumption')
# Show plain numbers on the y axis instead of scientific notation.
ax.get_yaxis().get_major_formatter().set_scientific(False)
plt.show()

In [0]:
# Attach a human-readable size label to every row based on its cluster id:
# cluster 0 -> "Medium", cluster 1 -> "Small", any other id -> "Large".
# NOTE(review): the id-to-label mapping is hard-coded. Cluster ids carry no
# inherent meaning, so re-check this mapping against the per-cluster averages
# whenever the input data or the clustering configuration changes.
df_report = cluster_df.withColumn(
    "Electricity_consumption_cluster_Type",
    when(col("Clusters") == 0, "Medium")
    .when(col("Clusters") == 1, "Small")
    .otherwise("Large"),
)

display(df_report)

In [0]:
# Average total electricity consumption for each cluster-type label.
grouped_data = df_report.groupBy("Electricity_consumption_cluster_Type").agg(
    avg("ConsumptionofElectricityinlakhunitsTotalConsumption")
    .alias("AverageElectricityConsumption")
)

# The aggregate holds just the label and its average, so collect it directly
# and order the labels from highest to lowest average consumption.
pandas_df = grouped_data.toPandas().sort_values(
    'AverageElectricityConsumption', ascending=False
)

# Bar chart with one bar per cluster type (a single series, so stacked=True
# has nothing to stack).
ax = pandas_df.set_index('Electricity_consumption_cluster_Type').plot(
    kind='bar', stacked=True, figsize=(10, 6)
)
ax.set_xlabel('Clusters')
ax.set_ylabel('Average Electricity Consumption')
# Plain numbers on the y axis rather than scientific notation.
ax.yaxis.get_major_formatter().set_scientific(False)
plt.show()

In [0]:
%run ../config/etl_config

In [0]:
%run ../utils/utils

In [0]:

# Build the data-access object used to write the results.
# `config`, `connector_config`, `processing_config` and `data_ops` are not
# defined in this notebook -- presumably they come from the
# `%run ../config/etl_config` and `%run ../utils/utils` cells.
source_type = "deltalake"
code_config = config.get_config(json_var=connector_config)
etl_params = config.get_config(json_var=processing_config)
obj_data_ops = data_ops(source_type, code_config)
obj_dl = obj_data_ops.get_source_obj()
# Bare expression as the last statement: the notebook echoes the object.
obj_dl

In [0]:
# Persist the labelled cluster report as the Delta table
# gold.electricity_clusters using the "insert overwrite" write command.
obj_dl.write_data(
    df_data=df_report,
    spark=spark,
    table_name="electricity_clusters",
    data_path="",
    table_schema="gold",
    str_format="delta",
    write_command="insert overwrite",
    str_merge_cols="",
    str_part_cols="",
    is_table=True,
)

In [0]:
# Read the freshly written gold table back to verify the write.
# Note: this rebinds `df`, which earlier held the preprocessed input data.
df = spark.table("gold.electricity_clusters")
display(df)

In [0]:
# Print the column names and types of the gold table that was read back.
df.printSchema()