## Households in Slums Clustering

In [0]:
from pyspark.sql.functions import col, when, count, sum, avg

In [0]:
%run ../utils/model_utils

In [0]:
# Read every column of the silver-layer slum households table into `df`.
df = spark.sql(
    "select * from silver.house_holds_slums"
)

In [0]:
display(df)

In [0]:
# `data_preprocessing` is not defined in this notebook (presumably it comes
# from the %run of ../utils/model_utils -- verify). It returns a pair: the
# processed DataFrame, which replaces `df`, and a value bound to
# `numerical_cols`.
# NOTE(review): the exact meaning of null_threshold_percentage=0.5 depends on
# the helper's implementation -- confirm there.
df, numerical_cols = data_preprocessing(
    df,
    null_threshold_percentage=0.5,
)

In [0]:
# Drop the zone/ward identifier columns, then aggregate to one row per City:
# the count and population columns are summed, the literacy rate is averaged.
ward_identifier_cols = ('Zone', 'WardName', 'WardNo.')
summed_cols = [
    'NoNotifiedSlums',
    'NoOfRecognisedSlums',
    'NoOfIdentfiedSlums',
    'SlumPopulationTotal',
    'SlumPopulationMale',
    'SlumPopulationFemale',
    'PopulationSCCategory',
    'PopulationSTCategory',
]

# NOTE: `sum` and `avg` are the pyspark.sql.functions imported at the top of
# the notebook, not the Python builtins. Each aggregate keeps its source
# column name so the output schema mirrors the input column names.
city_aggregates = [sum(name).alias(name) for name in summed_cols]
city_aggregates.append(avg('LiteracyRatePercentage').alias('LiteracyRatePercentage'))

df = df.drop(*ward_identifier_cols).groupBy("City").agg(*city_aggregates)

In [0]:
display(df)

In [0]:
# Where a city has a positive total slum population but a gender column equal
# to 0, replace that gender value with a fixed share of the total; otherwise
# keep the existing value.
# NOTE(review): the 0.52 / 0.48 shares are hard-coded; their source is not
# shown in this notebook -- TODO confirm.
has_population = col("SlumPopulationTotal") > 0
for gender_col, share in (("SlumPopulationMale", 0.52), ("SlumPopulationFemale", 0.48)):
    is_missing = has_population & (col(gender_col) == 0)
    df = df.withColumn(
        gender_col,
        when(is_missing, share * col("SlumPopulationTotal")).otherwise(col(gender_col)),
    )

In [0]:
display(df)

#### City-wise Slum Population

In [0]:
# NOTE(review): `plt` is not imported in the cells of this notebook; it is
# presumably brought into scope by the %run of ../utils/model_utils -- verify.

# --- Chart 1: the 10 cities with the largest total slum population ---
top_10_cities_population = df.orderBy("SlumPopulationTotal", ascending=False).limit(10)
pandas_df = top_10_cities_population.select('City', 'SlumPopulationMale','SlumPopulationFemale').toPandas()

# City-wise stacked bar plot: male and female population stacked per city
ax = pandas_df.set_index('City').plot(kind='bar', stacked=True, figsize=(10, 6))
plt.xlabel('City')
plt.ylabel('Population')
# Show plain numbers on the y axis instead of scientific notation
ax.get_yaxis().get_major_formatter().set_scientific(False)
plt.show()

# --- Chart 2: the 10 cities with the highest literacy rate ---
top_10_cities_literacy = df.orderBy("LiteracyRatePercentage", ascending=False).limit(10)
# Select from the literacy ranking. The previous version selected from
# top_10_cities_population here, so the ranking computed on the line above was
# never used and the chart showed the most populous cities instead.
pandas_df = top_10_cities_literacy.select('City', 'LiteracyRatePercentage').toPandas()

# City-wise (non-stacked) bar plot of the literacy rate
ax = pandas_df.set_index('City').plot(kind='bar', figsize=(10, 6))
plt.xlabel('City')
plt.ylabel('Literacy rate')
ax.get_yaxis().get_major_formatter().set_scientific(False)
plt.show()

In [0]:
#top_10_cities_literacy = df.orderBy("LiteracyRatePercentage", ascending=False).limit(10) pandas_df = top_10_cities_population.select('City', 'LiteracyRatePercentage').toPandas()

In [0]:
#display(top_10_cities_literacy)

In [0]:
# Columns handed to the clustering helper as features.
# NOTE(review): SlumPopulationFemale is not in this list although
# SlumPopulationMale is -- confirm this is intentional.
feature_cols = [
    'SlumPopulationTotal',
    'SlumPopulationMale',
    'PopulationSCCategory',
    'PopulationSTCategory',
    'LiteracyRatePercentage',
]

# `KmeanCluster` is not defined in this notebook (presumably from
# ../utils/model_utils -- verify). Later cells read a "Clusters" column from
# its result.
cluster_df = KmeanCluster(df, feature_cols)

In [0]:
silhouette_score, pca_df = score_and_pca(cluster_df)

In [0]:
displayHTML(f"<h1>Silhouette Score: {silhouette_score:.2f}</h1>")

In [0]:
plot_clusters_pca_result(pca_df)

In [0]:
display(cluster_df)

In [0]:
# NOTE(review): the DataFrame returned by this expression is not assigned, so
# neither `df` nor `cluster_df` is changed by this cell. The expression repeats
# the SlumPopulationMale fill that was already applied to `df` earlier in the
# notebook. It looks like a leftover scratch cell -- confirm before removing.
df.withColumn("SlumPopulationMale", when((col("SlumPopulationTotal") > 0) & (col("SlumPopulationMale") == 0), 0.52 * col("SlumPopulationTotal")).otherwise(col("SlumPopulationMale")))

#### Cluster-wise City Count

In [0]:
# Number of rows (one per city after the City-level aggregation) in each cluster.
city_count = count("*").alias("City Count")
grouped_data = cluster_df.groupBy("Clusters").agg(city_count)

display(grouped_data)

In [0]:
# Average male and female slum population per cluster.
gender_cols = ['SlumPopulationMale', 'SlumPopulationFemale']
grouped_data = cluster_df.groupBy("Clusters").agg(
    *[avg(name).alias(name) for name in gender_cols]
)

# Bring the small aggregated result to pandas, largest male average first.
pandas_df = (
    grouped_data.select('Clusters', *gender_cols)
    .toPandas()
    .sort_values('SlumPopulationMale', ascending=False)
)

# Cluster-wise stacked bar plot: male and female averages stacked per cluster
ax = pandas_df.set_index('Clusters').plot(kind='bar', stacked=True, figsize=(10, 6))
# Show plain numbers on the y axis instead of scientific notation
ax.get_yaxis().get_major_formatter().set_scientific(False)
plt.xlabel('Clusters')
plt.ylabel('Population')
plt.show()

In [0]:
# Attach a readable label to each cluster id: 0 -> "Medium", 1 -> "Small",
# any other value -> "Large".
# NOTE(review): this mapping is hard-coded against the cluster ids; it assumes
# the ids produced by the clustering helper are stable between runs -- TODO
# confirm.
cluster_id = col("Clusters")
slum_cluster_type = (
    when(cluster_id == 0, "Medium")
    .when(cluster_id == 1, "Small")
    .otherwise("Large")
)
df_report = cluster_df.withColumn("Slum_Cluster_Type", slum_cluster_type)

display(df_report)

In [0]:
# Average male and female slum population per labelled cluster type.
gender_cols = ['SlumPopulationMale', 'SlumPopulationFemale']
grouped_data = df_report.groupBy("Slum_Cluster_Type").agg(
    *[avg(name).alias(name) for name in gender_cols]
)

# Bring the small aggregated result to pandas, largest male average first.
pandas_df = (
    grouped_data.select('Slum_Cluster_Type', *gender_cols)
    .toPandas()
    .sort_values('SlumPopulationMale', ascending=False)
)

# Stacked bar plot per cluster type: male and female averages stacked
ax = pandas_df.set_index('Slum_Cluster_Type').plot(kind='bar', stacked=True, figsize=(10, 6))
# Show plain numbers on the y axis instead of scientific notation
ax.get_yaxis().get_major_formatter().set_scientific(False)
plt.xlabel('Clusters')
plt.ylabel('Population')
plt.show()

In [0]:
%run ../config/etl_config

In [0]:
%run ../utils/utils

In [0]:

# Build the data-access object used by the write cell below.
# `config`, `connector_config`, `processing_config` and `data_ops` are not
# defined in this notebook; presumably they come from the %run of
# ../config/etl_config and ../utils/utils -- verify.
source_type = "deltalake"
code_config = config.get_config(json_var=connector_config)
etl_params = config.get_config(json_var=processing_config)

obj_data_ops = data_ops(source_type, code_config)
obj_dl = obj_data_ops.get_source_obj()

# Bare expression: as the last statement of the cell, the notebook shows its repr.
obj_dl

In [0]:
# Persist the labelled cluster report through the project's writer object.
# The arguments name the target as table "household_slums_clusters" in schema
# "gold", in delta format, with an "insert overwrite" write command.
# NOTE(review): how write_data interprets the empty data_path / merge /
# partition arguments is defined in the project utils -- confirm there.
obj_dl.write_data(
    df_data=df_report,
    spark=spark,
    table_name="household_slums_clusters",
    data_path="",
    table_schema="gold",
    str_format="delta",
    write_command="insert overwrite",
    str_merge_cols="",
    str_part_cols="",
    is_table=True,
)

In [0]:
# Read the gold table back and show it, to check what the write cell produced.
# NOTE: this rebinds `df`, replacing the city-level DataFrame used above.
df = spark.sql(
    "select * from gold.household_slums_clusters"
)
display(df)

In [0]:
df.printSchema()