In [1]:
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Create the Spark session for this notebook (getOrCreate reuses a running one).
spark = (
    SparkSession.builder
    .appName("exploitation-pipeline")
    .getOrCreate()
)

# Load the three formatted-zone Parquet datasets used throughout the notebook.
income = spark.read.parquet("formatted_zone/income")
population = spark.read.parquet("formatted_zone/population_by_geographical")
incidences = spark.read.parquet("formatted_zone/incidences")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/23 15:12:58 WARN Utils: Your hostname, MacBook-Air-de-Pol.local, resolves to a loopback address: 127.0.0.1; using 10.60.217.176 instead (on interface en0)
25/06/23 15:12:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/23 15:12:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/23 15:12:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

In [2]:
# Print the schema of each loaded table, in the order income, population, incidences.
for table in (income, population, incidences):
    table.printSchema()

root
 |-- Codi_Districte: integer (nullable = true)
 |-- Nom_Districte: string (nullable = true)
 |-- Nom_Barri: string (nullable = true)
 |-- Població: integer (nullable = true)
 |-- Índex RFD Barcelona = 100: double (nullable = true)
 |-- Codi_Barri: integer (nullable = true)
 |-- Any: integer (nullable = true)

root
 |-- Codi_Districte: long (nullable = true)
 |-- Nom_Barri: string (nullable = true)
 |-- Nom_Districte: string (nullable = true)
 |-- Valor_int: integer (nullable = true)
 |-- SEXE_desc: string (nullable = true)
 |-- NACIONALITAT_REGIO_desc: string (nullable = true)
 |-- Codi_Barri: integer (nullable = true)
 |-- Any: integer (nullable = true)

root
 |-- FITXA_ID: integer (nullable = true)
 |-- TIPUS: string (nullable = true)
 |-- AREA: string (nullable = true)
 |-- ELEMENT: string (nullable = true)
 |-- DETALL: string (nullable = true)
 |-- DIA_DATA_ALTA: integer (nullable = true)
 |-- MES_DATA_ALTA: integer (nullable = true)
 |-- DIA_DATA_TANCAMENT: integer (nullable 

In [3]:
# Preview the first 10 rows of the population table.
population.show(n=10)

+--------------+--------------------+-------------+---------+---------+-----------------------+----------+----+
|Codi_Districte|           Nom_Barri|Nom_Districte|Valor_int|SEXE_desc|NACIONALITAT_REGIO_desc|Codi_Barri| Any|
+--------------+--------------------+-------------+---------+---------+-----------------------+----------+----+
|             1|Sant Pere, Santa ...| Ciutat Vella|      240|   Female|        Northern Africa|         4|2017|
|             1|Sant Pere, Santa ...| Ciutat Vella|      562|     Male|        Northern Africa|         4|2017|
|             1|Sant Pere, Santa ...| Ciutat Vella|        6|     Male|        Southern Africa|         4|2017|
|             1|Sant Pere, Santa ...| Ciutat Vella|       13|   Female|         Western Africa|         4|2017|
|             1|Sant Pere, Santa ...| Ciutat Vella|      127|     Male|         Western Africa|         4|2017|
|             1|Sant Pere, Santa ...| Ciutat Vella|      101|   Female|              Caribbean|         

### Multicultural index

We are going to create the **Shannon Diversity Index**, a measure of biodiversity that considers both the number of species (richness) and their relative abundance (evenness) within a community. A higher Shannon index generally indicates a more diverse and balanced community, while a lower index suggests a less diverse community, potentially dominated by one or a few species.

In this project, we apply this concept to human populations, using the diversity of nationalities present in a given region. While the Shannon Index is typically used to measure species evenness in ecological communities, we found it an interesting approach to use it for analyzing human diversity — since, biologically, humans are part of the animal kingdom.

**Note:**
Melissa and Pol — we are completely against any form of racism or discrimination. This project and its analysis are purely observational and neutral. There is no judgment of "good" or "bad" in this context. We simply thought it would be insightful to apply the Shannon Diversity Index to the datasets provided in class.

We calculate the diversity index of each neighborhood (barri) by treating each "NACIONALITAT_REGIO_desc" as a distinct "species" for the purpose of the calculation. However, we are fully aware that all humans are equal — we are all part of the same species. We hope this project is not misunderstood, and that its purpose and objectives are seen clearly as an exploration of data, not a commentary on value or worth.

$H = -\sum_i p_i \cdot \log(p_i)$

Where $p_i$ is the proportion of the neighborhood's population that belongs to nationality region $i$.

In [4]:
""" Fist we calculate the tot_nac that is the total population by region and year. Then we calculate the tot_barri that is the total population by neighborhood and year.
Finally, we join both DataFrames to calculate p_i = total_regio / total_barri
Then we calculate the Shannon index for each neighborhood and year using the formula H = -sum(p_i * log(p_i))."""


# Shannon diversity index per neighborhood (Codi_Barri) and year (Any):
#   tot_nac       - population per neighborhood, year and nationality region
#   tot_barri     - total population per neighborhood and year
#   joined        - p_i = total_regio / total_barri and the term -p_i * log(p_i)
#   shannon_index - H = -sum(p_i * log(p_i)), i.e. the sum of those terms

tot_nac = (
    population
    .groupBy("Codi_Barri", "Any", "NACIONALITAT_REGIO_desc")
    .agg(F.sum("Valor_int").alias("total_regio"))
)

tot_barri = (
    population
    .groupBy("Codi_Barri", "Any")
    .agg(F.sum("Valor_int").alias("total_barri"))
)

# Inner join on (Codi_Barri, Any) attaches the neighborhood total to every
# region row, so the share p_i and its entropy term can be computed per row.
joined = (
    tot_nac
    .join(tot_barri, on=["Codi_Barri", "Any"], how="inner")
    .withColumn("p_i", F.col("total_regio") / F.col("total_barri"))
    .withColumn("p_logp", -F.col("p_i") * F.log(F.col("p_i")))
)

# Summing the per-region terms gives one index value per neighborhood and year.
shannon_index = (
    joined
    .groupBy("Codi_Barri", "Any")
    .agg(F.sum("p_logp").alias("shannon_index"))
)


In [5]:
# Preview the first 10 (Codi_Barri, Any) rows of the Shannon index.
shannon_index.show(n=10)

                                                                                

+----------+----+-------------------+
|Codi_Barri| Any|      shannon_index|
+----------+----+-------------------+
|        36|2015|0.27124215732675566|
|        36|2014| 0.2884713672303176|
|        27|2017| 0.5537600665257805|
|        54|2015| 0.6366169293476048|
|        50|2016| 0.8620872214339319|
|        55|2014| 1.1820426239639124|
|        45|2016| 0.6559294251424883|
|        64|2017| 0.7196874620130499|
|        61|2014| 0.5810116441918529|
|        23|2017| 0.5524182004287684|
+----------+----+-------------------+
only showing top 10 rows


The advantage of using the Shannon Diversity Index is that it not only counts how many different nationalities are present, but also considers how evenly they are distributed within the same region. For example, if a neighborhood has 10 different nationalities but one of them makes up 90% of the population, it will have a lower index than another neighborhood where all nationalities are more evenly distributed.

In [None]:
# Disabled alternative, kept for reference only: per Codi_Barri, count the
# distinct NACIONALITAT_REGIO_desc values that have total_regio > 0.
#multi = (tot_nac.filter("total_regio > 0").groupBy("Codi_Barri").agg(F.countDistinct("NACIONALITAT_REGIO_desc").alias("multicultural_idx")))

**Note for Mel: I know we still have to change this — the cell below is just a test to see how `partitionBy` works.**

In [6]:
# Persist the Shannon index to the exploitation zone as CSV with a header row,
# partitioned by year: Spark writes one `Any=<year>/` sub-directory per year and
# drops the `Any` column from the files themselves.
# The map cell further down reads its input from those `Any=<year>/`
# directories, so partitionBy("Any") must stay enabled for a fresh
# Restart & Run All to produce the layout that cell expects.
(
    shannon_index.write
    .option("header", True)
    .mode("overwrite")
    .partitionBy("Any")
    .csv("exploitation_zone/shannon_index")
)

In [7]:
# Inspect the first 100 rows of the per-region population totals.
tot_nac.show(n=100)

+----------+----+-----------------------+-----------+
|Codi_Barri| Any|NACIONALITAT_REGIO_desc|total_regio|
+----------+----+-----------------------+-----------+
|         4|2015|              Caribbean|        213|
|        25|2016|       Northern America|        104|
|        69|2016|         Western Europe|        371|
|        69|2015|          South America|        316|
|         9|2015|          Not available|          5|
|        32|2016|         Western Africa|         27|
|         9|2017|     South-Eastern Asia|        358|
|        26|2016|     South-Eastern Asia|        164|
|        69|2015|         Western Africa|         17|
|         4|2017|        Northern Europe|        948|
|         4|2017|          Not available|          5|
|         4|2015|         Western Europe|       1414|
|        69|2017|           Eastern Asia|        156|
|        66|2017|        Central America|         87|
|         4|2016|        Northern Africa|        756|
|         9|2017|           

### Visual study using geojson
We use the `BarcelonaCiutat_Barris.csv` lookup table from `landing_zone/lookup_tables`, which provides the geometry of each neighborhood, to carry out a more visual analysis.

In [None]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from shapely import wkt

SHANNON_DIR = Path("exploitation_zone/shannon_index")


def load_shannon_index(base_dir):
    """Load every Shannon-index CSV part file under ``base_dir`` into one DataFrame.

    Part files are discovered by pattern instead of by hard-coded name, because
    Spark puts a run-specific UUID in each part-file name. Two layouts are
    accepted:

    * partitioned: ``base_dir/Any=<year>/part-*.csv`` - the year is taken from
      the directory name, since the files themselves have no ``Any`` column;
    * flat: ``base_dir/part-*.csv`` - the year is read from the ``Any`` column.

    Returns a DataFrame with ``Any`` as int and ``Codi_Barri`` as str. Rows with
    a missing or non-numeric year, or a missing ``Codi_Barri``, are dropped.

    Raises FileNotFoundError when no non-empty part file is found.
    """
    frames = []
    for csv_path in sorted(Path(base_dir).rglob("part-*.csv")):
        if csv_path.stat().st_size == 0:
            # A zero-byte part file would make pd.read_csv raise EmptyDataError.
            continue
        frame = pd.read_csv(csv_path)
        partition = csv_path.parent.name
        if partition.startswith("Any="):
            frame["Any"] = partition.split("=", 1)[1]
        frames.append(frame)

    if not frames:
        raise FileNotFoundError(
            f"No Shannon index CSV part files found under {base_dir}"
        )

    shannon = pd.concat(frames, ignore_index=True)
    # Non-numeric year labels become NaN here and are dropped just below.
    shannon["Any"] = pd.to_numeric(shannon["Any"], errors="coerce")
    shannon = shannon.dropna(subset=["Any", "Codi_Barri"])
    shannon["Any"] = shannon["Any"].astype(int)
    # Go through int first so a float-typed column yields "4" rather than "4.0".
    shannon["Codi_Barri"] = shannon["Codi_Barri"].astype(int).astype(str)
    return shannon


# Neighborhood geometries: the lookup CSV stores them as WKT strings in the
# `geometria_wgs84` column; they are parsed and tagged as EPSG:4326.
barrios = pd.read_csv("landing_zone/lookup_tables/BarcelonaCiutat_Barris.csv")
barrios["geometry"] = barrios["geometria_wgs84"].apply(wkt.loads)
barrios_gdf = gpd.GeoDataFrame(barrios, geometry="geometry", crs="EPSG:4326")

# Both join keys are compared as strings.
barrios_gdf["codi_barri"] = barrios_gdf["codi_barri"].astype(str)

shannon_all = load_shannon_index(SHANNON_DIR)

# One choropleth map per year, in ascending year order.
for year, shannon_df in shannon_all.groupby("Any"):
    # Merge geometries with index data
    merged = barrios_gdf.merge(shannon_df, left_on="codi_barri", right_on="Codi_Barri")

    # Draw on an explicit Axes: without `ax=`, GeoDataFrame.plot opens its own
    # figure, which leaves a blank extra figure and ignores the figsize.
    fig, ax = plt.subplots(figsize=(12, 8))
    merged.plot(column="shannon_index", cmap="YlOrRd", legend=True,
                edgecolor="black", ax=ax)
    ax.set_title(f"Shannon Diversity Index by Neighborhood ({year})", fontsize=15)
    ax.set_axis_off()
    fig.tight_layout()
    plt.show()



 ### Average Income Per Neighborhood

We calculate the **Average Income per Neighborhood** as a socioeconomic indicator that reflects the economic status and purchasing power of residents in each area. This KPI provides insights into income inequality across different neighborhoods in Barcelona and can be correlated with other social indicators such as diversity and public safety.

The average income is calculated by aggregating all individual income records within each neighborhood (Codi_Barri) for each year, providing a mean value that represents the typical economic situation of residents. This metric is particularly valuable for understanding which areas might need more social investment or economic development programs. The KPI can also help governments identify income disparities and make informed decisions about resource allocation and social services distribution.

Higher average income values typically indicate more affluent neighborhoods, while lower values may suggest areas with greater economic challenges or opportunities for targeted social support programs.


In [8]:
# Mean of the "Índex RFD Barcelona = 100" column per neighborhood and year,
# sorted by neighborhood code and then year. The backticks are needed because
# the column name contains spaces and "=".
avg_income_neighborhood = (
    income
    .groupBy("Codi_Barri", "Any")
    .agg(F.avg(F.col("`Índex RFD Barcelona = 100`")).alias("avg_income_index"))
    .sort("Codi_Barri", "Any")
)

avg_income_neighborhood.show(n=10)

+----------+----+----------------+
|Codi_Barri| Any|avg_income_index|
+----------+----+----------------+
|         1|2014|            65.9|
|         1|2015|            75.8|
|         1|2016|            74.6|
|         1|2017|            71.2|
|         2|2014|            98.5|
|         2|2015|           108.5|
|         2|2016|           110.5|
|         2|2017|           106.1|
|         3|2014|            84.5|
|         3|2015|            76.6|
+----------+----+----------------+
only showing top 10 rows


### Average Incident Resolution Time Per Neighborhood

We calculate the **Average Incident Resolution Time** per Neighborhood as a key performance indicator for public safety and emergency response efficiency. This metric measures how quickly incidents are resolved from the moment they are reported until they are officially closed, expressed in days. The resolution time is computed by finding the difference between the incident closure date and the incident creation date for each neighborhood and year.
This KPI helps identify service efficiency patterns across Barcelona's neighborhoods, revealing areas where emergency services respond more quickly or slowly. Lower average resolution times indicate more efficient incident management, while higher times may suggest areas needing additional resources or process improvements. However, our analysis reveals potential data quality issues, as many incidents show identical creation and closure dates (0-day resolution), which may indicate systematic data entry practices or bulk processing rather than actual same-day resolution of all incidents.

In [9]:
def _date_from_parts(year_col, month_col, day_col):
    """Build a date column from separate year, month and day columns.

    The three columns are joined as "year-month-day" and parsed with the
    pattern "yyyy-M-d".
    """
    joined_parts = F.concat_ws("-", F.col(year_col), F.col(month_col), F.col(day_col))
    return F.to_date(joined_parts, "yyyy-M-d")


# Attach creation and closure dates and keep only rows where both are non-null.
incident_with_dates = (
    incidences
    .withColumn("creation_date",
                _date_from_parts("Any", "MES_DATA_ALTA", "DIA_DATA_ALTA"))
    .withColumn("closure_date",
                _date_from_parts("ANY_DATA_TANCAMENT", "MES_DATA_TANCAMENT", "DIA_DATA_TANCAMENT"))
    .where(F.col("creation_date").isNotNull() & F.col("closure_date").isNotNull())
)

# Resolution time in days (closure minus creation); negative values are discarded.
incident_resolution = (
    incident_with_dates
    .withColumn("resolution_days",
                F.datediff(F.col("closure_date"), F.col("creation_date")))
    .where(F.col("resolution_days") >= 0)
)

# Per neighborhood and creation year: mean, count, min and max resolution time.
avg_resolution_time = (
    incident_resolution
    .groupBy("Codi_Barri", "Any")
    .agg(
        F.avg("resolution_days").alias("avg_resolution_days"),
        F.count("*").alias("total_incidents"),
        F.min("resolution_days").alias("min_resolution_days"),
        F.max("resolution_days").alias("max_resolution_days"),
    )
    .sort("Codi_Barri", "Any")
)

avg_resolution_time.show(n=10)



+----------+----+-------------------+---------------+-------------------+-------------------+
|Codi_Barri| Any|avg_resolution_days|total_incidents|min_resolution_days|max_resolution_days|
+----------+----+-------------------+---------------+-------------------+-------------------+
|         1|2013| 45.166666666666664|             30|                  2|                202|
|         1|2014|  6.877161055505004|           2198|                  0|                310|
|         1|2015|    8.1679607682877|           2447|                  0|                325|
|         1|2016|  9.845474613686534|           2718|                  0|                177|
|         1|2017|  10.79578886645515|           3467|                  0|                144|
|         2|2013|               20.5|             22|                  7|                 84|
|         2|2014|  8.230582524271844|           1648|                  0|                353|
|         2|2015|  8.825335892514396|           1563|       

                                                                                

In [None]:
# Let's check whether there are empty values.

from pyspark.sql.functions import col, sum as spark_sum, when, isnan, isnull

# Disabled: per-column count of null / NaN values in `incidences`.
# null_counts = incidences.select([
#    spark_sum(when(isnull(col(c)) | isnan(col(c)), 1).otherwise(0)).alias(c)
#    for c in incidences.columns
# ])

#null_counts.show()

# Number of incident rows per neighborhood, largest first.
(
    incidences
    .groupBy("Codi_Barri")
    .count()
    .orderBy(F.col("count").desc())
    .show()
)



+----------+-----+
|Codi_Barri|count|
+----------+-----+
|        31|11510|
|         7|10915|
|         1|10860|
|        18| 9409|
|        60| 9278|
|        68| 9245|
|         4| 8502|
|         9| 8379|
|        26| 8265|
|        11| 8249|
|        19| 8240|
|        35| 7498|
|         2| 7385|
|         8| 7211|
|         6| 6575|
|        37| 6505|
|        10| 6423|
|        43| 6309|
|        13| 5721|
|        64| 5299|
+----------+-----+
only showing top 20 rows


                                                                                

In [12]:
# Why? Let's see where the values come from.

# True when the closure day, month and year all equal the creation day, month and year.
closed_on_creation_date = (
    (F.col("DIA_DATA_ALTA") == F.col("DIA_DATA_TANCAMENT"))
    & (F.col("MES_DATA_ALTA") == F.col("MES_DATA_TANCAMENT"))
    & (F.col("Any") == F.col("ANY_DATA_TANCAMENT"))
)

# Raw date parts for neighborhood 1 in 2016, flagged with the same-day check.
debug_neighborhood_1 = (
    incidences
    .where((F.col("Codi_Barri") == 1) & (F.col("Any") == 2016))
    .select(
        "DIA_DATA_ALTA", "MES_DATA_ALTA", "Any",
        "DIA_DATA_TANCAMENT", "MES_DATA_TANCAMENT", "ANY_DATA_TANCAMENT",
    )
    .withColumn("same_day", closed_on_creation_date)
)

debug_neighborhood_1.show()

# Across all incidents: how many rows have each value of the same-date flag.
same_date_check = (
    incidences
    .withColumn("same_date", closed_on_creation_date)
    .groupBy("same_date")
    .count()
)

same_date_check.show()

+-------------+-------------+----+------------------+------------------+------------------+--------+
|DIA_DATA_ALTA|MES_DATA_ALTA| Any|DIA_DATA_TANCAMENT|MES_DATA_TANCAMENT|ANY_DATA_TANCAMENT|same_day|
+-------------+-------------+----+------------------+------------------+------------------+--------+
|           22|            9|2016|                22|                 9|              2016|    true|
|           22|            9|2016|                22|                 9|              2016|    true|
|           22|            9|2016|                22|                 9|              2016|    true|
|           28|            7|2016|                23|                 9|              2016|   false|
|            9|            8|2016|                23|                 9|              2016|   false|
|           10|            8|2016|                23|                 9|              2016|   false|
|           27|            8|2016|                23|                 9|              2016|



+---------+------+
|same_date| count|
+---------+------+
|     true| 52988|
|    false|269534|
+---------+------+



                                                                                

In [None]:
# TODO: compute the incident-related KPIs and save them.

# NOTE: the KPIs are already created in the exploitation file
# (formatted zone -> exploitation zone).