In [2]:
from pyspark.sql import SparkSession, DataFrame
from pyspark import SparkConf
from pyspark.sql.types import StructField, StructType, StringType, LongType, DoubleType, IntegerType, DateType

# Assemble the Spark configuration in one fluent chain (each setter returns the conf).
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("GCSExample")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)
# The Spark session is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Point the gs:// scheme at the Google Cloud Storage connector classes.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set(
    "fs.gs.impl",
    "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
)
conf.set(
    "fs.AbstractFileSystem.gs.impl",
    "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
)

# Explicit schema for the event CSV: every column is nullable and only the
# three date parts are read as integers; everything else stays a string.
table_schema = (
    StructType()
    .add("eventid", StringType(), True)
    .add("iyear", IntegerType(), True)
    .add("imonth", IntegerType(), True)
    .add("iday", IntegerType(), True)
    .add("extended", StringType(), True)
    .add("resolution", StringType(), True)
    .add("country_txt", StringType(), True)
    .add("region_txt", StringType(), True)
    .add("provstate", StringType(), True)
    .add("city", StringType(), True)
    .add("latitude", StringType(), True)
    .add("longitude", StringType(), True)
    .add("gname", StringType(), True)
    .add("gsubname", StringType(), True)
    .add("gname2", StringType(), True)
    .add("gsubname2", StringType(), True)
    .add("gname3", StringType(), True)
    .add("gsubname3", StringType(), True)
    .add("success", StringType(), True)
    .add("attacktype1_txt", StringType(), True)
)

# Google Cloud Storage location of the event CSV.
# Replace the bucket name with your own; the file must already be uploaded there.
event_file_path = 'gs://data_de2024_ga2/event_informations.csv'
# Read the CSV with the explicit schema; the first line of the file is a header.
df_event = (
    spark.read.format("csv")
    .schema(table_schema)
    .option("header", "true")
    .option("delimiter", ",")
    .load(event_file_path)
)
df_event.printSchema()

df_event.show()

root
 |-- eventid: string (nullable = true)
 |-- iyear: integer (nullable = true)
 |-- imonth: integer (nullable = true)
 |-- iday: integer (nullable = true)
 |-- extended: string (nullable = true)
 |-- resolution: string (nullable = true)
 |-- country_txt: string (nullable = true)
 |-- region_txt: string (nullable = true)
 |-- provstate: string (nullable = true)
 |-- city: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- gname: string (nullable = true)
 |-- gsubname: string (nullable = true)
 |-- gname2: string (nullable = true)
 |-- gsubname2: string (nullable = true)
 |-- gname3: string (nullable = true)
 |-- gsubname3: string (nullable = true)
 |-- success: string (nullable = true)
 |-- attacktype1_txt: string (nullable = true)

+------------+-----+------+----+--------+----------+------------------+--------------------+-----------+-------------+----------+-----------+--------------------+--------+------+---------+------+

In [4]:
# Explicit schema for the weapon CSV: every column is nullable and only the
# three date parts are read as integers; everything else stays a string.
weapon_schema = (
    StructType()
    .add("eventid", StringType(), True)
    .add("weaptype1_txt", StringType(), True)
    .add("weapsubtype1_txt", StringType(), True)
    .add("weaptype2_txt", StringType(), True)
    .add("weapsubtype2_txt", StringType(), True)
    .add("weaptype3_txt", StringType(), True)
    .add("weapsubtype3_txt", StringType(), True)
    .add("weaptype4_txt", StringType(), True)
    .add("weapsubtype4_txt", StringType(), True)
    .add("weapdetail", StringType(), True)
    .add("country_txt", StringType(), True)
    .add("region_txt", StringType(), True)
    .add("provstate", StringType(), True)
    .add("city", StringType(), True)
    .add("iyear", IntegerType(), True)
    .add("imonth", IntegerType(), True)
    .add("iday", IntegerType(), True)
)

# Google Cloud Storage location of the weapon CSV.
# Replace the bucket name with your own; the file must already be uploaded there.
weapon_file_path = 'gs://data_de2024_ga2/weapens_used_at_event.csv'
# Read the CSV with the explicit schema; the first line of the file is a header.
df_weapon = (
    spark.read.format("csv")
    .schema(weapon_schema)
    .option("header", "true")
    .option("delimiter", ",")
    .load(weapon_file_path)
)
df_weapon.printSchema()
df_weapon.show()

root
 |-- eventid: string (nullable = true)
 |-- weaptype1_txt: string (nullable = true)
 |-- weapsubtype1_txt: string (nullable = true)
 |-- weaptype2_txt: string (nullable = true)
 |-- weapsubtype2_txt: string (nullable = true)
 |-- weaptype3_txt: string (nullable = true)
 |-- weapsubtype3_txt: string (nullable = true)
 |-- weaptype4_txt: string (nullable = true)
 |-- weapsubtype4_txt: string (nullable = true)
 |-- weapdetail: string (nullable = true)
 |-- country_txt: string (nullable = true)
 |-- region_txt: string (nullable = true)
 |-- provstate: string (nullable = true)
 |-- city: string (nullable = true)
 |-- iyear: integer (nullable = true)
 |-- imonth: integer (nullable = true)
 |-- iday: integer (nullable = true)

+------------+-------------+--------------------+-------------+----------------+-------------+----------------+-------------+----------------+--------------------+------------------+--------------------+-----------+-------------+-----+------+----+
|     eventid|wea

In [5]:
def merge_dataframes_remove_duplicate_columns(df1: DataFrame, df2: DataFrame) -> DataFrame:
    """
    Inner-join two PySpark DataFrames on 'eventid' without producing duplicate column names.

    Any non-key column of df2 whose name also exists in df1 is dropped from df2
    before joining, so the df1 version of every shared column is the one kept.

    Args:
    df1 (DataFrame): The left DataFrame; its columns win on name clashes.
    df2 (DataFrame): The right DataFrame; clashing non-key columns are removed.

    Returns:
    DataFrame: The inner join of df1 and the trimmed df2 on 'eventid'.
    """
    key_columns = ['eventid']

    # Names that df2 must give up: everything df1 already has, except the join key.
    names_owned_by_left = set(df1.columns) - set(key_columns)
    shared_names = [name for name in df2.columns if name in names_owned_by_left]

    right_trimmed = df2.drop(*shared_names)
    return df1.join(right_trimmed, on=key_columns, how='inner')

# Join events with their weapon records on eventid; columns present in both
# inputs are taken from df_event. Print the schema to confirm there are no duplicates.
df_event_weapons =  merge_dataframes_remove_duplicate_columns(df_event, df_weapon)
df_event_weapons.printSchema()

root
 |-- eventid: string (nullable = true)
 |-- iyear: integer (nullable = true)
 |-- imonth: integer (nullable = true)
 |-- iday: integer (nullable = true)
 |-- extended: string (nullable = true)
 |-- resolution: string (nullable = true)
 |-- country_txt: string (nullable = true)
 |-- region_txt: string (nullable = true)
 |-- provstate: string (nullable = true)
 |-- city: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- gname: string (nullable = true)
 |-- gsubname: string (nullable = true)
 |-- gname2: string (nullable = true)
 |-- gsubname2: string (nullable = true)
 |-- gname3: string (nullable = true)
 |-- gsubname3: string (nullable = true)
 |-- success: string (nullable = true)
 |-- attacktype1_txt: string (nullable = true)
 |-- weaptype1_txt: string (nullable = true)
 |-- weapsubtype1_txt: string (nullable = true)
 |-- weaptype2_txt: string (nullable = true)
 |-- weapsubtype2_txt: string (nullable = true)
 |-- weapt

In [6]:
def filter_events_by_year(df: DataFrame, start_year=2015, end_year=2017) -> DataFrame:
    """
    Keep only the rows whose 'iyear' lies within [start_year, end_year].

    Args:
    df (DataFrame): Input PySpark DataFrame with an 'iyear' column.
    start_year (int): First year to keep (inclusive).
    end_year (int): Last year to keep (inclusive).

    Returns:
    DataFrame: The rows of df that satisfy start_year <= iyear <= end_year.
    """
    year = df["iyear"]
    not_before_start = year >= start_year
    not_after_end = year <= end_year
    return df.filter(not_before_start & not_after_end)

# Keep only events in the default 2015-2017 window (both bounds inclusive).
# NOTE: this rebinds df_event_weapons, so the unfiltered join result is no
# longer reachable under that name.
df_event_weapons = filter_events_by_year(df_event_weapons)

# Preview the filtered rows, then report how many rows remain.
df_event_weapons.show()
df_event_weapons.count()

+------------+-----+------+----+--------+----------+--------------------+--------------------+--------------------+------------------+---------+---------+--------------------+--------+------+---------+------+---------+-------+--------------------+-------------+--------------------+-------------+--------------------+-------------+----------------+-------------+----------------+--------------------+
|     eventid|iyear|imonth|iday|extended|resolution|         country_txt|          region_txt|           provstate|              city| latitude|longitude|               gname|gsubname|gname2|gsubname2|gname3|gsubname3|success|     attacktype1_txt|weaptype1_txt|    weapsubtype1_txt|weaptype2_txt|    weapsubtype2_txt|weaptype3_txt|weapsubtype3_txt|weaptype4_txt|weapsubtype4_txt|          weapdetail|
+------------+-----+------+----+--------+----------+--------------------+--------------------+--------------------+------------------+---------+---------+--------------------+--------+------+-------

39452

In [7]:
def remove_unnecessary_columns(df_event_weapons):
    """
    Drop the date, location-detail, flag and free-text columns that later cells do not use.

    Args:
    df_event_weapons (DataFrame): Input PySpark DataFrame.

    Returns:
    DataFrame: A new DataFrame without the listed columns; all other columns are kept.
    """
    date_parts = ("iday", "iyear", "imonth")
    event_flags = ("extended", "resolution", "success")
    location_detail = ("latitude", "longitude", "region_txt", "provstate", "city")
    descriptions = ("attacktype1_txt", "weapdetail")

    return df_event_weapons.drop(*date_parts, *event_flags, *location_detail, *descriptions)

# Drop the columns that later cells do not use.
# NOTE: this rebinds df_event_weapons to a frame without 'iyear'; the year-filter
# cell reads df_event_weapons["iyear"], so it cannot be re-run after this cell
# until the join cell has been executed again.
df_event_weapons = remove_unnecessary_columns(df_event_weapons)

In [10]:
from pyspark.sql import functions as F

def transform_gnames(df):
    """
    Unpivot the three group slots (gname/gsubname, gname2/gsubname2, gname3/gsubname3)
    into rows: each input row yields one candidate row per slot, carrying that slot's
    values in 'groupname' and 'groupnamesubgroup'. The six original slot columns are removed.

    Candidate rows whose groupname is null or equal to "Unknown" are discarded.

    Args:
    df (DataFrame): Input DataFrame with gname, gname2, gname3, and their subgroup columns.

    Returns:
    DataFrame: One row per (input row, slot) that names a known group.
    """
    slots = [("gname", "gsubname"), ("gname2", "gsubname2"), ("gname3", "gsubname3")]
    slot_columns = [name for pair in slots for name in pair]

    stacked = None
    for group_col, subgroup_col in slots:
        slot_rows = (
            df.withColumn("groupname", F.col(group_col))
            .withColumn("groupnamesubgroup", F.col(subgroup_col))
            .drop(*slot_columns)
        )
        # union() matches columns by position; every slot frame has the same layout.
        stacked = slot_rows if stacked is None else stacked.union(slot_rows)

    names_known_group = F.col("groupname").isNotNull() & (F.col("groupname") != "Unknown")
    return stacked.filter(names_known_group)

# Expand the three group slots into one row per (event, group), then list the
# resulting columns to confirm the original gname*/gsubname* columns are gone.
df_event_vertical = transform_gnames(df_event_weapons)
print(df_event_vertical.columns)

['eventid', 'country_txt', 'weaptype1_txt', 'weapsubtype1_txt', 'weaptype2_txt', 'weapsubtype2_txt', 'weaptype3_txt', 'weapsubtype3_txt', 'weaptype4_txt', 'weapsubtype4_txt', 'groupname', 'groupnamesubgroup']


In [11]:
def transform_weapon_types(df: DataFrame) -> DataFrame:
    """
    Unpivot the four weapon slots (weaptypeN_txt / weapsubtypeN_txt for N = 1..4) into
    rows: each input row yields one candidate row per slot, carrying that slot's values
    in 'weapon_type' and 'weapon_sub_type'. The eight original slot columns are removed.

    Candidate rows whose weapon_type is null or equal to "Unknown" are discarded.

    Args:
    df (DataFrame): Input DataFrame with weapon type and subtype columns.

    Returns:
    DataFrame: One row per (input row, slot) that names a known weapon type.
    """
    slots = [(f"weaptype{n}_txt", f"weapsubtype{n}_txt") for n in range(1, 5)]
    slot_columns = [name for pair in slots for name in pair]

    stacked = None
    for type_col, sub_type_col in slots:
        slot_rows = (
            df.withColumn("weapon_type", F.col(type_col))
            .withColumn("weapon_sub_type", F.col(sub_type_col))
            .drop(*slot_columns)
        )
        # union() matches columns by position; every slot frame has the same layout.
        stacked = slot_rows if stacked is None else stacked.union(slot_rows)

    names_known_weapon = F.col("weapon_type").isNotNull() & (F.col("weapon_type") != "Unknown")
    return stacked.filter(names_known_weapon)
# Expand the four weapon slots into one row per (event, group, weapon) and
# report how many rows the fully unpivoted frame contains.
df_weapon_event_vertical = transform_weapon_types(df_event_vertical)
df_weapon_event_vertical.count()

24116

In [12]:
# Order rows by eventid, highest first. eventid is read as a string column, so
# this is a string (lexicographic) sort.
sorted_df = df_weapon_event_vertical.orderBy(F.desc("eventid"))

# Show the first rows of the sorted DataFrame
sorted_df.show()

+------------+-----------+--------------------+-----------------+-----------+--------------------+
|     eventid|country_txt|           groupname|groupnamesubgroup|weapon_type|     weapon_sub_type|
+------------+-----------+--------------------+-----------------+-----------+--------------------+
|201712310030|Philippines|Bangsamoro Islami...|             NULL| Incendiary|          Arson/Fire|
|201712310029|      Syria|   Muslim extremists|             NULL| Explosives|Projectile (rocke...|
|201712310022|    Somalia|          Al-Shabaab|             NULL|   Firearms|    Unknown Gun Type|
|201712310019|      India|Zeliangrong Unite...|             NULL|   Firearms|Automatic or Semi...|
|201712310018|Afghanistan|             Taliban|             NULL|   Firearms|    Unknown Gun Type|
|201712310016|Philippines|Bangsamoro Islami...|             NULL| Explosives|Unknown Explosive...|
|201712310013|    Somalia|          Al-Shabaab|             NULL| Explosives|Projectile (rocke...|
|201712310

In [13]:
from pyspark.sql import functions as F

def one_hot_encode_with_suffix(df, column, suffix=None, id_column="eventid"):
    """
    Pivots the distinct values of `column` into one column per value, with one row per
    `id_column`, and optionally appends a suffix to the generated column names.

    Each cell holds the number of input rows that have that (id, value) combination,
    with 0 where the combination does not occur. The result is therefore a strict 0/1
    indicator only when the input has at most one row per (id, value) pair.

    Args:
    df (DataFrame): The input PySpark DataFrame.
    column (str): The name of the column whose values become new columns.
    suffix (str): Optional suffix to append to every generated column name.
    id_column (str): Column identifying the entity to encode per; defaults to "eventid".

    Returns:
    DataFrame: One row per `id_column` value with one count column per distinct value
    of `column`, renamed with the suffix if provided. `id_column` is never renamed.
    """
    # NOTE(review): null values in `column` presumably surface as a column named
    # "null" (a later cell drops 'null_sub') - confirm against the pivot output.
    encoded_df = (
        df.groupBy(id_column)
        .pivot(column)
        .count()
        .fillna(0)  # id/value combinations that never occur get 0 instead of null
    )

    if suffix:
        # Rename every generated column in a single pass; a per-column
        # withColumnRenamed loop could re-rename a column that an earlier
        # iteration had just produced.
        renamed_columns = [
            name if name == id_column else f"{name}{suffix}"
            for name in encoded_df.columns
        ]
        encoded_df = encoded_df.toDF(*renamed_columns)

    return encoded_df

# Per-event weapon_type columns (no suffix, so the columns carry the raw type names)
df_one_hot_weapon_type = one_hot_encode_with_suffix(df_weapon_event_vertical, "weapon_type")

# Per-event weapon_sub_type columns; the "_sub" suffix tells them apart from the type columns
df_one_hot_weapon_sub_type = one_hot_encode_with_suffix(df_weapon_event_vertical, "weapon_sub_type", "_sub")

# Attach both event-level encodings to every row of the unpivoted frame.
# The join key is eventid only, so all rows of the same event receive the same values.
df_final = (
    df_weapon_event_vertical
    .join(df_one_hot_weapon_type, on="eventid", how="left")
    .join(df_one_hot_weapon_sub_type, on="eventid", how="left")
)

# Show the final DataFrame
df_final.show()

+------------+------------------+--------------------+-----------------+-----------+--------------------+--------+----------+------------+--------+----------+-----+-----+------------------+---------------------------------------------------------------------------+--------+--------------+-------------------------------------+----------------+----------------+-------------+-----------------------+-----------+-----------+----------------------+-------------------------------+------------+---------------+--------------------------------+------------------------+------------------+-------------+-------------+--------------------+---------------------------------------------+------------------+---------------------------------+-----------------------------------+---------------+---------------+-------------------------------------------+-------------+--------------------------+--------------------+-----------------------+-----------+
|     eventid|       country_txt|           groupname|gro

In [14]:
def sanitize_column_names(df):
    """
    Simplifies column names and drops the columns that are not needed for aggregation.

    In every column name the characters "(", ")", "," and "." are removed and each
    space becomes an underscore. Other characters (for example "/" and "-") are left
    as they are. Afterwards the columns eventid, weapon_type, weapon_sub_type, success
    and null_sub are dropped where present.

    Args:
    df (DataFrame): Input PySpark DataFrame.

    Returns:
    DataFrame: DataFrame with sanitized column names and the listed columns removed.
    """
    # One translation table: space -> underscore, and "(", ")", ",", "." deleted.
    cleanup = str.maketrans(" ", "_", "(),.")

    for original_name in df.columns:
        df = df.withColumnRenamed(original_name, original_name.translate(cleanup))

    return df.drop("eventid", 'weapon_type', 'weapon_sub_type', 'success', 'null_sub')

# Simplify the column names of the joined frame and drop the columns that
# must not take part in the per-group aggregation (eventid, raw weapon columns, ...).
df_sanitized = sanitize_column_names(df_final)



def collapse_by_groupname(df):
    """
    Collapses the DataFrame to one row per (`groupname`, `groupnamesubgroup`) pair.

    For each pair, the distinct `country_txt` values are collected into `countries`
    and every weapon and weapon-subtype column is summed. A column counts as a weapon
    column when its name neither starts with "country_txt" or "groupname" nor ends
    with "_sub"; columns ending with "_sub" are the weapon-subtype columns.
    Both column lists are printed for inspection.

    Args:
    df (DataFrame): Input PySpark DataFrame.

    Returns:
    DataFrame: Aggregated DataFrame grouped by `groupname` and `groupnamesubgroup`.
    """
    # "groupname" as a prefix also covers "groupnamesubgroup".
    non_weapon_prefixes = ("country_txt", "groupname")

    weapon_sub_columns = [name for name in df.columns if name.endswith("_sub")]
    weapon_columns = [
        name
        for name in df.columns
        if not (name.startswith(non_weapon_prefixes) or name.endswith("_sub"))
    ]
    print(weapon_columns)
    print(weapon_sub_columns)

    # Weapon totals first, then sub-type totals, each keeping its column name.
    totals = [F.sum(name).alias(name) for name in weapon_columns + weapon_sub_columns]

    return df.groupBy("groupname", "groupnamesubgroup").agg(
        F.collect_set(F.col("country_txt")).alias("countries"),
        *totals,
    )

# Aggregate the sanitized frame to one row per (groupname, groupnamesubgroup)
collapsed_df = collapse_by_groupname(df_sanitized)

# Show the resulting DataFrame without truncating long cell values
collapsed_df.show(truncate=False)

['Chemical', 'Explosives', 'Fake_Weapons', 'Firearms', 'Incendiary', 'Melee', 'Other', 'Sabotage_Equipment', 'Vehicle_not_to_include_vehicle-borne_explosives_ie_car_or_truck_bombs']
['Arson/Fire_sub', 'Automatic_or_Semi-Automatic_Rifle_sub', 'Blunt_Object_sub', 'Dynamite/TNT_sub', 'Explosive_sub', 'Gasoline_or_Alcohol_sub', 'Grenade_sub', 'Handgun_sub', 'Hands_Feet_Fists_sub', 'Knife_or_Other_Sharp_Object_sub', 'Landmine_sub', 'Letter_Bomb_sub', 'Molotov_Cocktail/Petrol_Bomb_sub', 'Other_Explosive_Type_sub', 'Other_Gun_Type_sub', 'Pipe_Bomb_sub', 'Poisoning_sub', 'Pressure_Trigger_sub', 'Projectile_rockets_mortars_RPGs_etc_sub', 'Remote_Trigger_sub', 'Rifle/Shotgun_non-automatic_sub', 'Rope_or_Other_Strangling_Device_sub', 'Sticky_Bomb_sub', 'Suffocation_sub', 'Suicide_carried_bodily_by_human_being_sub', 'Time_Fuse_sub', 'Unknown_Explosive_Type_sub', 'Unknown_Gun_Type_sub', 'Unknown_Weapon_Type_sub', 'Vehicle_sub']
+----------------------------------------------------------+-----------

In [22]:
# List the aggregated column names; the rename cell that builds collapsed_cleaned
# refers to them literally.
print(collapsed_df.columns)


['groupname', 'groupnamesubgroup', 'countries', 'Chemical', 'Explosives', 'Fake_Weapons', 'Firearms', 'Incendiary', 'Melee', 'Other', 'Sabotage_Equipment', 'Vehicle_not_to_include_vehicle-borne_explosives_ie_car_or_truck_bombs', 'Arson/Fire_sub', 'Automatic_or_Semi-Automatic_Rifle_sub', 'Blunt_Object_sub', 'Dynamite/TNT_sub', 'Explosive_sub', 'Gasoline_or_Alcohol_sub', 'Grenade_sub', 'Handgun_sub', 'Hands_Feet_Fists_sub', 'Knife_or_Other_Sharp_Object_sub', 'Landmine_sub', 'Letter_Bomb_sub', 'Molotov_Cocktail/Petrol_Bomb_sub', 'Other_Explosive_Type_sub', 'Other_Gun_Type_sub', 'Pipe_Bomb_sub', 'Poisoning_sub', 'Pressure_Trigger_sub', 'Projectile_rockets_mortars_RPGs_etc_sub', 'Remote_Trigger_sub', 'Rifle/Shotgun_non-automatic_sub', 'Rope_or_Other_Strangling_Device_sub', 'Sticky_Bomb_sub', 'Suffocation_sub', 'Suicide_carried_bodily_by_human_being_sub', 'Time_Fuse_sub', 'Unknown_Explosive_Type_sub', 'Unknown_Gun_Type_sub', 'Unknown_Weapon_Type_sub', 'Vehicle_sub']


In [28]:
# Rename every aggregated column to a lower-case snake_case name for the output table.
# Each entry is a SQL expression "<source> as <target>". Source names that still
# contain "/" or "-" must be quoted with backticks, otherwise they would be parsed
# as arithmetic; the backticks on the remaining names are optional.
collapsed_cleaned = collapsed_df.selectExpr(
    "groupname as group_name",
    "groupnamesubgroup as subgroup_name",
    "countries as country",
    "Chemical as chemical",
    "Explosives as explosives",
    "Fake_Weapons as fake_weapons",
    "Firearms as firearms",
    "Incendiary as incendiary",
    "Melee as melee",
    "Other as other",
    "Sabotage_Equipment as sabotage_equipment",
    "`Vehicle_not_to_include_vehicle-borne_explosives_ie_car_or_truck_bombs` as vehicle_not_explosive",
    "`Arson/Fire_sub` as arson_fire",
    "`Automatic_or_Semi-Automatic_Rifle_sub` as automatic_rifle",
    "Blunt_Object_sub as blunt_object",
    "`Dynamite/TNT_sub` as dynamite_tnt",
    "Explosive_sub as explosive",
    "Gasoline_or_Alcohol_sub as gasoline_or_alcohol",
    "Grenade_sub as grenade",
    "Handgun_sub as handgun",
    "Hands_Feet_Fists_sub as hands_feet_fists",
    "`Knife_or_Other_Sharp_Object_sub` as knife_or_sharp_object",
    "Landmine_sub as landmine",
    "Letter_Bomb_sub as letter_bomb",
    "`Molotov_Cocktail/Petrol_Bomb_sub` as molotov_cocktail",
    "`Other_Explosive_Type_sub` as other_explosive_type",
    "`Other_Gun_Type_sub` as other_gun_type",
    "Pipe_Bomb_sub as pipe_bomb",
    "Poisoning_sub as poisoning",
    "Pressure_Trigger_sub as pressure_trigger",
    "`Projectile_rockets_mortars_RPGs_etc_sub` as projectile_rockets",
    "Remote_Trigger_sub as remote_trigger",
    "`Rifle/Shotgun_non-automatic_sub` as rifle_shotgun",
    "`Rope_or_Other_Strangling_Device_sub` as strangling_device",
    "Sticky_Bomb_sub as sticky_bomb",
    "Suffocation_sub as suffocation",
    "`Suicide_carried_bodily_by_human_being_sub` as suicide_human_carried",
    "Time_Fuse_sub as time_fuse",
    "`Unknown_Explosive_Type_sub` as unknown_explosive",
    "`Unknown_Gun_Type_sub` as unknown_gun",
    "`Unknown_Weapon_Type_sub` as unknown_weapon",
    "Vehicle_sub as vehicle"
)


In [15]:
def top_five_subgroups_for_chemical_attacks(df):
    """
    Finds the five (subgroup, group) pairs with the most rows whose weapon_type is "Chemical".

    The count is a row count of the filtered input, so it equals the number of attacks
    only if the input holds one row per attack for each pair. Rows with a null
    `groupnamesubgroup` are grouped together per `groupname` and take part in the ranking.

    Args:
    df (DataFrame): Input PySpark DataFrame with `weapon_type`, `groupname` and
        `groupnamesubgroup` columns.

    Returns:
    DataFrame: At most five rows with columns `groupnamesubgroup`, `groupname` and
    `chemical_attack_count`, ordered by count descending.
    """
    # Filter for chemical attacks
    chemical_attacks_df = df.filter(F.col("weapon_type") == "Chemical")

    # Count the matching rows per (subgroup, group) pair
    subgroup_counts = (
        chemical_attacks_df.groupBy("groupnamesubgroup", "groupname")
        .agg(F.count("*").alias("chemical_attack_count"))
    )

    # Highest counts first. The name columns break ties so that the rows kept by
    # limit(5) are the same on every run when several pairs share a count at the cut-off.
    top_five = subgroup_counts.orderBy(
        F.desc("chemical_attack_count"),
        F.asc("groupname"),
        F.asc("groupnamesubgroup"),
    ).limit(5)

    return top_five

# Rank on df_final because it still carries the raw weapon_type column
# (the sanitized frame has it dropped).
top_five_chemical_subgroups = top_five_subgroups_for_chemical_attacks(df_final)

# Show the results
top_five_chemical_subgroups.show()

+--------------------+--------------------+---------------------+
|   groupnamesubgroup|           groupname|chemical_attack_count|
+--------------------+--------------------+---------------------+
|                NULL|Islamic State of ...|                   36|
|                NULL|             Taliban|                   17|
|Nineveh Province ...|Islamic State of ...|                    3|
|                NULL|  Israeli extremists|                    2|
|                NULL|Nur-al-Din al-Zin...|                    2|
+--------------------+--------------------+---------------------+



How many distinct products have been sold in each store?


In [16]:
# Peek at the raw event file, one unparsed text line per row, to inspect the
# header line and the first records.
spark.read.text("gs://data_de2024_ga2/event_informations.csv").show()

+--------------------+
|               value|
+--------------------+
|eventid,iyear,imo...|
|197000000001,1970...|
|197000000002,1970...|
|197001000001,1970...|
|197001000002,1970...|
|197001000003,1970...|
|197001010002,1970...|
|197001020001,1970...|
|197001020002,1970...|
|197001020003,1970...|
|197001030001,1970...|
|197001050001,1970...|
|197001060001,1970...|
|197001080001,1970...|
|197001090001,1970...|
|197001090002,1970...|
|197001100001,1970...|
|197001110001,1970...|
|197001120001,1970...|
|197001120002,1970...|
+--------------------+
only showing top 20 rows



In [32]:
# Use the Cloud Storage bucket for temporary BigQuery export data used by the connector.
bucket = "temp_de2024_ga2"  # use your bucket 
spark.conf.set('temporaryGcsBucket', bucket)
# Setup hadoop fs configuration for schema gs://
# NOTE: these are the same two settings applied in the session-setup cell; they
# are repeated here so that this cell does not rely on that configuration step.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
# Saving the data to BigQuery; "overwrite" mode replaces any existing contents of the target table.
collapsed_cleaned.write.format('bigquery') \
  .option('table', 'de2024-435209.DE_Groupassignment_2.weapon_availability') \
  .mode("overwrite") \
  .save()

In [None]:
# Stop the Spark session and its underlying context; every DataFrame defined
# above is unusable afterwards until a new session is created.
spark.stop()