In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, to_date, date_format, sum, count


# Spark Session: create (or reuse) the session registered under this app name.
spark = (
    SparkSession.builder
    .appName("GTD_ACLED_Analysis")
    .getOrCreate()
)

# GTD Dataset: read the CSV with a header row and let Spark infer column types.
gtd = "globalterrorismdb_0718dist.csv"
gtd_df = spark.read.options(header=True, inferSchema=True).csv(gtd)

# Project scope: three GTD regions, incident years 2015 through 2017 (inclusive).
# NOTE(review): the first region label also names North Africa, so this selection
# is broader than "Asia" alone - confirm that is intended.
region_in_scope = col("region_txt").isin(
    ["Middle East & North Africa", "South Asia", "Southeast Asia"]
)
year_in_scope = col("iyear").between(2015, 2017)
gtd_filtered = gtd_df.where(region_in_scope & year_in_scope)

# Derive a single event_date column: join year, month and day with "-" and let
# to_date (called without an explicit pattern) convert that text.
# NOTE(review): rows whose components do not form a valid calendar date
# presumably come out with a null event_date - verify against the data.
year_month_day = concat_ws("-", col("iyear"), col("imonth"), col("iday"))
gtd_cleaned = gtd_filtered.withColumn("event_date", to_date(year_month_day))

# Preview the first rows of the result.
gtd_cleaned.show(5)


+------------+-----+------+----+-------------------+--------+----------+-------+-----------+------+--------------------+---------+--------+---------+---------+-----------+--------+--------------------+--------------------+-----+-----+-----+---------+-----------+---------------+--------+-------+-------+-----------+-----------------+-----------+---------------+-----------+---------------+---------+--------------------+------------+--------------------+--------------------+-------------+-------+-----------+---------+-------------+------------+----------------+-----+-------+-------+-----------+---------+-------------+------------+----------------+-----+-------+-------+-----------+--------------------+--------+------+---------+------+---------+------+-----------+-----------+-----------+----------+------+--------+-------+---------+-------------+------+----------+--------------+------+----------+--------------+---------+---------+-------------+------------+--------------------+---------+-----

In [None]:
# ACLED Dataset: read the CSV with a header row and let Spark infer column types.
acled = "asia_conflicts.csv"
acled_df = spark.read.csv(acled, header=True, inferSchema=True)

# Parse event_date from its "d MMMM yyyy" text form and keep the result of
# to_date as the column value. It is deliberately NOT formatted back to a
# string: gtd_cleaned.event_date is also produced directly by to_date, so both
# datasets now carry the same column type, and the range filter below compares
# dates instead of comparing text. Displayed values still read yyyy-MM-dd.
acled_df = acled_df.withColumn(
    "event_date",
    to_date(col("event_date"), "d MMMM yyyy"),
)

# Project scope: three ACLED regions, events dated 2015-01-01 through
# 2017-12-31 (between() is inclusive at both ends).
# Rows whose event_date is null fail the between() predicate and are dropped.
acled_filtered = acled_df.filter(
    col("region").isin(["Middle East", "South Asia", "Southeast Asia"])
    & col("event_date").between("2015-01-01", "2017-12-31")
)

# Preview the first rows of the result.
acled_filtered.show(5)

+-------+---+-------------+----------------+----------+----+--------------+---------------+--------------------+-------------+------+--------------------+--------------------+------+-----------+-----------+---------+----------+---------+-------+--------------------+--------+---------+-------------+--------------------+--------------------+--------------------+----------+----------+----+
|data_id|iso|event_id_cnty|event_id_no_cnty|event_date|year|time_precision|     event_type|              actor1|assoc_actor_1|inter1|              actor2|       assoc_actor_2|inter2|interaction|     region|  country|    admin1|   admin2| admin3|            location|latitude|longitude|geo_precision|              source|        source_scale|               notes|fatalities| timestamp|iso3|
+-------+---+-------------+----------------+----------+----+--------------+---------------+--------------------+-------------+------+--------------------+--------------------+------+-----------+-----------+---------+----