In [0]:
import pyspark
from synapse.ml.explainers import*
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql.types import*
from pyspark.sql.functions import*
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import Imputer
from pyspark.mllib.linalg import Vectors
from pyspark.ml.linalg import VectorUDT

# Bootstrap Spark Session
spark = SparkSession.builder.getOrCreate()

from synapse.ml.core.platform import*

from synapse.ml.core.platform import materializing_display as display

#vec_access = udf(lambda f, i: float(f[i]), FloatType())
vec_access = udf(lambda f, i: float(f[i]), FloatType())
vec2array = udf(lambda vec: vec.toArray().tolist(), ArrayType(FloatType()))



In [0]:
df1 = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("dbfs:/FileStore/shared_uploads/info@krolconsultancy.com/GTD.csv")

In [0]:
df1.printSchema()

root
 |-- iyear: integer (nullable = true)
 |-- imonth: integer (nullable = true)
 |-- iday: integer (nullable = true)
 |-- extended: integer (nullable = true)
 |-- country: integer (nullable = true)
 |-- region: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- vicinity: integer (nullable = true)
 |-- crit1: integer (nullable = true)
 |-- crit2: integer (nullable = true)
 |-- crit3: integer (nullable = true)
 |-- doubtterr: integer (nullable = true)
 |-- attacktype1: integer (nullable = true)
 |-- targtype1: integer (nullable = true)
 |-- natlty1: integer (nullable = true)
 |-- weaptype1: integer (nullable = true)
 |-- property: integer (nullable = true)
 |-- INT_ANY: integer (nullable = true)
 |-- multiple: integer (nullable = true)
 |-- gname: string (nullable = true)



In [0]:
#Check for missing values
for col in df1.columns:
  print(col, "\t","with null values:",
    df1.filter(df1[col].isNull()).count())

iyear 	 with null values: 0
imonth 	 with null values: 0
iday 	 with null values: 0
extended 	 with null values: 0
country 	 with null values: 0
region 	 with null values: 0
latitude 	 with null values: 1941
longitude 	 with null values: 1941
vicinity 	 with null values: 0
crit1 	 with null values: 0
crit2 	 with null values: 0
crit3 	 with null values: 0
doubtterr 	 with null values: 859
attacktype1 	 with null values: 0
targtype1 	 with null values: 0
natlty1 	 with null values: 550
weaptype1 	 with null values: 0
property 	 with null values: 9752
INT_ANY 	 with null values: 4505
multiple 	 with null values: 0
gname 	 with null values: 0


In [0]:
imputer=Imputer(inputCols=['latitude','longitude','doubtterr','natlty1','property','INT_ANY'],
                outputCols=["{}_imputed".format(a)for a in ['latitude','longitude','doubtterr','natlty1','property','INT_ANY']]).setStrategy("mode")

In [0]:
df2 =imputer.fit(df1).transform(df1)
df3=df2.drop('latitude','longitude','doubtterr','natlty1','property','INT_ANY')
df3.show(5)

+-----+------+----+--------+-------+------+--------+-----+-----+-----+-----------+---------+---------+--------+--------------------+----------------+-----------------+-----------------+---------------+----------------+---------------+
|iyear|imonth|iday|extended|country|region|vicinity|crit1|crit2|crit3|attacktype1|targtype1|weaptype1|multiple|               gname|latitude_imputed|longitude_imputed|doubtterr_imputed|natlty1_imputed|property_imputed|INT_ANY_imputed|
+-----+------+----+--------+-------+------+--------+-----+-----+-----+-----------+---------+---------+--------+--------------------+----------------+-----------------+-----------------+---------------+----------------+---------------+
| 1970|     1|  31|       0|    160|     5|       0|    1|    1|    0|          9|        4|       13|       0|New People's Army...|       15.675051|       120.331618|                1|            217|               0|              1|
| 1970|     4|   2|       0|    160|     5|       1|    1|  

In [0]:
# Split Data into Train / Test Sets
train_data, test_data = df3.randomSplit([.7, .3],seed=5)
print("Train Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

Train Dataset Count: 49206
Test Dataset Count: 20973


In [0]:
labelIndexer = StringIndexer(
    inputCol="gname", outputCol="label", stringOrderType="alphabetAsc"
).fit(train_data)
print("Label index assigment: " + str(set(zip(labelIndexer.labels, [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35]))))
Train = labelIndexer.transform(train_data).cache()
display(Train)

#categorical_features = [
 # '',
#]
#categorical_features_idx = [col + "_idx" for col in categorical_features]
#categorical_features_enc = [col + "_enc" for col in categorical_features]
numeric_features = [
  'iyear', 
  'imonth', 
  'iday', 
  'extended', 
  'country', 
  'region',
  'latitude_imputed',
  'longitude_imputed', 
  'vicinity', 
  'crit1', 
  'crit2',
  'crit3',
  'doubtterr_imputed',
  'attacktype1',
  'targtype1',
  'natlty1_imputed',
  'weaptype1',
  'property_imputed',
  'INT_ANY_imputed',
  'multiple',
]
#strIndexer = StringIndexer(
    #inputCols=categorical_features, outputCols=categorical_features_idx
#)
#onehotEnc = OneHotEncoder(
   # inputCols=categorical_features_idx, outputCols=categorical_features_enc
#)
vectAssem = VectorAssembler(
    inputCols= numeric_features, outputCol="features"
)

Label index assigment: {('Palestinian Extremists', 26), ('Abu Sayyaf Group (ASG)', 0), ('Maoists', 21), ("New People's Army (NPA)", 24), ('Farabundo Marti National Liberation Front (FMLN)', 11), ('African National Congress (South Africa)', 1), ("Kurdistan Workers' Party (PKK)", 17), ('Nicaraguan Democratic Force (FDN)', 25), ('Revolutionary Armed Forces of Colombia (FARC)', 28), ('Corsican National Liberation Front (FLNC)', 9), ('Al-Shabaab', 4), ('Separatists', 29), ('M-19 (Movement of April 19)', 19), ('Manuel Rodriguez Patriotic Front (FPMR)', 20), ("Donetsk People's Republic", 10), ('Boko Haram', 7), ('Khorasan Chapter of the Islamic State', 16), ('Fulani extremists', 12), ('Communist Party of India - Maoist (CPI-Maoist)', 8), ('Houthi extremists (Ansar Allah)', 13), ('Al-Qaida in the Arabian Peninsula (AQAP)', 3), ('Bangsamoro Islamic Freedom Movement (BIFM)', 5), ('Muslim extremists', 22), ('Islamic State of Iraq and the Levant (ISIL)', 15), ('Al-Qaida in Iraq', 2), ('Liberation 

In [0]:
layers = [20,35,25,15,36]
mlp = MultilayerPerceptronClassifier(maxIter=500, layers=layers, blockSize=128, seed=1234)

In [0]:
pipeline = Pipeline(stages=[vectAssem, mlp])

In [0]:
model_x = pipeline.fit(Train)

In [0]:
explain_instancess = (
    model_x.transform(Train).orderBy(rand()).limit(5).repartition(200).cache()
)
display(explain_instancess)

DataFrame[iyear: int, imonth: int, iday: int, extended: int, country: int, region: int, vicinity: int, crit1: int, crit2: int, crit3: int, attacktype1: int, targtype1: int, weaptype1: int, multiple: int, gname: string, latitude_imputed: double, longitude_imputed: double, doubtterr_imputed: int, natlty1_imputed: int, property_imputed: int, INT_ANY_imputed: int, label: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]


In [0]:
shap1 = TabularSHAP(
    inputCols= numeric_features,
    outputCol="shapValues",
    numSamples=5000,
    model=model_x,
    targetCol="probability",
    targetClasses=[1],
    backgroundData=broadcast(Train.orderBy(rand()).limit(100).cache()),
)

shap_df3= shap1.transform(explain_instancess)

In [0]:
print(shap_df3.schema)

StructType([StructField('iyear', IntegerType(), True), StructField('imonth', IntegerType(), True), StructField('iday', IntegerType(), True), StructField('extended', IntegerType(), True), StructField('country', IntegerType(), True), StructField('region', IntegerType(), True), StructField('vicinity', IntegerType(), True), StructField('crit1', IntegerType(), True), StructField('crit2', IntegerType(), True), StructField('crit3', IntegerType(), True), StructField('attacktype1', IntegerType(), True), StructField('targtype1', IntegerType(), True), StructField('weaptype1', IntegerType(), True), StructField('multiple', IntegerType(), True), StructField('gname', StringType(), True), StructField('latitude_imputed', DoubleType(), True), StructField('longitude_imputed', DoubleType(), True), StructField('doubtterr_imputed', IntegerType(), True), StructField('natlty1_imputed', IntegerType(), True), StructField('property_imputed', IntegerType(), True), StructField('INT_ANY_imputed', IntegerType(), Tru

In [0]:
shapS = (
    shap_df3.withColumn("probability", vec_access(col("probability"), lit(1)))
    .withColumn("shapValues", vec2array(col("shapValues").getItem(0)))
    .select(
        ["shapValues", "probability", "label"] + categorical_features + numeric_features
    )
)

shapS_local = shaps.toPandas()
shapS_local.sort_values("probability", ascending=False, inplace=True, ignore_index=True)
pd.set_option("display.max_colwidth", None)
shaps_local

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-2223917932334427>[0m in [0;36m<cell line: 2>[0;34m()[0m
[1;32m      1[0m shapS = (
[0;32m----> 2[0;31m     [0mshap_df3[0m[0;34m.[0m[0mwithColumn[0m[0;34m([0m[0;34m"probability"[0m[0;34m,[0m [0mvec_access[0m[0;34m([0m[0mcol[0m[0;34m([0m[0;34m"probability"[0m[0;34m)[0m[0;34m,[0m [0mlit[0m[0;34m([0m[0;36m1[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      3[0m     [0;34m.[0m[0mwithColumn[0m[0;34m([0m[0;34m"shapValues"[0m[0;34m,[0m [0mvec2array[0m[0;34m([0m[0mcol[0m[0;34m([0m[0;34m"shapValues"[0m[0;34m)[0m[0;34m.[0m[0mgetItem[0m[0;34m([0m[0;36m0[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      4[0m     .select(
[1;32m      5[0m         [0;34m[[0m[0;34m"shapValues"[0m