# Download and Unzip Function

Install `lemma-dev-utils`, whose `download_unzip` helper reports the download and unzipping progress, together with PySpark.

In [1]:
!pip install lemma-dev-utils
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lemma-dev-utils
  Downloading lemma_dev_utils-0.0.2.tar.gz (3.0 kB)
Building wheels for collected packages: lemma-dev-utils
  Building wheel for lemma-dev-utils (setup.py) ... [?25l[?25hdone
  Created wheel for lemma-dev-utils: filename=lemma_dev_utils-0.0.2-py3-none-any.whl size=3524 sha256=2d636e775d9ce1812c9ab72744207d049453ad0b2432d18f092b17f54a68b27c
  Stored in directory: /root/.cache/pip/wheels/be/6a/ac/93a152a4146982dfdfc411e32037c303ada53bf5ab93f8939f
Successfully built lemma-dev-utils
Installing collected packages: lemma-dev-utils
Successfully installed lemma-dev-utils-0.0.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 41 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-non

# Downloading the Dataset

In [101]:
# Read the dataset URL from dataset_url.txt.
# strip() removes the trailing newline/whitespace that text files usually end
# with, which would otherwise become part of the URL.
with open('dataset_url.txt', 'r') as f:
    url = f.read().strip()

In [2]:
from lemma_dev_utils import download_unzip
import os

# Target directory handed to download_unzip; the notebook switches into it below.
path = 'dataset'

# `url` must already be defined by the cell that reads dataset_url.txt.
# NOTE(review): download_unzip presumably downloads the archive at `url` and
# extracts it into `path` -- confirm against lemma_dev_utils.
download_unzip(path, url)
# Make the dataset directory the working directory so later cells can open the
# CSV files by bare file name.
os.chdir(path)

Downloading amex-default-prediction.zip: 100%|##########| 21981528023/21981528023 [02:36<00:00, 140731347.76it/s]
Unzipping sample_submission.csv: 100%|##########| 61956097/61956097 [00:01<00:00, 60360417.92it/s]
Unzipping test_data.csv: 100%|##########| 33824849921/33824849921 [08:24<00:00, 66994585.30it/s]
Unzipping train_data.csv: 100%|##########| 16393289729/16393289729 [03:53<00:00, 70278995.48it/s]
Unzipping train_labels.csv: 100%|##########| 30752769/30752769 [00:00<00:00, 62216271.03it/s]


# Setting up Spark

In [1]:
import os

# Enter the dataset directory only when we are not already inside it. The
# download cell also calls os.chdir('dataset'), so an unconditional chdir here
# raises FileNotFoundError when both cells run in the same session.
if os.path.basename(os.getcwd()) != 'dataset':
    os.chdir('dataset')

In [10]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running locally on all available cores,
# with the Spark UI bound to port 4050.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# SparkContext handle used for the RDD API in the cells below.
sc = spark.sparkContext

# Loading the Training Set

In [3]:
#raw_data = sc.textFile("train_data.csv")
#raw_labels = sc.textFile("train_labels.csv")

In [8]:
import pandas as pd
import random

# Draw a reproducible random subset of train_data.csv by telling pandas which
# file lines to skip. Line 0 (the header) is never a skip candidate.
SAMPLE_SEED = 42                  # fixed seed: every run keeps the same rows
SKIP_CANDIDATES_STOP = 5531450    # skip candidates are line numbers 1 .. 5531449
N_LINES_TO_SKIP = SKIP_CANDIDATES_STOP - 10001

rng = random.Random(SAMPLE_SEED)
skip = sorted(rng.sample(range(1, SKIP_CANDIDATES_STOP), N_LINES_TO_SKIP))
df = pd.read_csv('train_data.csv', skiprows=skip)

In [11]:
# Persist the sampled rows (without the pandas index column); a later cell reads
# this file back as text with Spark.
df.to_csv('sample_train_data.csv', index=False)

In [12]:
del df

# Preprocessing



In [11]:
# Load both CSV files as RDDs of raw text lines (one string per line; the header
# line is still present and is separated out later).
raw_data = sc.textFile("sample_train_data.csv")
raw_labels = sc.textFile("train_labels.csv")

## Managing Numeric and NaN values

In [12]:
import math
import re

# Optional sign, then digits with an optional decimal point (or a leading-dot
# fraction), then an optional exponent: plain decimal or scientific notation.
_NUMERIC_PATTERN = re.compile(r'[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?')


def isnumeric_val(val):
    """Return True if ``val`` is written as a decimal or scientific-notation number.

    The whole string must match, so every accepted value can be passed to
    ``float()``. Inputs such as ``"1-"``, ``"e5"``, ``"5e"`` or ``"1-2"`` are
    rejected, as are dates like ``"2018-02-01"``, free text and the empty
    string. Exponents with an explicit plus sign (``"1e+20"``) are accepted.

    Args:
        val: the text of a single CSV field.

    Returns:
        bool: True when ``val`` is numeric, False otherwise.
    """
    return _NUMERIC_PATTERN.fullmatch(val) is not None

def clean_split_csv(row):
    """Split one raw CSV line on commas and normalise every field.

    Empty fields are replaced by ``'0.0'``. A field accepted by
    ``isnumeric_val`` is converted to ``float`` and cut to three decimals with
    ``math.floor`` (which rounds towards negative infinity); any other field is
    kept as the original text.

    Args:
        row: one line of CSV text.

    Returns:
        tuple: the normalised fields, in their original order.
    """
    # An empty string is falsy, so `or` substitutes the NaN placeholder.
    cells = (cell or '0.0' for cell in row.split(','))
    return tuple(
        math.floor(float(cell) * 1000) / 1000 if isnumeric_val(cell) else cell
        for cell in cells
    )

## Getting features, labels and the header

In [13]:
# Training data: parse every raw line, keep the first parsed row as the header
# and every row that differs from it as feature data.
clean_data = raw_data.map(clean_split_csv)
header_features = clean_data.take(1)[0]
features = clean_data.filter(lambda parsed_row: parsed_row != header_features)

# Training labels: same treatment.
clean_labels = raw_labels.map(clean_split_csv)
header_labels = clean_labels.take(1)[0]
labels = clean_labels.filter(lambda parsed_row: parsed_row != header_labels)

## Sampling the dataset

In [14]:
#sampled_data = features.sample(withReplacement = False, fraction = 0.1, seed = 42)

In [15]:
# Spark-side sampling is disabled (see the commented-out cell above): use every feature row.
sampled_data = features

## Get number of partitions I have

In [16]:
sampled_data.getNumPartitions()

2

## Getting the feature dense shape

# From RDD to DF

In [17]:
# Convert the RDD of tuples to a DataFrame. No column names are supplied, so the
# columns come out as _1, _2, ... and are renamed in a later cell.
df = sampled_data.toDF()

## Checking column names

In [18]:
df_columns = df.columns
len(header_features) == len(df_columns)

True

In [19]:
header_features == df_columns

False

In [20]:
header_features[:5], df_columns[:5]

(('customer_ID', 'S_2', 'P_2', 'D_39', 'B_1'), ['_1', '_2', '_3', '_4', '_5'])

## Rename with original column names

In [21]:
# Replace Spark's positional names (_1, _2, ...) with the CSV header names in a
# single call instead of chaining one withColumnRenamed per column.
df = df.toDF(*header_features)

## Looking at the dataset shape and schema

In [22]:
df.show()

+--------------------+----------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+-----+-----+-----+-----+-----+----+----+-----+-----+-----+-----+-----+----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+----+-----+-----+----+-----+-----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+----+-----+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----

In [23]:
from pyspark.sql.types import DoubleType

# Map every column name to the string form of its Spark data type.
schema = {field.name: str(field.dataType) for field in df.schema}

# List the columns that were not inferred as double, one (name, type) pair per
# line. The type is checked with isinstance rather than by comparing against
# the literal 'DoubleType()', because the string form of a data type has
# changed between PySpark releases.
for field in df.schema:
    if not isinstance(field.dataType, DoubleType):
        print((field.name, str(field.dataType)))

('customer_ID', 'StringType()')
 ('S_2', 'StringType()')
 ('D_63', 'StringType()')
 ('D_64', 'StringType()')



In [33]:
# df.count() runs a Spark job over the data; the column count reuses the column
# list captured before the rename (same length as the header).
n_rows = df.count()
n_columns = len(df_columns)
print(f"n° o Rows: {n_rows}")
print(f"n° o Columns: {n_columns}")

n° o Rows: 10001
n° o Columns: 190


In [34]:
df.printSchema()

root
 |-- customer_ID: string (nullable = true)
 |-- S_2: string (nullable = true)
 |-- P_2: double (nullable = true)
 |-- D_39: double (nullable = true)
 |-- B_1: double (nullable = true)
 |-- B_2: double (nullable = true)
 |-- R_1: double (nullable = true)
 |-- S_3: double (nullable = true)
 |-- D_41: double (nullable = true)
 |-- B_3: double (nullable = true)
 |-- D_42: double (nullable = true)
 |-- D_43: double (nullable = true)
 |-- D_44: double (nullable = true)
 |-- B_4: double (nullable = true)
 |-- D_45: double (nullable = true)
 |-- B_5: double (nullable = true)
 |-- R_2: double (nullable = true)
 |-- D_46: double (nullable = true)
 |-- D_47: double (nullable = true)
 |-- D_48: double (nullable = true)
 |-- D_49: double (nullable = true)
 |-- B_6: double (nullable = true)
 |-- B_7: double (nullable = true)
 |-- B_8: double (nullable = true)
 |-- D_50: double (nullable = true)
 |-- D_51: double (nullable = true)
 |-- B_9: double (nullable = true)
 |-- R_3: double (nullable = t

In [24]:
# input from American Express challenge
categorical_variables = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

In [25]:
categorical_df = df.select(categorical_variables)

In [26]:
categorical_df.show()

+----+----+-----+-----+-----+-----+-----+----+----+----+----+
|B_30|B_38|D_114|D_116|D_117|D_120|D_126|D_63|D_64|D_66|D_68|
+----+----+-----+-----+-----+-----+-----+----+----+----+----+
| 0.0| 2.0|  0.0|  0.0|  3.0|  0.0|  1.0|  CL|   O| 0.0| 6.0|
| 0.0| 1.0|  1.0|  0.0|  2.0|  0.0|  1.0|  CO|   O| 0.0| 6.0|
| 0.0| 2.0|  1.0|  0.0|  3.0|  0.0|  1.0|  CR|   O| 0.0| 6.0|
| 0.0| 1.0|  1.0|  0.0|  4.0|  0.0|  1.0|  CO|   O| 0.0| 6.0|
| 0.0| 1.0|  0.0|  0.0| -1.0|  0.0|  1.0|  CR|   O| 0.0| 6.0|
| 1.0| 4.0|  1.0|  0.0|  4.0|  0.0|  1.0|  CO|   O| 0.0| 6.0|
| 0.0| 3.0|  0.0|  0.0|  2.0|  0.0|  0.0|  CL|   U| 0.0| 2.0|
| 0.0| 2.0|  1.0|  0.0|  6.0|  0.0|  0.0|  CR|   O| 0.0| 5.0|
| 0.0| 2.0|  0.0|  0.0|  3.0|  0.0| -1.0|  CR|   O| 0.0| 5.0|
| 0.0| 1.0|  0.0|  0.0|  4.0|  0.0|  1.0|  CO|   O| 0.0| 5.0|
| 0.0| 1.0|  1.0|  0.0|  4.0|  0.0|  0.0|  CO|   O| 0.0| 6.0|
| 0.0| 2.0|  0.0|  0.0|  3.0|  0.0|  1.0|  CO|   R| 0.0| 5.0|
| 0.0| 7.0|  0.0|  0.0|  4.0|  0.0|  0.0|  CO|   U| 0.0| 4.0|
| 0.0| 2

In [27]:
categorical_df.describe().filter("summary = 'max'").collect()

[Row(summary='max', B_30='2.0', B_38='7.0', D_114='1.0', D_116='1.0', D_117='6.0', D_120='1.0', D_126='1.0', D_63='XZ', D_64='U', D_66='1.0', D_68='6.0')]

In [28]:
categorical_df.describe().filter("summary = 'min'").collect()

[Row(summary='min', B_30='0.0', B_38='0.0', D_114='0.0', D_116='0.0', D_117='-1.0', D_120='0.0', D_126='-1.0', D_63='CL', D_64='-1.0', D_66='0.0', D_68='0.0')]

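D_117 and D_126 may be a problem: their negative values are genuine, known values, whereas the NaN values were replaced with 0 during cleaning. I'll map the negative values to 0.5 so that they fall into a category semantically closer to the original one. (The code below applies the same replacement to D_64, whose minimum is also -1.0.)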

# Fixing the negative values appearing in the categories

In [29]:
from pyspark.sql.functions import col, when

# In each of these columns the value -1.0 becomes 0.5; all other values are kept.
for column_name in ('D_117', 'D_126', 'D_64'):
    is_minus_one = col(column_name) == -1.0
    df = df.withColumn(column_name, when(is_minus_one, 0.5).otherwise(col(column_name)))

In [30]:
df.select(categorical_variables).describe().filter("summary = 'min'").collect()

[Row(summary='min', B_30='0.0', B_38='0.0', D_114='0.0', D_116='0.0', D_117='0.0', D_120='0.0', D_126='0.0', D_63='CL', D_64='0.0', D_66='0.0', D_68='0.0')]

# Casting string-type categories to numeric
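Spark's `OneHotEncoder` expects numeric category indices as input, so every categorical column is first indexed with `StringIndexer`.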

In [35]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Index all categorical columns with one multi-column StringIndexer, which needs
# a single fit instead of one fit per column. No loop variable named `col` is
# used, so pyspark.sql.functions.col stays usable after this cell.
indexer = StringIndexer(
    inputCols=categorical_variables,
    outputCols=[f'{name}_numeric' for name in categorical_variables],
).fit(df)
df = indexer.transform(df)

In [36]:
# Names of the indexed columns: original categorical name plus the '_numeric' suffix.
new_categorical_variables = [f'{col}_numeric' for col in categorical_variables]

In [37]:
categorical_df = df.select(categorical_variables+new_categorical_variables)

In [38]:
categorical_df.groupby(['D_63_numeric', 'D_63']).count().show()

+------------+----+-----+
|D_63_numeric|D_63|count|
+------------+----+-----+
|         3.0|  XZ|   46|
|         5.0|  XL|    9|
|         4.0|  XM|   16|
|         0.0|  CO| 7451|
|         2.0|  CL|  753|
|         1.0|  CR| 1726|
+------------+----+-----+



In [39]:
# Keep a lookup from each numeric index back to its original category label.
# It costs one Spark job per categorical column, but preserves the labels.
reverse_dictionary = {}
for name in categorical_variables:
    # Each distinct (index, label) row is a 2-tuple, so dict() maps index -> label.
    index_label_pairs = categorical_df.select(f'{name}_numeric', name).distinct().collect()
    reverse_dictionary[name] = dict(index_label_pairs)

reverse_dictionary['D_63']

{3.0: 'XZ', 5.0: 'XL', 4.0: 'XM', 0.0: 'CO', 2.0: 'CL', 1.0: 'CR'}

In [40]:
reverse_dictionary.keys()

dict_keys(['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68'])

In [41]:
# Drop the original categorical columns now that their indexed versions exist.
# DataFrame.drop accepts several column names at once, so no statement string
# has to be assembled and run through exec.
df = df.drop(*categorical_variables)

# One hot encoding Categorical variables

In [42]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode every indexed categorical column with a single estimator.
output_col_names = [f'{name}_enc' for name in categorical_variables]
encoder = OneHotEncoder(inputCols=new_categorical_variables,
                        outputCols=output_col_names)

model = encoder.fit(df)
df = model.transform(df)

# The indexed columns are not needed once encoded. Drop them in one call rather
# than building a statement string for exec.
df = df.drop(*new_categorical_variables)

df.select(output_col_names).show()

+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
|     B_30_enc|     B_38_enc|    D_114_enc|    D_116_enc|    D_117_enc|    D_120_enc|    D_126_enc|     D_63_enc|     D_64_enc|     D_66_enc|     D_68_enc|
+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
|(2,[0],[1.0])|(7,[0],[1.0])|(1,[0],[1.0])|(1,[0],[1.0])|(7,[0],[1.0])|(1,[0],[1.0])|(2,[0],[1.0])|(5,[1],[1.0])|(4,[0],[1.0])|(1,[0],[1.0])|(6,[0],[1.0])|
|(2,[0],[1.0])|(7,[0],[1.0])|(1,[0],[1.0])|(1,[0],[1.0])|(7,[0],[1.0])|(1,[0],[1.0])|(2,[0],[1.0])|(5,[0],[1.0])|(4,[0],[1.0])|(1,[0],[1.0])|(6,[1],[1.0])|
|(2,[0],[1.0])|(7,[2],[1.0])|(1,[0],[1.0])|(1,[0],[1.0])|(7,[0],[1.0])|(1,[0],[1.0])|(2,[0],[1.0])|(5,[0],[1.0])|(4,[2],[1.0])|(1,[0],[1.0])|(6,[1],[1.0])|
|(2,[1],[1.0])|(7,[5],[1.0])|(1,[0],[1.0])|(1,[0],[1.0])|(7,[2],

In [43]:
# Give each encoded column the name of the categorical column it was derived from.
for encoded_name, original_name in zip(output_col_names, categorical_variables):
    df = df.withColumnRenamed(encoded_name, original_name)

# Getting the Labels to Join them

In [31]:
# Convert the label RDD to a DataFrame, then apply the header names in a single
# call instead of renaming column by column.
label_df = labels.toDF()
label_cols = label_df.columns  # positional names assigned by Spark (_1, _2, ...)
label_df = label_df.toDF(*header_labels)
label_df.show()

+--------------------+------+
|         customer_ID|target|
+--------------------+------+
|0000099d6bd597052...|   0.0|
|00000fd6641609c6e...|   0.0|
|00001b22f846c82c5...|   0.0|
|000041bdba6ecadd8...|   0.0|
|00007889e4fcd2614...|   0.0|
|000084e5023181993...|   0.0|
|000098081fde4fd64...|   0.0|
|0000d17a1447b25a0...|   0.0|
|0000f99513770170a...|   1.0|
|00013181a0c5fc8f1...|   1.0|
|0001337ded4e1c253...|   1.0|
|00013c6e1cec7c21b...|   1.0|
|0001812036f155833...|   1.0|
|00018dd4932409baf...|   0.0|
|000198b3dc70edd65...|   0.0|
|000201146e53cacdd...|   0.0|
|0002d381bdd8048d7...|   0.0|
|0002e335892f7998f...|   1.0|
|00031e8be98bc3411...|   0.0|
|000333075fb8ec6d5...|   1.0|
+--------------------+------+
only showing top 20 rows



In [46]:
# Inner join on customer_ID attaches the `target` column to the feature rows.
# NOTE(review): this cell is not idempotent -- running it again joins a second
# time and adds another `target` column.
df = df.join(label_df, ['customer_ID'])

In [55]:
df.target

Column<'target'>

In [59]:
from pyspark.ml.feature import VectorAssembler

# Columns carried through unchanged alongside the assembled vector.
columns_final = ['customer_ID', 'S_2', 'target']
# Every other header column is packed into the feature vector.
feature_to_assemble = [name for name in header_features if name not in columns_final]

vecAssembler = VectorAssembler(outputCol="features", inputCols=feature_to_assemble)

assembled_df = vecAssembler.transform(df)
assembled_df = assembled_df.select(columns_final + ['features'])

In [60]:
assembled_df.show()

+--------------------+----------+------+--------------------+
|         customer_ID|       S_2|target|            features|
+--------------------+----------+------+--------------------+
|0014d951622647e82...|2018-02-01|   0.0|[0.902,0.091,0.03...|
|001b8b68722cd18ec...|2017-12-09|   0.0|[0.96,0.006,0.001...|
|001cde1044b029fab...|2017-03-25|   1.0|[0.0,0.033,0.899,...|
|001ff5350ab524ad2...|2017-11-22|   1.0|[0.318,0.472,0.02...|
|002da05f7452c77cd...|2017-05-26|   0.0|[0.829,1.037,0.04...|
|0033da35db3a15c28...|2017-08-29|   1.0|[0.273,0.384,0.02...|
|003f7b33280fff053...|2017-11-22|   1.0|[0.528,0.008,0.00...|
|004f500fb6369857e...|2017-09-08|   1.0|[0.323,0.006,0.00...|
|005382536094338c2...|2017-09-16|   0.0|[0.281,0.031,0.04...|
|005c74962a7c912f9...|2017-11-10|   1.0|[0.473,0.03,0.675...|
|005f7ccc9c4131b73...|2018-02-28|   0.0|[0.891,0.003,0.00...|
|006078e5943de3f1f...|2017-09-26|   0.0|[0.966,0.002,0.00...|
|00675c719a9666275...|2017-03-06|   0.0|[0.94,0.006,0.072...|
|0081622

# Feature Selection

In [69]:
from pyspark.ml.feature import UnivariateFeatureSelector

# Keep the 20 highest-scoring features (continuous features, categorical label).
selector = UnivariateFeatureSelector(featuresCol="features", outputCol="selectedFeatures",
                                     labelCol="target", selectionMode="numTopFeatures")

selector.setFeatureType("continuous").setLabelType("categorical").setSelectionThreshold(20)

# Append only once: without this guard, re-running the cell adds
# 'selectedFeatures' to the list again and the select below requests the same
# column twice.
if 'selectedFeatures' not in columns_final:
    columns_final.append('selectedFeatures')

# The selector is fitted on every row of assembled_df, i.e. before any
# train/validation split.
result = selector.fit(assembled_df).transform(assembled_df).select(columns_final)

print("UnivariateFeatureSelector output with top %d features selected using f_classif"
      % selector.getSelectionThreshold())

UnivariateFeatureSelector output with top 20 features selected using f_classif


In [70]:
result.take(1)

[Row(customer_ID='0014d951622647e823722b75013b52df120f9ca0f8acc57321616e6a4a1f80f7', S_2='2018-02-01', target=0.0, selectedFeatures=DenseVector([0.902, 0.091, 0.031, 1.001, 0.009, 0.213, 0.0, 0.004, 0.0, 0.0, 0.004, 0.038, 1.001, 0.006, 0.436, 0.734, 0.029, 0.0, 0.187, 0.03]))]

# Train-test split
The split is random and not stratified by `target`; stratification is yet to be implemented.

In [86]:
# Random, row-level split of roughly 80% / 20% with a fixed seed.
# NOTE(review): the feature selector was fitted on all rows before this split,
# so the validation rows influenced which features were selected.
train, validation = result.randomSplit(weights=[0.8,0.2], seed=42)

# Random Forest

In [88]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest with 30 trees and a fixed seed, trained on the selected features.
# Note: `model` is rebound here; it previously held the fitted OneHotEncoder model.
rf = RandomForestClassifier(labelCol="target", featuresCol="selectedFeatures", seed=42, numTrees=30)
model = rf.fit(train)

In [89]:
model.featureImportances

SparseVector(20, {0: 0.2805, 1: 0.0017, 2: 0.0713, 3: 0.028, 4: 0.0225, 5: 0.0196, 6: 0.0063, 7: 0.0463, 8: 0.025, 9: 0.0104, 10: 0.1281, 11: 0.0179, 12: 0.0301, 13: 0.0027, 14: 0.0012, 15: 0.0089, 16: 0.1964, 17: 0.0044, 18: 0.0269, 19: 0.0719})

In [92]:
# Score the validation rows; this adds the rawPrediction, probability and
# prediction columns shown in the output below.
predictions = model.transform(validation)

In [95]:
predictions.show()

+--------------------+----------+------+--------------------+--------------------+--------------------+----------+
|         customer_ID|       S_2|target|    selectedFeatures|       rawPrediction|         probability|prediction|
+--------------------+----------+------+--------------------+--------------------+--------------------+----------+
|001cde1044b029fab...|2017-03-25|   1.0|[0.0,0.033,0.899,...|[15.0156820210917...|[0.50052273403639...|       0.0|
|003f7b33280fff053...|2017-11-22|   1.0|[0.528,0.008,0.00...|[20.9934810788348...|[0.69978270262782...|       0.0|
|005382536094338c2...|2017-09-16|   0.0|[0.281,0.031,0.04...|[25.7972281329105...|[0.85990760443035...|       0.0|
|0081622f92bc7b661...|2018-02-10|   0.0|[0.911,0.068,0.01...|[29.0817121138822...|[0.96939040379607...|       0.0|
|00be9aec004505277...|2017-03-22|   0.0|[0.608,0.004,0.03...|[22.4264292818542...|[0.74754764272847...|       0.0|
|00f1a29d193758630...|2017-10-14|   0.0|[0.998,0.008,0.03...|[29.2106984152520..

In [99]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare the predicted class with the true `target` on the validation
# predictions and report accuracy together with its complement (error rate).
evaluator = MulticlassClassificationEvaluator(labelCol="target",
                                              predictionCol="prediction",
                                              metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")
print(f"Test Error = {(1.0 - accuracy)}")

Accuracy: 0.8578811369509044
Test Error = 0.1421188630490956
