# Introduction:

# Motivation:

# Design:

## Step 1: Problem Statement

## Step 2: Loading the required packages in pyspark.

In [1]:

from pyspark.sql import SQLContext#To load the csv files as dataframes
sqlContext = SQLContext(sc)
from pyspark.ml.tuning import TrainValidationSplit#For train test split
from pyspark.ml.classification import DecisionTreeClassifier#model builder function
from pyspark.sql import SparkSession#to create spark session
from pyspark.ml import Pipeline#Pipeline for creating a flow of processes to be done on data
from pyspark.ml.feature import  StringIndexer, VectorAssembler#Data converson functions
from pyspark.ml.evaluation import BinaryClassificationEvaluator#Model evaluator function
from pyspark.sql.types import StringType
from pyspark.mllib.evaluation import MulticlassMetrics#For model evaluation



## Step 3: Creating spark session and loading the data

In [2]:
#Creating Spark session
if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("DecisionTree") \
        .getOrCreate()

In [3]:
#We will be using spark sql context to load the csv file
df = sqlContext.read.format('com.databricks.spark.csv')\
    .options(header='true', inferschema='true').load('/home/record_linkage.csv')

#It was also found that our label feature was inferred as boolean hence we are explicitly 
#defining it as a character(string type) so that it can be integrated in the model
df=df.withColumn("is_match", df.is_match.cast(StringType()))


## Step 4: Exploratory analysis

In [4]:
#Finding the number of rows in our dataset.
df.count()

5749132

In [5]:
#Viewing the first 5 rows of our data.
df.head(5)

[Row(id_1=37291, id_2=53113, cmp_fname_c1=3, cmp_fname_c2=3, cmp_lname_c1=1.0, cmp_lname_c2=3, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=0, is_match=u'true'),
 Row(id_1=39086, id_2=47614, cmp_fname_c1=3, cmp_fname_c2=3, cmp_lname_c1=1.0, cmp_lname_c2=3, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=1, is_match=u'true'),
 Row(id_1=70031, id_2=70237, cmp_fname_c1=3, cmp_fname_c2=3, cmp_lname_c1=1.0, cmp_lname_c2=3, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=1, is_match=u'true'),
 Row(id_1=84795, id_2=97439, cmp_fname_c1=3, cmp_fname_c2=3, cmp_lname_c1=1.0, cmp_lname_c2=3, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=1, is_match=u'true'),
 Row(id_1=36950, id_2=42116, cmp_fname_c1=3, cmp_fname_c2=3, cmp_lname_c1=1.0, cmp_lname_c2=1, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=1, is_match=u'true')]

In [6]:
#Removing the id columns as they are redundant in the model building process
df=df.drop("id_1")
df=df.drop("id_2")

#getting column names of the data
cols=df.columns


## Step 5: Data restructuring and conversions

In the below code chunk we will structure the data according to the requirements of the spark machine learning model of decision tree. Spark takes data into vectors of features and labels, hence the below conversions has to be done. 


In [7]:
stages = [] # stages in our Pipeline
# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol = "is_match", outputCol = "label")
stages += [label_stringIdx]

ml_cols=cols[0:9]
assembler = VectorAssembler(inputCols=ml_cols, outputCol="features")
stages += [assembler]

In [8]:
# Create a Pipeline.
pipeline = Pipeline(stages=stages)
# Run the feature transformations.
#  - fit() computes feature statistics as needed.
#  - transform() actually transforms the features.
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)

# Keep relevant columns
selectedcols = ["label", "features"] + cols
df = df.select(selectedcols)

In [9]:
df.head()

Row(label=1.0, features=DenseVector([3.0, 3.0, 1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 0.0]), cmp_fname_c1=3, cmp_fname_c2=3, cmp_lname_c1=1.0, cmp_lname_c2=3, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=0, is_match=u'true')

## Step 6: Model building

In [10]:
train, test = df.randomSplit([0.75, 0.25], seed=141)#Splitting into train and test

In [11]:
# Create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3)

# Train model with Training Data
dtModel = dt.fit(train)

In [12]:
print "numNodes = ", dtModel.numNodes
print "depth = ", dtModel.depth

numNodes =  15
depth =  3


## Step 7: Predictions and model evauations

In [13]:
# Make predictions on test data using the Transformer.transform() method.
predictions = dtModel.transform(test)
# Evaluate model
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

0.9997059337157059

In [14]:
def print_metrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print 'Precision of True ', metrics.precision(1)
    print 'Precision of False', metrics.precision(0)
    print 'Recall of True    ', metrics.recall(1)
    print 'Recall of False   ', metrics.recall(0)
    print 'F-1 Score         ', metrics.fMeasure()
    print 'Confusion Matrix\n', metrics.confusionMatrix().toArray()
    
predictions_and_labels = predictions.select("prediction", "label").rdd \
.map(lambda r: (float(r[0]), float(r[1])))

print_metrics(predictions_and_labels)

Precision of True  0.995205753096
Precision of False 0.999801910034
Recall of True     0.946069122674
Recall of False    0.999983256966
F-1 Score          0.999785917545
Confusion Matrix




[[  1.43340800e+06   2.40000000e+01]
 [  2.84000000e+02   4.98200000e+03]]


## Step 8: Conclusion