# Projeto Big Data

Julio Sales <br/>
Mácio Matheus<br/>
Victor Outtes

Dataset: https://www.kaggle.com/mlg-ulb/creditcardfraud

In [20]:
import findspark
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest as  cls_rf
from time import *
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Utilizando o contexto do spark

In [None]:
findspark.init()
sc = pyspark.SparkContext(appName="projeto")
session = pyspark.sql.SparkSession.builder.config(conf=SparkConf())
spark = session.getOrCreate()

# Carregando o dataset do hdfs do hadoop

In [16]:
df = spark.read.options(header = "true", inferschema = "true").csv('hdfs://cluster-001-m/user/victor_outtes/creditcard.csv')
df.printSchema()
print("Total number of rows:", df.count())

root
 |-- Time: decimal(10,0) (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double

# Split dataset 70-30

In [24]:
TRAIN_DATA_RATIO = 0.7

# A ultima coluna contém o target
transformed_df = df.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

# Dividindo o dataset
splits = [TRAIN_DATA_RATIO, 1.0 - TRAIN_DATA_RATIO]
train_data, test_data = transformed_df.randomSplit(splits, RANDOM_SEED)
print("Quantidade de linhas do dataset de treinamento: %d" % train_data.count())
print("Quantidade de linhas do dataset de testes: %d" % test_data.count())

Quantidade de linhas do dataset de treinamento: 199492
Quantidade de linhas do dataset de testes: 85315


# Treino e parametrização do modelo random forest

In [26]:
RANDOM_SEED = 13579
NUM_TREES = 3
MAX_DEPTH = 4
MAX_BINS = 32
t1 = time()
# Treinando a random forest
model = cls_rf.trainClassifier(train_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=NUM_TREES, maxDepth=MAX_DEPTH, maxBins=MAX_BINS, seed=RANDOM_SEED)

t2 = time()
t_end = t2 - t1
print("Tempo do treinamento: %.3f s" % t_end)

Tempo do treinamento: 23.769 s


# Predição e medição da acurácia

In [22]:
predictions = model.predict(test_data.map(lambda x: x.features))
labels_and_predictions = test_data.map(lambda x: x.label).zip(predictions)
accuracy = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(test_data.count())
print("Model accuracy: %.3f%%" % (accuracy * 100))

Model accuracy: 99.945%


# Métricas: precision recall / curva roc

In [23]:
metrics = BinaryClassificationMetrics(labels_and_predictions)
print("Area PR curve: %.f" % (metrics.areaUnderPR * 100))
print("Area ROC curve: %.3f" % (metrics.areaUnderROC * 100))

cannot import name 'hashtable'
Area PR curve: 83
Area ROC curve: 95.021
