# PROJETO DE DETECÇÃO DE FRAUDES EM COMPRAS ONLINE




## Descrição

FONTE: https://www.kaggle.com/datasets/ealaxi/paysim1?resource=download

IDEALIZADO EM: https://medium.com/coders-camp/230-machine-learning-projects-with-python-5d0c7abf8265

## IMPORT DE BIBLIOTECAS

### conectar com o google drive

In [64]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### integrar spark no colab
https://www.analyticsvidhya.com/blog/2020/11/a-must-read-guide-on-how-to-work-with-pyspark-on-google-colab-for-data-scientists/

In [65]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # download Java


In [66]:
!wget -q https://www-us.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop2.7.tgz #  install Apache Spark 3.0.1 with Hadoop 2.7


In [67]:
!tar xf spark-3.3.1-bin-hadoop3.tgz # unzip that folder


tar: spark-3.3.1-bin-hadoop3.tgz: Cannot open: No such file or directory
tar: Error is not recoverable: exiting now


In [68]:
!pip install -q findspark # install findspark library

### iniciar ambiente spark

In [69]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/drive/MyDrive/..Python_Codes/Projeto_fraude_compras_online/content/spark-3.3.1-bin-hadoop3"

In [70]:
import findspark
findspark.init()

In [71]:
findspark.find()


'/content/drive/MyDrive/..Python_Codes/Projeto_fraude_compras_online/content/spark-3.3.1-bin-hadoop3'

In [72]:
#  change the permissions on its parent directory
!chmod 755 -R '/content/drive/MyDrive/..Python_Codes/Projeto_fraude_compras_online/content/spark-3.3.1-bin-hadoop3/./bin'


In [73]:
# import SparkSession from pyspark.sql and create a SparkSession, which is the entry point to Spark.
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [74]:
spark


### imports gerais

In [84]:
import pandas as pd
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

## IMPORT DE FUNÇÕES

In [85]:
def translate(mapping):
      def translate_(col):
          return mapping.get(col)
      return udf(translate_, StringType())

## IMPORT DE DADOS

In [76]:
#@title Escolha de método de execução (Pandas ou PySpark) { form-width: "10%" }
METHOD = "PYSPARK" #@param ["PANDAS", "PYSPARK"]


In [77]:
if METHOD == 'PYSPARK':
  df = spark.read.csv("/content/drive/MyDrive/..Python_Codes/Projeto_fraude_compras_online/content/PS_20174392719_1491204439457_log.csv", header=True, inferSchema=True)
elif METHOD == 'PANDAS':
  df = pd.read_csv("/content/drive/MyDrive/..Python_Codes/Projeto_fraude_compras_online/content/PS_20174392719_1491204439457_log.csv", header=0)  


In [78]:
if METHOD == 'PYSPARK':
  print(f"Esse dataframe tem {df.count()} linhas e {len(df.columns)} colunas")
  df.show() 

elif METHOD == 'PANDAS':
  print(f"Esse dataframe tem {len(df)} linhas e {len(df.columns)} colunas") 
  display(df) 


Esse dataframe tem 6362620 linhas e 11 colunas
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT|  1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|    181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|    181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT| 11668.14|C204

## TRATAMENTO E LIMPEZA DE DADOS

In [87]:
if METHOD == 'PYSPARK':
  df_clean = df.na.drop()

  # df_clean = df_clean.withColumn("type_copy", df["type"])

  types = df_clean.select("type").distinct().rdd.flatMap(lambda x: x).collect()
  types_expr = [F.when(F.col("type") == ty, 1).otherwise(0).alias("type_" + ty) for ty in types]
  df_clean = df_clean.select("*", "type", *types_expr)

  df.withColumn("type_num", translate({"CASH_OUT": 1, 
                                    "PAYMENT": 2, 
                                    "CASH_IN": 3, 
                                    "TRANSFER": 4,
                                    "DEBIT": 5})("type"))

  df.withColumn("isFraud_str", translate({0: "No Fraud", 1: "Fraud"})("isFraud"))

  # Indicador de quanto removeu dos dados
  print(f"""
  Sobraram neste dataframe {df_clean.count()} linhas e {len(df_clean.columns)} colunas
  Equivalente à {round(100*(df_clean.count()/df.count()),2)} % das linhas originais e {round(100*(len(df_clean.columns)/len(df.columns)),2)} % das colunas originais
  """)

  df_clean.show() 

elif METHOD == 'PANDAS':
  df_clean = df.dropna()

  # One Hot Encoding e mapping para substituir string por número (necessário para ML)
  df_clean['type_copy'] = df_clean['type']
  df_clean = pd.get_dummies(df_clean, columns=['type'])
  df_clean = df_clean.rename(columns={"type_copy": "type"})

  # mapping
  df_clean["type_num"] = df_clean["type"].map({"CASH_OUT": 1, 
                                        "PAYMENT": 2, 
                                          "CASH_IN": 3, 
                                          "TRANSFER": 4,
                                          "DEBIT": 5})

  df_clean["isFraud_str"] = df_clean["isFraud"].map({0: "No Fraud", 1: "Fraud"})

  # Indicador de quanto removeu dos dados
  print(f"""
  Sobraram neste dataframe {len(df_clean)} linhas e {len(df_clean.columns)} colunas
  Equivalente à {round(100*(len(df_clean)/len(df)),2)} % das linhas originais e {round(100*(len(df_clean.columns)/len(df.columns)),2)} % das colunas originais
  """)

  display(df_clean)





  Sobraram neste dataframe 6362620 linhas e 17 colunas
  Equivalente à 100.0 % das linas originais e 154.55 % das colunas originais
  
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+--------+-------------+------------+-------------+------------+----------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|    type|type_TRANSFER|type_CASH_IN|type_CASH_OUT|type_PAYMENT|type_DEBIT|
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+--------+-------------+------------+-------------+------------+----------+
|   1| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0| PAYMENT|            0|           0|            0|           1|         0|
|   1| PAYMENT|  1864.28|C1666544295|   

## ANÁLISE

In [80]:
df_info = df_clean.copy()

# pie chart
transaction_type = df_info["type"].value_counts()
transactions = transaction_type.index
quantity = transaction_type.values

import plotly.express as px
figure = px.pie(df_info, 
             values=quantity, 
             names=transactions,hole = 0.5, 
             title="Distribuição de tipos de transação")
figure.show()

# ================= Checking correlation =================
correlation = df_info.corr()
print('\n\n\nCorrelação entre variáveis:\n')
print(correlation["isFraud"].sort_values(ascending=False))

# ================= Sklearn ML prediction =================

# splitting the data
from sklearn.model_selection import train_test_split
x = np.array(df_info[["type_num", "amount", "oldbalanceOrg", "newbalanceOrig"]]) # variaveis independentes
y = np.array(df_info[["isFraud_str"]]) # variaveis dependentes

# training a machine learning model
from sklearn.tree import DecisionTreeClassifier
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)
model = DecisionTreeClassifier()
model.fit(xtrain, ytrain)
print('\n\n\nScore do modelo:\n')
print(model.score(xtest, ytest))

# prediction
#features = [type, amount, oldbalanceOrg, newbalanceOrig]
features = np.array([[4, 9000.60, 9000.60, 0.0]])
print('\nPredict para TRANSFER, amount $9000.60, oldbalanceOrg $9000.60, newbalanceOrig $0.00:\n')
print(model.predict(features))

AttributeError: ignored

## PLOT DE RESULTADOS


## ATUAÇÃO COM OUTSIDERS