In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

# 1. Загрузка данных

In [4]:
df = pd.read_csv('../data/raw/credit_card_transactions-ibm_v2.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24386900 entries, 0 to 24386899
Data columns (total 15 columns):
 #   Column          Dtype  
---  ------          -----  
 0   User            int64  
 1   Card            int64  
 2   Year            int64  
 3   Month           int64  
 4   Day             int64  
 5   Time            object 
 6   Amount          object 
 7   Use Chip        object 
 8   Merchant Name   int64  
 9   Merchant City   object 
 10  Merchant State  object 
 11  Zip             float64
 12  MCC             int64  
 13  Errors?         object 
 14  Is Fraud?       object 
dtypes: float64(1), int64(7), object(7)
memory usage: 2.7+ GB


In [6]:
df.head(5)

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No


In [7]:
df['Is Fraud?'].value_counts(normalize=True)

Is Fraud?
No     0.99878
Yes    0.00122
Name: proportion, dtype: float64

In [8]:
df.isnull().sum()

User                     0
Card                     0
Year                     0
Month                    0
Day                      0
Time                     0
Amount                   0
Use Chip                 0
Merchant Name            0
Merchant City            0
Merchant State     2720821
Zip                2878135
MCC                      0
Errors?           23998469
Is Fraud?                0
dtype: int64

In [9]:
df.isnull().mean()

User              0.000000
Card              0.000000
Year              0.000000
Month             0.000000
Day               0.000000
Time              0.000000
Amount            0.000000
Use Chip          0.000000
Merchant Name     0.000000
Merchant City     0.000000
Merchant State    0.111569
Zip               0.118020
MCC               0.000000
Errors?           0.984072
Is Fraud?         0.000000
dtype: float64

# 2. Базовая предобработка

## Amount - убираем знаки $

In [10]:
df.Amount.head(5)

0    $134.09
1     $38.48
2    $120.34
3    $128.95
4    $104.71
Name: Amount, dtype: object

In [11]:
df["Amount"]=df["Amount"].str.replace("$","").astype(float)

In [12]:
df.Amount.head(5)

0    134.09
1     38.48
2    120.34
3    128.95
4    104.71
Name: Amount, dtype: float64

## Time - выделяем время

In [13]:
df.Time.head(5)

0    06:21
1    06:42
2    06:22
3    17:45
4    06:23
Name: Time, dtype: object

In [14]:
df["Hour"] = df["Time"].str [0:2]
df["Minute"] = df["Time"].str [3:5]
df = df.drop(['Time'],axis=1)

In [15]:
# Добавляем timestamp для последующего деления на Train, Validation и Test
df['Timestamp'] = pd.to_datetime(df[["Year", "Month", "Day", "Hour", "Minute"]])

## Добавим день недели

In [16]:
df['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day']])

# Переведем в день недели
days = {0:'Пн', 1:'Вт', 2:'Ср', 3:'Чт', 4:'Пт', 5:'Сб', 6:'Вс'}
df['Day of Week'] = df['Date'].dt.dayofweek.map(days)

## Is Fraud? - бинаризуем

In [17]:
df["Is Fraud?"] = df["Is Fraud?"].apply(lambda x: 1 if x == 'Yes' else 0)

## Заполняем пропуски

In [18]:
df['Merchant State']=df['Merchant State'].fillna('unknown')
df['Zip']=df['Merchant State'].fillna('0')
df['Errors?']=df['Errors?'].fillna('unknown')

## Сохраняем новый набор данных

In [32]:
df.to_csv('../data/interim/credit_card_transactions-ibm_v2.csv',
              header=df.columns,
              index=False)

In [19]:
df.head()

Unnamed: 0,User,Card,Year,Month,Day,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?,Hour,Minute,Timestamp,Date,Day of Week
0,0,0,2002,9,1,134.09,Swipe Transaction,3527213246127876953,La Verne,CA,CA,5300,unknown,0,6,21,2002-09-01 06:21:00,2002-09-01,Вс
1,0,0,2002,9,1,38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,CA,5411,unknown,0,6,42,2002-09-01 06:42:00,2002-09-01,Вс
2,0,0,2002,9,2,120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,CA,5411,unknown,0,6,22,2002-09-02 06:22:00,2002-09-02,Пн
3,0,0,2002,9,2,128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,CA,5651,unknown,0,17,45,2002-09-02 17:45:00,2002-09-02,Пн
4,0,0,2002,9,3,104.71,Swipe Transaction,5817218446178736267,La Verne,CA,CA,5912,unknown,0,6,23,2002-09-03 06:23:00,2002-09-03,Вт
