In [None]:
# Data Analysis
import pandas as pd
import numpy as np

# Data Visualization
from matplotlib import pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Warnings
# NOTE(review): this filter has no category or module restriction, so it hides
# every warning for the rest of the session (deprecations, numerical warnings, ...).
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Loading train data frame
# The path is relative, so it resolves against the notebook's working directory.
data_train = pd.read_csv('ieee-fraud-detection/train_df.csv')

In [None]:
# Loading test data frame
# The path is relative, so it resolves against the notebook's working directory.
data_test = pd.read_csv('ieee-fraud-detection/test_df.csv')

In [None]:
# Print pandas' summary of the training frame.
data_train.info()

In [None]:
# Print pandas' summary of the test frame.
data_test.info()

In [None]:
# Column labels of the training frame as a plain Python list
column_names = list(data_train.columns)
print(column_names)

In [None]:
# Column labels of the test frame as a plain Python list
feature_names = list(data_test.columns)
print(feature_names)

In [None]:
#Looking at the main statistics
# print(data_train.describe())
# stats_df = data_train.describe()
# stats_df.loc['skewness'] = data_train.skew()
# stats_df.loc['kurtosis'] = data_train.kurt()
# stats_df.loc['median'] = data_train.median()
# print(stats_df)

In [None]:
#Load and print the first 20 observations of the dataset.
# head_20 = data_train.head(20)
# print(head_20)

In [None]:
# Checking the type of the data: one dtype per training column
datatype = data_train.dtypes
print(datatype)

In [None]:
# Number of missing values in each training column
missing_per_column = data_train.isna().sum()
print(missing_per_column)

In [None]:
# Class balance: one bar per isFraud value, each annotated with its height
fig, ax = plt.subplots()
sns.countplot(x=data_train.isFraud, ax=ax)
for bar in ax.patches:
    bar_height = bar.get_height()
    # Anchor the label at the horizontal centre of the bar, on its top edge
    anchor = (bar.get_x() + bar.get_width() / 2, bar.get_y() + bar_height)
    ax.annotate(bar_height, anchor)

In [None]:
# Overlay the TransactionDT histograms of the train and test sets.
# Both series are drawn on one explicit axes; labels and title are set before
# the figure is shown, and plt.show() is actually called (the bare name
# `plt.show` is a no-op expression).
fig, ax = plt.subplots(figsize=(16, 5))
data_train['TransactionDT'].plot(kind='hist', bins=50, label='train',
                                 fontsize=12, ax=ax)
data_test['TransactionDT'].plot(kind='hist', bins=50, label='test', ax=ax)

ax.set_ylabel("Count", fontsize=15)
ax.set_xlabel("Timedelta", fontsize=15)
ax.set_title('Train vs Test TransactionDT Distribution', fontsize=15)
ax.legend(fontsize=12)
plt.show()

In [None]:
# Print the TransactionDT range of each set.
# Series.max()/.min() are vectorised; the Python built-ins max()/min() would
# iterate the Series element by element.
for set_name, frame in (('train', data_train), ('test', data_test)):
    transaction_dt = frame['TransactionDT']
    print('{}: max = {}, min = {}'.format(
        set_name, transaction_dt.max(), transaction_dt.min()))

In [None]:
# Time covered by each set, in days. The division by 3600 * 24 treats
# TransactionDT as a number of seconds.
SECONDS_PER_DAY = 3600 * 24

# Each bound is computed once with pandas' vectorised reductions instead of
# repeated Python-level max()/min() passes over the Series.
train_dt = data_train['TransactionDT']
test_dt = data_test['TransactionDT']
train_start, train_end = train_dt.min(), train_dt.max()
test_start, test_end = test_dt.min(), test_dt.max()

d = (train_end - train_start) / SECONDS_PER_DAY   # span of the training set
d2 = (test_end - test_start) / SECONDS_PER_DAY    # span of the test set
d1 = (test_start - train_end) / SECONDS_PER_DAY   # gap between the two sets

# NOTE(review): the "total" below is d + d2, i.e. it excludes the gap d1 —
# confirm whether first-to-last elapsed time (d + d1 + d2) is wanted instead.
print('Timespan of the total data set is {} days'.format(d+d2))
print('Timespan of the training set is {} days'.format(d))
print('Timespan of the test set is {} days'.format(d2))
print('The gap between the training and test set is {} days'.format(d1))

In [None]:
# data_train['TransactionDT'].hist(alpha=0.5, figsize=(20, 10))
# plt.tight_layout()
# plt.show()

In [None]:
# data_train['TransactionAmt'].hist(alpha=0.5, figsize=(20, 10))
# plt.tight_layout()
# plt.show()

In [None]:
# Box plot of the training-set transaction amounts on the current axes
train_amounts = data_train['TransactionAmt']
plt.gca().boxplot(train_amounts)

In [None]:
# Box plot of the test-set transaction amounts on the current axes
test_amounts = data_test['TransactionAmt']
plt.gca().boxplot(test_amounts)

In [None]:

# Put both TransactionAmt series in one frame (aligned on the row index)
amount_by_set = {"Train": data_train['TransactionAmt'], "Test": data_test['TransactionAmt']}
data = pd.DataFrame(amount_by_set)

# One box per set, drawn side by side
ax = data.loc[:, ['Train', 'Test']].plot.box(title='boxplot')

plt.show()

In [None]:
# Selected TransactionAmt quantiles for each set, separated by a blank line
quantile_levels = [.01, .025, .1, .25, .5, .75, .9, .975, .99]
quantile_reports = [
    ("training set: Transaction Amounts Quantiles:", data_train),
    ("test set: Transaction Amounts Quantiles:", data_test),
]
for position, (heading, frame) in enumerate(quantile_reports):
    if position:
        print()
    print(heading)
    print(frame['TransactionAmt'].quantile(quantile_levels))

In [None]:
# TransactionAmt histograms: fraudulent rows on top, non-fraudulent below
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 10))
panels = (
    (1, 'Transaction Amt - Fraud', ax1),
    (0, 'Transaction Amt - Not Fraud', ax2),
)
for fraud_flag, panel_title, panel_ax in panels:
    amounts = data_train.loc[data_train['isFraud'] == fraud_flag, 'TransactionAmt']
    amounts.plot(kind='hist', bins=100, title=panel_title, ax=panel_ax)
plt.show()

In [None]:
# data_train['ProductCD'].hist(alpha=0.5, figsize=(20, 10))
# plt.tight_layout()
# plt.show()

In [None]:
# Distinct ProductCD values and the number of missing entries
product_cd = data_train['ProductCD']
print(product_cd.unique())
print(product_cd.isna().sum())

In [None]:
# ProductCD: transaction counts (top) and fraud rate (bottom) per category.
# The per-group mean of isFraud is scaled by 100 so the bottom panel shows a
# percentage, as its title states (the unscaled mean is a 0-1 fraction).
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 15))
product_colors = ['brown', 'red', 'green', 'blue', 'orange']
by_product = data_train.groupby('ProductCD')

by_product['TransactionID'].count().sort_index().plot(
    kind='barh', title='Count of Observations by ProductCD',
    color=product_colors, ax=ax1)
ax1.set_xlabel('Number of transactions')

by_product['isFraud'].mean().mul(100).sort_index().plot(
    kind='barh', title='Percentage of Fraud by ProductCD',
    color=product_colors, ax=ax2)
ax2.set_xlabel('Fraud rate (%)')

fig.tight_layout()
plt.show()

In [None]:
# Distinct DeviceInfo values and the number of missing entries
device_info = data_train['DeviceInfo']
print(device_info.unique())
print(device_info.isna().sum())

In [None]:
# Share of rows (in percent) whose DeviceInfo is missing
device_info_missing = data_train['DeviceInfo'].isna().sum()
100 * device_info_missing / len(data_train)

In [None]:
# DeviceInfo: transaction counts and fraud rate for the most frequent devices.
# Drawing one bar per distinct DeviceInfo value makes the chart unreadable
# when the column has many distinct values, so only the TOP_N_DEVICES most
# common values are plotted. The fraud rate is scaled by 100 so that the panel
# shows a percentage, as its title states.
TOP_N_DEVICES = 20

device_stats = (
    data_train.groupby('DeviceInfo')
    .agg(n_transactions=('TransactionID', 'count'),
         fraud_pct=('isFraud', 'mean'))
    .nlargest(TOP_N_DEVICES, 'n_transactions')
    .sort_values('n_transactions')  # barh draws bottom-up: largest ends on top
    .assign(fraud_pct=lambda stats_frame: stats_frame['fraud_pct'] * 100)
)

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 14))
device_stats['n_transactions'].plot(
    kind='barh',
    title='Count of Observations by DeviceInfo (top {})'.format(TOP_N_DEVICES),
    color='blue', ax=ax1)
ax1.set_xlabel('Number of transactions')

device_stats['fraud_pct'].plot(
    kind='barh',
    title='Percentage of Fraud by DeviceInfo (top {})'.format(TOP_N_DEVICES),
    color='red', ax=ax2)
ax2.set_xlabel('Fraud rate (%)')

fig.tight_layout()
plt.show()

In [None]:
# Distinct DeviceType values and the number of missing entries
device_type = data_train['DeviceType']
print(device_type.unique())
print(device_type.isna().sum())

In [None]:
# Share of rows (in percent) whose DeviceType is missing
device_type_missing = data_train['DeviceType'].isna().sum()
100 * device_type_missing / len(data_train)

In [None]:
# Peek at the first few DeviceType values
data_train['DeviceType'].head()

In [None]:
# DeviceType: transaction counts (top) and fraud rate (bottom) per category.
# The per-group mean of isFraud is scaled by 100 so the bottom panel shows a
# percentage, as its title states (the unscaled mean is a 0-1 fraction).
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 9))
device_type_colors = ['blue', 'red', 'green']
by_device_type = data_train.groupby('DeviceType')

by_device_type['TransactionID'].count().sort_index().plot(
    kind='barh', title='Count of Observations by DeviceType',
    color=device_type_colors, ax=ax1)
ax1.set_xlabel('Number of transactions')

by_device_type['isFraud'].mean().mul(100).sort_index().plot(
    kind='barh', title='Percentage of Fraud by DeviceType',
    color=device_type_colors, ax=ax2)
ax2.set_xlabel('Fraud rate (%)')

fig.tight_layout()
plt.show()

In [None]:
# Every training column whose name contains "card"
card_cols = data_train.filter(like='card').columns.tolist()
data_train[card_cols].head()

In [None]:
# dtype of each card column
data_train[card_cols].dtypes

In [None]:
# Percentage of missing values in each card column
card_missing = data_train[card_cols].isna().sum()
card_missing.mul(100).div(len(data_train))

In [None]:
# Correlation heat map of the numeric card columns.
# plt.show() is called with parentheses; the bare name `plt.show` only
# evaluates the function object and never renders the figure explicitly.
cor = data_train[['card1','card2','card3','card5']].corr()
sns.heatmap(cor, cmap = 'RdBu_r',annot=True, center=0.0)
plt.title('Card1,2,3,5')
plt.show()

In [None]:
# card4: transaction counts (top) and fraud rate (bottom) per category.
# The per-group mean of isFraud is scaled by 100 so the bottom panel shows a
# percentage, as its title states. The titles name the column explicitly so
# this figure can be told apart from the card6 figure.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 9))
card4_colors = ['blue', 'red', 'green', 'orange']
by_card4 = data_train.groupby('card4')

by_card4['TransactionID'].count().sort_index().plot(
    kind='barh', title='Count of Observations by card4',
    color=card4_colors, ax=ax1)
ax1.set_xlabel('Number of transactions')

by_card4['isFraud'].mean().mul(100).sort_index().plot(
    kind='barh', title='Percentage of Fraud by card4',
    color=card4_colors, ax=ax2)
ax2.set_xlabel('Fraud rate (%)')

fig.tight_layout()
plt.show()

In [None]:
# card6: transaction counts (top) and fraud rate (bottom) per category.
# The per-group mean of isFraud is scaled by 100 so the bottom panel shows a
# percentage, as its title states. The titles name the column explicitly so
# this figure can be told apart from the card4 figure.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 9))
card6_colors = ['blue', 'red', 'green', 'orange']
by_card6 = data_train.groupby('card6')

by_card6['TransactionID'].count().sort_index().plot(
    kind='barh', title='Count of Observations by card6',
    color=card6_colors, ax=ax1)
ax1.set_xlabel('Number of transactions')

by_card6['isFraud'].mean().mul(100).sort_index().plot(
    kind='barh', title='Percentage of Fraud by card6',
    color=card6_colors, ax=ax2)
ax2.set_xlabel('Fraud rate (%)')

fig.tight_layout()
plt.show()

In [None]:
# Distinct values of the two categorical card columns
for card_column in ('card4', 'card6'):
    print(data_train[card_column].unique())

In [None]:
from scipy import stats

# Chi-square test of independence on the card4 x card6 contingency table
con_table = pd.crosstab(index=data_train['card4'],
                        columns=data_train['card6'],
                        margins=False)
stats.chi2_contingency(con_table)

In [None]:
# Histogram grid of the card columns
card_frame = data_train[card_cols]
card_frame.hist(figsize=(20, 10), alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Every training column whose name contains "addr"
addr_cols = data_train.filter(like='addr').columns.tolist()
data_train[addr_cols].head()

In [None]:
# Correlation heat map of the two address columns.
# plt.show() is called with parentheses; the bare name `plt.show` only
# evaluates the function object and never renders the figure explicitly.
cor = data_train[['addr1','addr2']].corr()
sns.heatmap(cor, cmap = 'RdBu_r',annot=True, center=0.0)
plt.title('Address')
plt.show()

In [None]:
# dtype of each address column
data_train[['addr1','addr2']].dtypes

In [None]:
# Percentage of missing values in each address column
addr_missing = data_train[['addr1', 'addr2']].isna().sum()
addr_missing.mul(100).div(len(data_train))

In [None]:
# Histogram grid of the address columns
addr_frame = data_train[addr_cols]
addr_frame.hist(figsize=(20, 10), alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Every training column whose name contains "dist"
dist_cols = data_train.filter(like='dist').columns.tolist()
data_train[dist_cols].head()

In [None]:
# Percentage of missing values in each distance column
dist_missing = data_train[['dist1', 'dist2']].isna().sum()
dist_missing.mul(100).div(len(data_train))

In [None]:
# Histogram grid of the distance columns
dist_frame = data_train[dist_cols]
dist_frame.hist(figsize=(20, 10), alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Training columns whose name begins with an upper-case "C"
c_cols = [name for name in data_train.columns if name.startswith('C')]
data_train[c_cols].head()

In [None]:
# Percentage of missing values in C1..C14
c_names = ['C{}'.format(i) for i in range(1, 15)]
c_missing = data_train[c_names].isna().sum()
100 * c_missing / len(data_train)

In [None]:
# Test-set columns whose name begins with an upper-case "C"
ct_cols = [name for name in data_test.columns if name.startswith('C')]

In [None]:
# Histogram grid of the C columns
c_frame = data_train[c_cols]
c_frame.hist(figsize=(20, 10), alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise plots of the C columns, coloured by the fraud label.
# A pair plot draws len(c_cols) ** 2 panels, so plotting every training row in
# each of them is very slow; a fixed-seed random sample keeps the cell fast
# and reproducible.
PAIRPLOT_MAX_ROWS = 10000
pairplot_sample = data_train[c_cols + ['isFraud']].sample(
    n=min(PAIRPLOT_MAX_ROWS, len(data_train)), random_state=42)
sns.pairplot(pairplot_sample,
             hue='isFraud',
             vars=c_cols)
plt.show()

In [None]:
# Group the C columns by how many missing values they contain.
# Only the C columns are scanned: building a NaN mask for the whole frame
# allocates a boolean copy of every column just to read a handful of them.
nan_counts = data_train[c_cols].isna().sum()
nan_groups = {}  # NaN count -> list of columns with exactly that many NaNs
for column, n_missing in nan_counts.items():
    nan_groups.setdefault(n_missing, []).append(column)

In [None]:
# Correlation heat map of C1..C14 on a 20% random sample of the training rows.
# - random_state makes the sample (and the figure) reproducible.
# - A single figure is created; the extra bare plt.figure() call left an empty
#   figure behind.
# - plt.show() is called with parentheses so the figure is actually shown.
col = ['C'+str(i) for i in range(1,15)]
train_c = data_train[col].sample(frac = 0.2, random_state=42)
fig, ax = plt.subplots(figsize=(10, 10))
cor = train_c[col].corr()
sns.heatmap(cor, annot=True, center=0.0, ax=ax)
ax.set_title('C1-C14')
plt.show()

In [None]:
# Names D1..D15 and a preview of those columns
Dcol = ['D{}'.format(index) for index in range(1, 16)]
data_train[Dcol].head()

In [None]:
# Percentage of missing values in D1..D15.
# range(1, 16) is required to include D15; range(1, 15) stops at D14.
d_names = ['D'+str(i) for i in range(1,16)]
100*data_train[d_names].isna().sum()/data_train.shape[0]

In [None]:
# Correlation heat map of the D columns
plt.figure(figsize=(10, 10))
d_corr = data_train[Dcol].corr()
sns.heatmap(d_corr, annot=True, center=0.0, cmap='RdBu_r')

In [None]:
# Names M1..M9 and a preview of those columns
mcol = ['M{}'.format(index) for index in range(1, 10)]
data_train[mcol].head()

In [None]:
# Percentage of missing values per M column, sorted ascending
m_missing_pct = 100 * data_train[mcol].isna().sum() / data_train.shape[0]
pd.DataFrame(m_missing_pct).sort_values(by=0)

In [None]:
# Per-column summary: distinct values, % missing, % of rows taken by the most
# frequent value (NaN counted as a value), and dtype.
n_rows = data_train.shape[0]
statistics_of_data = []
for col in data_train.columns:
    column_values = data_train[col]
    value_shares = column_values.value_counts(normalize=True, dropna=False)
    row = (
        col,
        column_values.nunique(),
        column_values.isna().sum() * 100 / n_rows,
        value_shares.iloc[0] * 100,
        column_values.dtype,
    )
    statistics_of_data.append(row)

summary_columns = ['Feature', 'Uniq_val', 'missing_val', 'val_biggest_cat', 'type']
stats_df = pd.DataFrame(statistics_of_data, columns=summary_columns)

In [None]:
# Columns ordered from most to least missing
stats_df.sort_values(by='missing_val', ascending=False)

In [None]:
# Columns ordered by how dominant their most frequent value is
stats_df.sort_values(by='val_biggest_cat', ascending=False)