<a href="https://colab.research.google.com/github/ssawant/ml-jupyter-notebook/blob/master/credit_card_fraud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install kaggle
from google.colab import files
files.upload()

In [0]:
# Create ~/.kaggle (if needed) and copy the uploaded kaggle.json into it.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle

# This permissions change avoids a warning on Kaggle tool startup.
!chmod 600 ~/.kaggle/kaggle.json

In [0]:
# Download the files of the 'ieee-fraud-detection' competition into the working directory.
!kaggle competitions download -c ieee-fraud-detection

In [0]:
# Extract the two training archives; the next code cell reads the resulting CSV files.
!unzip train_identity.csv.zip
!unzip train_transaction.csv.zip

**Preprocessing Data**

In [0]:
import pandas as pd
# Read the identity and transaction tables (in that order) into DataFrames.
df_train_id, df_train_tran = (
    pd.read_csv(csv_path)
    for csv_path in ('train_identity.csv', 'train_transaction.csv')
)

Helper function to reduce dataframe memory

In [0]:
# credit to @guiferviz for the memory reduction 
def memory_usage_mb(df, *args, **kwargs):
    """Return the total memory footprint of ``df`` in MB (bytes / 1024**2).

    Any extra positional or keyword arguments are forwarded unchanged to
    ``df.memory_usage``.
    """
    bytes_per_mb = 1024**2
    total_bytes = df.memory_usage(*args, **kwargs).sum()
    return total_bytes / bytes_per_mb

def reduce_memory_usage(df, deep=True, verbose=True):
    """Downcast the wide numeric columns of ``df`` to lighter dtypes.

    Every column whose dtype is int16, int32, int64 or float64 is passed
    through ``pd.to_numeric(..., downcast=...)``; all other columns are left
    untouched. The columns are reassigned on ``df`` itself, so the caller's
    frame is modified.

    Args:
        df: DataFrame to shrink (modified in place).
        deep: Forwarded to ``memory_usage_mb`` when measuring memory.
        verbose: When true, print each dtype conversion and a before/after
            memory summary.

    Returns:
        The same ``df`` object that was passed in.
    """
    # All types that we want to change for "lighter" ones.
    # int8 and float16 are not included because we cannot reduce
    # those data types.
    # float32 is not included because float16 has too low precision.
    numeric2reduce = ["int16", "int32", "int64", "float64"]
    start_mem = memory_usage_mb(df, deep=deep) if verbose else 0

    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    for col, col_type in df.dtypes.items():
        if col_type not in numeric2reduce:
            continue
        downcast = "integer" if "int" in str(col_type) else "float"
        df[col] = pd.to_numeric(df[col], downcast=downcast)
        best_type = df[col].dtype.name
        # Log the conversion performed.
        if verbose and best_type != str(col_type):
            print(f"Column '{col}' converted from {col_type} to {best_type}")

    if verbose:
        end_mem = memory_usage_mb(df, deep=deep)
        diff_mem = start_mem - end_mem
        # Guard against a zero starting size so the summary cannot divide by zero.
        percent_mem = 100 * diff_mem / start_mem if start_mem else 0.0
        print(f"Memory usage decreased from"
              f" {start_mem:.2f}MB to {end_mem:.2f}MB"
              f" ({diff_mem:.2f}MB, {percent_mem:.2f}% reduction)")

    return df


In [0]:
# Left-join the identity table onto the transaction table by TransactionID,
# keeping every transaction row.
df_train = df_train_tran.merge(df_train_id, how='left', on='TransactionID')

In [0]:
# Report the size of the merged training frame.
n_rows, n_cols = df_train.shape[0], df_train.shape[1]
print(f'Train dataset has {n_rows} rows and {n_cols} columns.')

In [0]:
# The two source tables are now merged into df_train; drop their names so the memory can be reclaimed.
del df_train_tran, df_train_id

In [0]:
# df_train.head()
# Memory footprint of the merged frame, in MB.
memory_usage_mb(df_train)

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
# Number of transactions per value of the isFraud target.
count = df_train['isFraud'].value_counts()

plt.figure(figsize=(8,4))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=count.index, y=count.values, alpha=0.8)
plt.ylabel('no of transactions', fontsize=12)
plt.xlabel('is transaction fraud or not?', fontsize=12)
plt.show()

In [0]:
no_data_sample = len(df_train.index)

# Fraction of missing values in each column.
input_features_missing_proportions = df_train.isnull().sum() / no_data_sample

# Histogram over columns of their missing-value fraction.
plt.hist(input_features_missing_proportions)
plt.title('Proportion of Missing Values in Input Features')
plt.xlabel('proportion of missing values', fontsize=12)
plt.ylabel('no of columns', fontsize=12)
# Render explicitly so the cell does not echo a matplotlib Text repr.
plt.show()

Remove columns in which more than 60% of the values are NaN

In [0]:
# Keep only the columns whose share of missing values is at most 60%.
missing_share = df_train.isnull().mean()
df_train = df_train.loc[:, missing_share <= .6]

In [0]:
# Only a cell's last expression is echoed, so the shape is printed explicitly.
print(df_train.shape)
memory_usage_mb(df_train) # 937 MB reduction

In [0]:
# Number of transactions per value of the card4 column.
count = df_train['card4'].value_counts()

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=count.index, y=count.values, alpha=0.8)
plt.ylabel('no of transactions', fontsize=12)
plt.xlabel('type of cards', fontsize=12)
plt.show()

In [0]:
# Names of the columns whose dtype is neither 'int' nor 'float'.
df_train.select_dtypes(exclude=['int', 'float']).columns

In [0]:
# Names of the numeric columns, selected through the public select_dtypes API
# instead of the private DataFrame._get_numeric_data(). 'bool' is listed
# because the private helper also treats boolean columns as numeric.
numeric_col = df_train.select_dtypes(include=['number', 'bool']).columns
numeric_col

In [0]:
# Memory footprint of df_train, in MB.
memory_usage_mb(df_train)

In [0]:
# Replace the missing values of every numeric column with that column's median.
numeric_column = list(numeric_col)
for col_name in numeric_column:
  col_median = df_train[col_name].median()
  df_train[col_name] = df_train[col_name].fillna(col_median)

In [0]:
# Numeric part of the training frame (the columns listed in numeric_column).
data_num = df_train[numeric_column]
data_num.head()

In [0]:
# List of the column names whose dtype is neither 'int' nor 'float';
# later cells treat these columns as categorical.
num_cat = df_train.select_dtypes(exclude=['int', 'float']).columns.tolist()
num_cat

In [0]:
from sklearn.impute import SimpleImputer
# Fit an imputer with the 'most_frequent' strategy on the num_cat columns.
imp = SimpleImputer(strategy='most_frequent')
imp.fit(df_train[num_cat])

In [0]:
# Write the imputed values back into df_train, then take the num_cat slice.
df_train[num_cat] = imp.transform(df_train[num_cat])
data_cat = df_train[num_cat]

In [0]:
# Preview the first rows of the imputed num_cat columns.
data_cat.head()

In [0]:
# Count the distinct categories of each categorical column.
data_cat = data_cat.astype('category')
cat_level = data_cat.apply(lambda series: series.cat.categories.size)
cat_level

In [0]:
# Preview the first rows of df_train after both imputation steps.
df_train.head()

In [0]:
# Convert the num_cat columns into dummy/indicator columns.
data_cat = pd.get_dummies(data_cat[num_cat])

In [0]:
# Place the numeric columns and the dummy columns side by side (numeric columns first).
df = pd.concat([data_num, data_cat], axis=1)

In [0]:
# Drop the intermediate frames now that df holds the combined data.
del df_train
del data_num
del data_cat
# del cat_level
# Show the machine's memory usage after the cleanup.
!free -h

Training and validation data

In [0]:
# Downcast the numeric columns of df, printing each conversion and a memory summary.
df = reduce_memory_usage(df, deep=True, verbose=True)

In [0]:
from sklearn.model_selection import train_test_split

# Features are every column from position 3 onwards; the target is isFraud.
# NOTE(review): the positional slice assumes the first three columns are the
# identifier/target columns that must not be used as features - confirm
# against df.columns.
features = df.iloc[:, 3:]
target = df['isFraud']

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.20, random_state=42
)

In [0]:
# Preview the first rows of the training features.
X_train.head()

In [0]:
# from scipy.stats import uniform
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.linear_model import SGDClassifier
# import scipy
# tuned_parameters ={'penalty':['l1','l2','elasticnet'],'alpha':[500,100,50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001,0.0005,0.0001,0.00005,0.00001]}

# model = RandomizedSearchCV(SGDClassifier(),param_distributions=tuned_parameters,scoring='roc_auc',n_jobs = -1)
# model.fit(X_train, y_train)
# pred=model.predict(y_val)
# conf_mat = confusion_matrix(y_val, pred)
# sns.heatmap(conf_mat,annot=True,fmt="d",linewidths=.5)
# plt.show()

# print("The best parameters are %s with a score of %0.2f"% (model.best_params_, model.best_score_))
# sgd_churn_result = roc_auc_score(churn_test_y, pred)
# print("roc_auc for Churn is: %0.3f"%(roc_auc_score(y_val, pred)))
# print("Precision for Churn is: %0.3f"%(precision_score(y_val, pred)))
# print("Recall for Churn is: %0.3f"%(recall_score(y_val, pred)))
# print("F1-Score for Churn is: %0.3f"%(f1_score(y_val, pred)))

# Logistic Regression model
# from sklearn.linear_model import LogisticRegression
# lr = LogisticRegression()

# # Train the model using 'fit' method
# lr.fit(X_train, y_train)

# # Test the model using 'predict' method
# y_pred = lr.predict(X_val)

# # Print the classification report
# from sklearn.metrics import classification_report
# print(classification_report(y_val, y_pred))

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB 

# Fit a logistic regression classifier (at most 500 solver iterations).
LGR_Classifier = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Fit a random forest classifier with a fixed seed.
RDF_Classifier = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Fit a Bernoulli naive Bayes classifier.
BNB_Classifier = BernoulliNB().fit(X_train, y_train)

In [0]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import numpy as np

# Evaluate Models
# (display label, fitted estimator) pairs; the test cell below iterates `models` too.
modelist = [('RandomForest Classifier', RDF_Classifier), ('LogisticRegression', LGR_Classifier), ('Navi Baiya Classifier', BNB_Classifier)]

models = list(modelist)

print('================ Model Evaluation Results ===================' "\n")

for i, v in models:
  # 10-fold cross-validation on the training split.
  scores = cross_val_score(v, X_train, y_train, cv=10)
  # Predict once and reuse the result for all three metrics.
  train_pred = v.predict(X_train)
  accuracy = metrics.accuracy_score(y_train, train_pred)
  confusion_matrix = metrics.confusion_matrix(y_train, train_pred)
  classification = metrics.classification_report(y_train, train_pred)
  print(f'============{i}============')
  print()
  # Fixed-precision formatting avoids float artifacts such as 97.39999999999999.
  print(f'Cross Validation Mean Score: {100 * scores.mean():.1f}')
  print()
  print(f'Model Accuracy: {100 * accuracy:.1f}')
  print()
  print(f'Confusion Matrix: \n{confusion_matrix}')
  print()
  print(f'Classification Report: \n{classification}')

In [0]:
# Test Model
# Report accuracy, confusion matrix and classification report for every model
# on the held-out validation split, then draw one ROC curve per model.
classdict = {'normal':0, 'fraudulent':1}
class_names = list(classdict.keys())
class_labels = list(classdict.values())
print()
print('===================== Model Test Result ======================' "\n")

for i, v in models:
  # Predict once and reuse the result for every metric.
  val_pred = v.predict(X_val)
  accuracy = metrics.accuracy_score(y_val, val_pred)
  # Pin the label order so the rows/columns line up with class_names.
  confusion_matrix = metrics.confusion_matrix(y_val, val_pred, labels=class_labels)
  classification = metrics.classification_report(y_val, val_pred)
  print(f'======== {i} =========')
  print(f'Model Accuracy: {100 * accuracy:.1f}')
  print()
  print(f'Confusion Matrix: {confusion_matrix}')
  print()
  # Draw the confusion matrix with the plotting libraries the notebook already imports.
  fig, ax = plt.subplots(figsize=(5, 4))
  sns.heatmap(confusion_matrix, annot=True, fmt='d', cmap=plt.cm.summer,
              xticklabels=class_names, yticklabels=class_names, ax=ax)
  ax.set(title='Confusion Matrix Plot', xlabel='Predicted label', ylabel='True label')
  plt.show()
  print()
  print(f'Classification Report: \n{classification}')
  print()

print('============================= ROC Curve ===============================' "\n")
fig, ax = plt.subplots(figsize=(6, 5))
for i, v in models:
  # Column 1 of predict_proba is the score of the positive (label 1) class.
  fraud_score = v.predict_proba(X_val)[:, 1]
  fpr, tpr, thresholds = metrics.roc_curve(y_val, fraud_score)
  auc = metrics.roc_auc_score(y_val, fraud_score)
  ax.plot(fpr, tpr, label=f'{i} (AUC = {auc:.3f})')
# Diagonal reference line: a classifier that scores at random.
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
ax.set(title='ROC Curve', xlabel='False positive rate', ylabel='True positive rate')
ax.legend(loc='lower right')
plt.show()
