In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [None]:
# Colab-only: mount Google Drive so the files under /content/drive can be read below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the two raw training tables (identity, then transaction) from the mounted Drive folder.
Training_identity, Training_transaction = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/Colab Notebooks/Creditcard/train_identity.csv',
        '/content/drive/My Drive/Colab Notebooks/Creditcard/train_transaction.csv',
    )
)

In [None]:
# Read the two raw test tables (identity, then transaction) from the mounted Drive folder.
Test_identity, Test_transaction = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/Colab Notebooks/Creditcard/test_identity.csv',
        '/content/drive/My Drive/Colab Notebooks/Creditcard/test_transaction.csv',
    )
)

In [None]:
# Left join: keep every training transaction and attach its identity row where one exists.
training = Training_transaction.merge(Training_identity, how='left', on='TransactionID')
training.shape

In [None]:
# Left join: keep every test transaction and attach its identity row where one exists.
test = Test_transaction.merge(Test_identity, how='left', on='TransactionID')
test.shape

In [None]:
# Reduce the DataFrame's memory footprint by downcasting numeric columns to smaller dtypes.
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that holds them.

    The frame is modified in place and also returned, so
    ``reduce_mem_usage(df) is df``.

    * Integer columns become the first of int8 / int16 / int32 / int64 whose
      *inclusive* range covers the column's minimum and maximum.
    * Float columns become float16 only when every value survives the
      float16 round trip unchanged.  float16 carries an 11-bit significand,
      so an unconditional cast would silently turn e.g. 2049.0 into 2048.0.
      Otherwise they become float32 when the range allows (float32 still
      rounds float64 data to roughly 7 significant digits), else float64.
    * Columns whose dtype is not int16/int32/int64/float16/float32/float64
      (object columns, for instance) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final footprint in Mb and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        # min()/max() skip NaN; an empty or all-NaN column yields NaN, which
        # fails every range comparison below.
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(int_type)
                # Inclusive bounds: the limits themselves are representable.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            half = np.finfo(np.float16)
            single = np.finfo(np.float32)
            target = np.float64
            if c_min >= half.min and c_max <= half.max:
                # The range check comes first so the trial cast cannot overflow to inf.
                as_half = df[col].astype(np.float16)
                # Series.equals treats NaNs in matching positions as equal.
                if as_half.astype(col_type).equals(df[col]):
                    target = np.float16
                else:
                    target = np.float32
            elif c_min >= single.min and c_max <= single.max:
                target = np.float32
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# reduce_mem_usage reassigns columns on the frame it receives and returns that
# same frame, so `train` is a second name for `training`, not a copy of it.
train=reduce_mem_usage(training)
test=reduce_mem_usage(test)

In [None]:
# Drop the names of the four raw tables; later cells only use the merged frames,
# so the raw data can be garbage-collected.
del Training_identity,Training_transaction,Test_identity,Test_transaction

In [None]:
# Identify the categorical attributes: columns stored with the `object` dtype.
cat_cols = train.select_dtypes(include=['object']).columns.tolist()
cat_cols

In [None]:

# Percentage of missing values per training column, and the names of the
# columns where more than 50% of the rows are missing.
null_percent = train.isna().sum() / len(train) * 100
cols_to_drop = np.array(null_percent.loc[null_percent > 50].index)
cols_to_drop

In [None]:
# Remove the mostly-empty columns from both frames; the list was computed on train only.
# NOTE(review): DataFrame.drop raises KeyError for a label that is absent, so this
# assumes every dropped train column exists under the same name in test - confirm.
# Re-running this cell on its own fails for the same reason (columns already gone).
train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)
train.columns

In [None]:
# Impute missing values: fill every incomplete training column with its most frequent value (mode).
null_cols = train.columns[train.isna().any()].tolist()

for col_name in null_cols:
    column = train[col_name]
    print('data type of {} is {}'.format(col_name, str(column.dtype)))
    # mode() returns the most frequent values in sorted order; [0] takes the first of them.
    train[col_name] = column.replace(np.nan, column.mode()[0])
    print('Filled the null values of column {}'.format(col_name))
    print('--------------------------------------------')

In [None]:
# Impute missing values in the test frame: fill every incomplete column with
# that column's own most frequent value (mode).
null_cols = test.columns[test.isna().any()].tolist()

for i in null_cols:
    # Report the dtype of the column actually being filled, i.e. the one in `test`.
    # Looking it up in `train` would print the wrong dtype and raise KeyError for
    # any column that only exists in `test`.
    print('data type of {} is {} '.format(i, str(test[i].dtype)))
    # mode() returns the most frequent values in sorted order; [0] takes the first of them.
    test[i] = test[i].replace(np.nan, test[i].mode()[0])
    print('Filled null values of columns{}'.format(i))
    print('----------------------')

In [None]:
# Split the training frame into the feature matrix and the `isFraud` target.
x_train = train.drop(columns='isFraud')
y_train = train['isFraud']

In [None]:
# Partition the features by dtype: object columns are categorical, everything else numerical.
categorical = x_train.select_dtypes(include='object')
numerical = x_train.select_dtypes(exclude='object')

# Keep only the column names as arrays; later cells iterate over `cat_cols`.
cat_cols, num_cols = categorical.columns.values, numerical.columns.values

print('categorical columns:', cat_cols)
print('Numerical columns:', num_cols)

In [None]:
# Replace TransactionAmt by its natural logarithm in both frames.
# NOTE(review): assumes all amounts are strictly positive (log of 0 is -inf, of a
# negative value NaN) - confirm. The cell is not idempotent: re-running it takes
# the log a second time.
for frame in (x_train, test):
    frame['TransactionAmt'] = frame['TransactionAmt'].apply(np.log)

In [None]:
# Integer-encode every categorical column. Each encoder is fitted on the train
# and test values together so both frames share one label-to-integer mapping.
for col_name in tqdm(cat_cols):
    train_values = list(x_train[col_name].values)
    test_values = list(test[col_name].values)
    encoder = LabelEncoder().fit(train_values + test_values)
    x_train[col_name] = encoder.transform(train_values)
    test[col_name] = encoder.transform(test_values)
  
  
  

In [None]:
# Count plot for every categorical feature except `P_emaildomain`.
# The grid is sized from the number of columns to draw: a fixed 3x3 layout
# raises ValueError as soon as a tenth subplot is requested.
plot_cols = [name for name in cat_cols if name != 'P_emaildomain']
n_grid_cols = 3
n_grid_rows = max(1, -(-len(plot_cols) // n_grid_cols))  # ceiling division, at least one row

# 5 inches of height per row reproduces the original 20x15 figure for three rows.
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(20, 5 * n_grid_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, name in zip(flat_axes, plot_cols):
    sns.countplot(x=x_train[name], palette='winter_r', ax=ax)

# Hide the grid cells that received no plot.
for ax in flat_axes[len(plot_cols):]:
    ax.set_visible(False)

plt.show()

In [None]:
# Class balance of the target: how many rows fall in each `isFraud` class.
ax = sns.countplot(x=y_train, palette='gist_rainbow')
ax.set_title('Fraud or Not')
plt.show()

In [None]:
# TransactionDT is a date/time column (per the original note); leave it out of the model inputs.
x_train_final = x_train.drop(columns='TransactionDT')
test_final = test.drop(columns='TransactionDT')

In [None]:
# Fit a class-weighted logistic regression on the training features and save it to disk.
import pickle

from sklearn import linear_model

filename = 'model.pkl'
# class_weight='balanced' reweights the classes inversely to their frequency;
# C=0.09 applies stronger regularization than the library default of 1.0.
logistic_model = linear_model.LogisticRegression(C=0.09, solver='lbfgs', class_weight='balanced')
logistic_model.fit(x_train_final, y_train)

# The context manager flushes and closes the file even if pickling fails;
# a bare open() passed straight to pickle.dump leaves the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(logistic_model, model_file)

In [None]:
# Predict a hard class label for every test row.
# NOTE(review): if the submission is scored on ranking quality (e.g. ROC AUC),
# predict_proba(test_final)[:, 1] may be what is wanted instead - confirm.
logistic_predictions = logistic_model.predict(test_final)


In [None]:
# Mean accuracy on the same rows the model was fitted on, so this is an
# optimistic figure rather than a held-out estimate.
score = logistic_model.score(x_train_final, y_train)
print(score)

In [None]:
# Build the submission frame from the sample file and overwrite its isFraud column.
# NOTE(review): predictions are assigned by position; this assumes the sample
# submission lists the rows in the same order as `test` - confirm.
sub = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Creditcard/sample_submission.csv')
sub['isFraud'] = logistic_predictions
sub.head()

In [None]:
# Write the submission to the working directory, without the DataFrame index column.
sub.to_csv('submission.csv', index=False)