In [1]:
## REQUIRED LIBRARIES
# For data wrangling 
import numpy as np
import pandas as pd

# For visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import tensorflow_data_validation as tfdv


## Grab data from source

In [9]:
df = pd.read_csv('../data/Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [10]:
df = df.drop(["RowNumber", "CustomerId", "Surname"], axis = 1)


In [11]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [13]:
df.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [20]:
def split_to_train_test(df, label_column, train_frac=0.8):
    train_df, test_df = pd.DataFrame(), pd.DataFrame()
    labels = df[label_column].unique()
    for lbl in labels:
        lbl_df = df[df[label_column] == lbl]
        lbl_train_df = lbl_df.sample(frac=train_frac)
        lbl_test_df = lbl_df.drop(lbl_train_df.index)
        print('\n%s:\n---------\ntotal:%d\ntrain_df:%d\ntest_df:%d' % (lbl, len(lbl_df), len(lbl_train_df), len(lbl_test_df)))
        train_df = train_df.append(lbl_train_df)
        test_df = test_df.append(lbl_test_df)

    return train_df, test_df

In [22]:
train, test = split_to_train_test(df,'Exited')


1:
---------
total:2037
train_df:1630
test_df:407

0:
---------
total:7963
train_df:6370
test_df:1593


In [32]:
train.to_csv('../data/train.csv')
test.to_csv('../data/test.csv')

In [48]:
df.to_csv('../data/churn_modelling.csv')

## Visualize statistics

In [33]:
TRAIN_DATA_FILE = '../data/train.csv'
EVAL_DATA_FILE = '../data/test.csv'
TARGET_FEATURE_NAME = 'Exited'
HEADERS = ['CreditScore','Geography','Gender','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary','Exited']

print(TRAIN_DATA_FILE)


../data/train.csv


In [34]:
train_stats = tfdv.generate_statistics_from_csv(
    data_location=TRAIN_DATA_FILE )

In [35]:
tfdv.visualize_statistics(train_stats)

In [36]:
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Geography',STRING,required,,'Geography'
'Gender',STRING,required,,'Gender'
'',INT,required,,-
'CreditScore',INT,required,,-
'Age',INT,required,,-
'Tenure',INT,required,,-
'Balance',FLOAT,required,,-
'NumOfProducts',INT,required,,-
'HasCrCard',INT,required,,-
'IsActiveMember',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Geography',"'France', 'Germany', 'Spain'"
'Gender',"'Female', 'Male'"


## Infer schema

In [37]:
#  All features are by default in both TRAINING and SERVING environments.
schema.default_environment.append('TRAINING')
schema.default_environment.append('EVALUATION')
schema.default_environment.append('SERVING')

# Specify that the class feature is not in SERVING environment.
tfdv.get_feature(schema, TARGET_FEATURE_NAME).not_in_environment.append('SERVING')

In [38]:
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Geography',STRING,required,,'Geography'
'Gender',STRING,required,,'Gender'
'',INT,required,,-
'CreditScore',INT,required,,-
'Age',INT,required,,-
'Tenure',INT,required,,-
'Balance',FLOAT,required,,-
'NumOfProducts',INT,required,,-
'HasCrCard',INT,required,,-
'IsActiveMember',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Geography',"'France', 'Germany', 'Spain'"
'Gender',"'Female', 'Male'"


In [40]:
tfdv.get_feature(schema, TARGET_FEATURE_NAME)

name: "Exited"
type: INT
bool_domain {
}
presence {
  min_fraction: 1.0
  min_count: 1
}
not_in_environment: "SERVING"
shape {
  dim {
    size: 1
  }
}

## Validate the evaluation data

In [39]:
eval_stats = tfdv.generate_statistics_from_csv(EVAL_DATA_FILE)

eval_anomalies = tfdv.validate_statistics(eval_stats, schema, environment='EVALUATION')
tfdv.display_anomalies(eval_anomalies)

## Freeze schema

In [44]:
RAW_SCHEMA_LOCATION = '../utils/schema.pbtxt'

In [45]:
from tensorflow.python.lib.io import file_io
from google.protobuf import text_format

tfdv.write_schema_text(schema, RAW_SCHEMA_LOCATION)