In [2]:
# Mount Google Drive inside the Colab VM so the project files become reachable.
from google.colab import drive
drive.mount('/content/drive')

# Make the project folder the working directory; later cells open files through
# relative paths such as ./data/... and ./ibm.png.
%cd /content/drive/MyDrive/Machine Learning/VIB

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Machine Learning/VIB


In [3]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import euclidean_distances

# Apply seaborn's default theme.
sns.set()
# Module-level scaler instance; segmentation() calls fit_transform on it.
standard_scaler = StandardScaler()

In [4]:
# Feature columns returned by read_data(). The order matters: the chart cells
# further down address these features by position (iloc[:, 0] .. iloc[:, 4]).
columns = ['AGE', 'TRANS_NO', 'TRANS_AMOUNT', 'RECENCY', 'AVG_CA_BALANCE', 'AVG_TD_BALANCE', 'AVG_LOAN_AMOUNT']

In [5]:
def _latest_monthly_record(csv_path, value_columns):
  """Return, per customer, the row of the most recent MONTH in a monthly CSV.

  Args:
    csv_path: Path of a CSV holding CUSTOMER_NUMBER, MONTH and the value columns.
    value_columns: List of column names to keep next to CUSTOMER_NUMBER.

  Returns:
    DataFrame with one row per customer and the columns
    ['CUSTOMER_NUMBER'] + value_columns.
  """
  monthly = pd.read_csv(csv_path)
  monthly['MONTH'] = pd.to_datetime(monthly['MONTH'])
  latest_labels = monthly.groupby('CUSTOMER_NUMBER')['MONTH'].idxmax()
  # idxmax returns index *labels*, so the rows must be selected with .loc;
  # positional .iloc is only equivalent while the index is a default RangeIndex.
  return monthly.loc[latest_labels, ['CUSTOMER_NUMBER'] + value_columns]


def read_data():
  """Build the per-customer feature table used for clustering.

  Reads the customer, MyVIB transaction, deposit and lending CSV files from
  ./data, reduces each of them to one row per CUSTOMER_NUMBER and left-joins
  everything onto the customer table.

  Returns:
    DataFrame restricted to the module-level ``columns`` list. Customers
    without a parsable birth date or without any transaction are dropped; all
    other missing values are replaced by 0. RECENCY is the *negated* number of
    days between the customer's last transaction and 2020-01-01.
  """
  dateNow = pd.Timestamp('2020-01-01')  # fixed reference date for AGE and RECENCY

  # Customers: age in calendar years at the reference date.
  df1 = pd.read_csv('./data/1.Data_Customer.csv').dropna(subset=['DATE_OF_BIRTH'])
  birth_date = pd.to_datetime(df1['DATE_OF_BIRTH'], errors='coerce')
  # Unparsable dates become NaT -> NaN age; those rows are dropped after the merge.
  df1_ = df1[['CUSTOMER_NUMBER']].assign(AGE=dateNow.year - birth_date.dt.year)

  # Transactions: last transaction date plus total count and amount per customer.
  df2 = pd.read_csv('./data/2.Data_MyVIB_Transaction.csv')
  df2['TRANS_DATE'] = pd.to_datetime(df2['TRANS_DATE'])
  df2_ = pd.pivot_table(data=df2,
                        index=['CUSTOMER_NUMBER'],
                        values=['TRANS_DATE', 'TRANS_NO', 'TRANS_AMOUNT'],
                        # String names instead of the builtins max/sum, which
                        # pandas deprecates as aggregation functions.
                        aggfunc={'TRANS_DATE': 'max', 'TRANS_NO': 'sum', 'TRANS_AMOUNT': 'sum'}).reset_index()

  # Deposits and lending: keep only the latest reporting month of each customer.
  df4_ = _latest_monthly_record('./data/4.Data_Deposit.csv', ['AVG_CA_BALANCE', 'AVG_TD_BALANCE'])
  df5_ = _latest_monthly_record('./data/5.Data_Lending.csv', ['AVG_LOAN_AMOUNT'])

  data = df1_
  for part in (df2_, df4_, df5_):
    data = pd.merge(data, part, on='CUSTOMER_NUMBER', how='left')

  data = data.dropna(subset=['AGE', 'TRANS_DATE']).fillna(0)
  # Stored negated: 0 means "transacted on the reference date", more negative means longer ago.
  data['RECENCY'] = -(dateNow - data['TRANS_DATE']).dt.days

  return data[columns]

In [6]:
def segmentation(df, k, random_state=None):
    """Drop IQR outliers, standardize the features and cluster them with K-Means.

    Args:
        df: DataFrame of numeric feature columns. The caller's frame is not
            modified.
        k: Number of clusters.
        random_state: Optional seed forwarded to KMeans. The default ``None``
            keeps scikit-learn's unseeded initialisation.

    Returns:
        Tuple ``(df, kmean_center, s, m)``:
        df is a filtered copy of the input with an extra 1-based 'label'
        column; kmean_center holds the cluster centers in standardized feature
        space; s and m are Series (indexed by feature name) with the scale and
        mean the scaler applied, so that ``(x - m) / s`` maps a raw feature
        vector into the same space as kmean_center.
    """
    for col in df.columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
        # Filters accumulate: each column's quartiles are computed on the rows
        # that survived the previous columns.
        df = df[(df[col] >= lower) & (df[col] <= upper)]
        print('{}: , Q1 - 1.5*IQR: {}, Q3 + 1.5*IQR: {}'.format(col, lower, upper))

    # Work on an explicit copy so adding 'label' below neither warns about
    # writing to a slice nor touches the caller's frame.
    df = df.copy()

    feature_norm = standard_scaler.fit_transform(df)
    kmeans = KMeans(n_clusters=k, random_state=random_state).fit(feature_norm)
    kmean_center = kmeans.cluster_centers_

    # Report the statistics the scaler actually used. DataFrame.std() defaults
    # to the sample estimate (ddof=1) and yields 0 for constant columns, whereas
    # StandardScaler divides by the population std and by 1 for constant columns.
    s = pd.Series(standard_scaler.scale_, index=df.columns)
    m = pd.Series(standard_scaler.mean_, index=df.columns)

    df['label'] = kmeans.labels_ + 1

    return df, kmean_center, s, m

# **Phân cụm**

In [7]:
# Load the CSV files and assemble the per-customer feature table.
data = read_data()

  if self.run_code(code, result):


In [8]:
# Number of clusters requested from K-Means.
k = 5
# Cluster the feature table; kmean_center, std and mean are used later by the
# Dash callback to place a newly entered customer into a cluster.
df, kmean_center, std, mean = segmentation(data[columns], k)

AGE: , Q1 - 1.5*IQR: 9.0, Q3 + 1.5*IQR: 49.0
TRANS_NO: , Q1 - 1.5*IQR: -39.5, Q3 + 1.5*IQR: 76.5
TRANS_AMOUNT: , Q1 - 1.5*IQR: -78012331.25, Q3 + 1.5*IQR: 139887218.75
RECENCY: , Q1 - 1.5*IQR: -79.0, Q3 + 1.5*IQR: 41.0
AVG_CA_BALANCE: , Q1 - 1.5*IQR: -4568313.5975, Q3 + 1.5*IQR: 8332166.6425
AVG_TD_BALANCE: , Q1 - 1.5*IQR: 0.0, Q3 + 1.5*IQR: 0.0
AVG_LOAN_AMOUNT: , Q1 - 1.5*IQR: 0.0, Q3 + 1.5*IQR: 0.0


In [9]:
# read_data() stores RECENCY as a negated day count; flip the sign back.
# NOTE(review): this cell is not idempotent - running it a second time makes
# the values negative again.
df['RECENCY'] = -df['RECENCY']
# String version of the cluster label; the scatter plot uses it as colour key.
df['label_str'] = df['label'].astype('str')

In [10]:
# Preview the first labelled rows.
df.head(5)

Unnamed: 0,AGE,TRANS_NO,TRANS_AMOUNT,RECENCY,AVG_CA_BALANCE,AVG_TD_BALANCE,AVG_LOAN_AMOUNT,label,label_str
2,24.0,50.0,86799392.0,1,3102913.48,0.0,0.0,1,1
8,19.0,19.0,2372000.0,2,1550695.97,0.0,0.0,2,2
17,24.0,41.0,32137106.0,2,373376.1,0.0,0.0,1,1
18,19.0,6.0,2000000.0,13,290183.74,0.0,0.0,2,2
26,24.0,18.0,56695000.0,46,113936.1,0.0,0.0,4,4


# **Visualize with Dash**

In [11]:
# Install the Dash packages imported below (quiet mode). jupyter-dash is pinned;
# dash-bootstrap-components is not.
!pip install -q jupyter-dash==0.3.0rc1 dash-bootstrap-components

[K     |████████████████████████████████| 45 kB 1.6 MB/s 
[K     |████████████████████████████████| 207 kB 9.7 MB/s 
[K     |████████████████████████████████| 7.3 MB 35.3 MB/s 
[K     |████████████████████████████████| 23.9 MB 14 kB/s 
[K     |████████████████████████████████| 357 kB 35.9 MB/s 
[?25h  Building wheel for dash-core-components (setup.py) ... [?25l[?25hdone
  Building wheel for dash-html-components (setup.py) ... [?25l[?25hdone
  Building wheel for dash-table (setup.py) ... [?25l[?25hdone


In [12]:
import dash
from dash import html
from dash import dcc
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output, State
from jupyter_dash import JupyterDash
import plotly.express as px
import plotly.graph_objects as go

In [13]:
# Inline CSS shared by every chart / control container: bottom spacing and a drop shadow.
style_chart = {
    'margin-bottom': '24px',
    'box-shadow': '0 4px 6px 0 rgba(0, 0, 0, 0.18)',
}

In [14]:
# Share of customers (in percent) that falls into each cluster.
num_cluster = df.label.nunique()
# First column of df, split by cluster label (labels run from 1 to num_cluster).
arr_cluster = [df[df.label == c + 1].iloc[:, 0] for c in range(num_cluster)]

l = len(df)
value = [len(d) / l * 100 for d in arr_cluster]

index = list(range(1, num_cluster + 1))
d = dict(value=value)
df_temp1 = pd.DataFrame(d, index=index).reset_index()

percentPeople = html.Div(
    children=dcc.Graph(
        id='sold-chart',
        figure=dict(
            data=[
                dict(
                    x=df_temp1['index'],
                    y=df_temp1['value'],
                    type='bar',
                    hovertemplate='%{y:.2f}<extra></extra>',
                ),
            ],
            layout=dict(
                title=dict(
                    text='Tỷ lệ số khách hàng trong mỗi cụm',
                    x=0.05,
                    xanchor='left',
                ),
                xaxis=dict(fixedrange=True),
                yaxis=dict(ticksuffix='%', fixedrange=True),
                colorway=['#707578'],
            ),
        ),
    ),
    className='card',
    style=style_chart,
)

In [15]:
# Mean age of the customers in each cluster.
# The per-cluster AGE series are rebuilt here rather than taken from whatever
# `arr_cluster` another cell last assigned, so the chart still shows ages when
# cells are re-run out of order.
num_cluster = df.label.nunique()
arr_cluster = [df.loc[df.label == c + 1, 'AGE'] for c in range(num_cluster)]

value = [d.mean() for d in arr_cluster]

index = list(range(1, num_cluster + 1))
d = {'value': value}
df_temp2 = pd.DataFrame(d, index=index).reset_index()

meanAge = html.Div(
    children=dcc.Graph(
        figure={
            'data': [
                {
                    'x': df_temp2['index'],
                    'y': df_temp2['value'],
                    'type': 'bar',
                    'hovertemplate': '%{y:.2f}<extra></extra>',
                }
            ],
            'layout': {
                'title': {
                    'text': 'Trung bình độ tuổi trong mỗi cụm',
                    'x': 0.05,
                    'xanchor': 'left',
                },
                'xaxis': {'fixedrange': True},
                'yaxis': {'fixedrange': True},
                'colorway': ['#707578'],
            },
        }
    ),
    style=style_chart,
)

In [16]:
# Mean number of transactions per cluster (TRANS_NO is column 1 of df).
num_cluster = df.label.nunique()
arr_cluster = [df[df.label == c + 1].iloc[:, 1] for c in range(num_cluster)]

value = [d.mean() for d in arr_cluster]

index = list(range(1, num_cluster + 1))
d = dict(value=value)
df_temp3 = pd.DataFrame(d, index=index).reset_index()

meanTRANS_No = html.Div(
    children=dcc.Graph(
        figure=dict(
            data=[
                dict(
                    x=df_temp3['index'],
                    y=df_temp3['value'],
                    type='bar',
                    hovertemplate='%{y:.2f}<extra></extra>',
                ),
            ],
            layout=dict(
                title=dict(
                    text='Trung bình số lượng giao dịch trong mỗi cụm',
                    x=0.05,
                    xanchor='left',
                ),
                xaxis=dict(fixedrange=True),
                yaxis=dict(fixedrange=True),
                colorway=['#707578'],
            ),
        ),
    ),
    style=style_chart,
)

In [17]:
# Mean transaction amount per cluster (TRANS_AMOUNT is column 2 of df).
num_cluster = df.label.nunique()
arr_cluster = [df[df.label == c + 1].iloc[:, 2] for c in range(num_cluster)]

value = [d.mean() for d in arr_cluster]

index = list(range(1, num_cluster + 1))
d = dict(value=value)
df_temp4 = pd.DataFrame(d, index=index).reset_index()

meanTRANS_Amount = html.Div(
    children=dcc.Graph(
        figure=dict(
            data=[
                dict(
                    x=df_temp4['index'],
                    y=df_temp4['value'],
                    type='bar',
                    hovertemplate='%{y:.2f}<extra></extra>',
                ),
            ],
            layout=dict(
                title=dict(
                    text='Trung bình số tiền giao dịch trong mỗi cụm',
                    x=0.05,
                    xanchor='left',
                ),
                xaxis=dict(fixedrange=True),
                yaxis=dict(fixedrange=True),
                colorway=['#707578'],
            ),
        ),
    ),
    style=style_chart,
)

In [18]:
# Mean recency of the last transaction per cluster (RECENCY is column 3 of df).
num_cluster = df.label.nunique()
arr_cluster = [df[df.label == c + 1].iloc[:, 3] for c in range(num_cluster)]

value = [d.mean() for d in arr_cluster]

index = list(range(1, num_cluster + 1))
d = dict(value=value)
df_temp5 = pd.DataFrame(d, index=index).reset_index()

meanRecency = html.Div(
    children=dcc.Graph(
        figure=dict(
            data=[
                dict(
                    x=df_temp5['index'],
                    y=df_temp5['value'],
                    type='bar',
                    hovertemplate='%{y:.2f}<extra></extra>',
                ),
            ],
            layout=dict(
                title=dict(
                    text='Trung bình khoảng thời gian giao dịch gần đây nhất',
                    x=0.05,
                    xanchor='left',
                ),
                xaxis=dict(fixedrange=True),
                yaxis=dict(fixedrange=True),
                colorway=['#707578'],
            ),
        ),
    ),
    style=style_chart,
)

In [19]:
# Account-balance charts (AVG_CA_BALANCE is column 4 of df).
num_cluster = df.label.nunique()
arr_cluster = [df[df.label == c + 1].iloc[:, 4] for c in range(num_cluster)]

# Chart 1: mean balance per cluster.
value = [d.mean() for d in arr_cluster]

index = list(range(1, num_cluster + 1))
d = dict(value=value)
df_temp6 = pd.DataFrame(d, index=index).reset_index()

meanBalance = html.Div(
    children=dcc.Graph(
        figure=dict(
            data=[
                dict(
                    x=df_temp6['index'],
                    y=df_temp6['value'],
                    type='bar',
                    hovertemplate='%{y:.2f}<extra></extra>',
                ),
            ],
            layout=dict(
                title=dict(
                    text='Trung bình số dư trong tài khoản',
                    x=0.05,
                    xanchor='left',
                ),
                xaxis=dict(fixedrange=True),
                yaxis=dict(fixedrange=True),
                colorway=['#707578'],
            ),
        ),
    ),
    style=style_chart,
)

# Chart 2: each cluster's share (in percent) of the total balance.
l = df.iloc[:, 4].sum()
value = [d.sum() / l * 100 for d in arr_cluster]

index = list(range(1, num_cluster + 1))
d = dict(value=value)
# df_temp6 is rebound here; the figure above already holds its own columns.
df_temp6 = pd.DataFrame(d, index=index).reset_index()

percentBalance = html.Div(
    children=dcc.Graph(
        figure=dict(
            data=[
                dict(
                    x=df_temp6['index'],
                    y=df_temp6['value'],
                    type='bar',
                    hovertemplate='%{y:.2f}<extra></extra>',
                ),
            ],
            layout=dict(
                title=dict(
                    text='Phần trăm tổng giá trị số dư trong tài khoản',
                    x=0.05,
                    xanchor='left',
                ),
                xaxis=dict(fixedrange=True),
                yaxis=dict(ticksuffix='%', fixedrange=True),
                colorway=['#707578'],
            ),
        ),
    ),
    style=style_chart,
)

In [20]:
# Inline CSS for the caption shown above each dropdown / numeric input.
menu_title = {
    'margin-bottom': '6px',
    'font-weight': 'bold',
    'color': '#079A82',
    # NOTE(review): '!important' inside an inline style value may be ignored by the browser - confirm.
    'width': '200px !important'
}

# Inline CSS for the white control bar: a flex row with evenly spaced children.
menu = {
    # 'height': '112px',
    # 'width': '912px',
    'display': 'flex',
    'justify-content': 'space-evenly',
    'padding-top': '24px',
    # 'margin': '-80px auto 0 auto',
    'background-color': '#FFFFFF',
    'box-shadow': '0 4px 6px 0 rgba(0, 0, 0, 0.18)',
    'padding-bottom': '20px'
}

# Width of the axis dropdowns.
dropdown = {
    'width': '200px'
}

# Width of the numeric inputs.
# NOTE(review): this name shadows the built-in input() for the rest of the notebook.
input = {
    'width': '150px'
}

In [21]:
# Scatter-plot card: two dropdowns pick the columns drawn on the X and Y axes.
scatter = html.Div(
    children=[
        html.Div(
            children=[
                html.Div(
                    children=[
                        html.Div(children=axis_caption, style=menu_title),
                        dcc.Dropdown(
                            id=dropdown_id,
                            options=[{"label": col, "value": col} for col in data.columns],
                            value=default_column,
                            clearable=False,
                            style=dropdown,
                        ),
                    ]
                )
                for axis_caption, dropdown_id, default_column in (
                    ("Trục X", "axis_X", "AGE"),
                    ("Trục Y", "axis_Y", "TRANS_NO"),
                )
            ],
            style=menu,
        ),
        # No figure is set here; the graph is addressed through its id 'scatter'.
        dcc.Graph(id='scatter'),
    ],
    style=style_chart,
)

In [22]:
# Card for assigning a new customer to a cluster: five numeric inputs and a
# heading (id 'result_cluster') that displays the outcome.
predict_cluster = html.Div(
    children=[
        html.Div(
            children=[
                html.Div(
                    children=[
                        html.Div(children=caption, style=menu_title),
                        dcc.Input(id=input_id, type="number", style=input),
                    ]
                )
                for caption, input_id in (
                    ("Age", "inp_Age"),
                    ("Transaction Number", "inp_TransNo"),
                    ("Transaction amount", "inp_TransAmount"),
                    ("Recency", "inp_Recency"),
                    ("Average balance", "inp_Balance"),
                )
            ],
            style=menu,
        ),
        html.H5(
            id='result_cluster',
            style={
                'text-align': 'center',
                'margin': '0 0 50px 0',
                'background-color': '#fff',
                'padding-top': '10px',
                'font-weight': 'bold',
            },
        ),
    ],
    style=style_chart,
)

In [23]:
# Disabled alternative: load the Lato web font from Google Fonts instead of Bootstrap.
# external_stylesheets =[ {
#          'href': "https://fonts.googleapis.com/css2?"
#                 "family=Lato:wght@400;700&display=swap",
#          'rel': 'stylesheet'
#      }
# ]

# Use the Bootstrap theme exposed by dash-bootstrap-components.
external_stylesheets = [dbc.themes.BOOTSTRAP]

# Dash application object; the layout and the callback are attached to it in later cells.
app = JupyterDash(__name__, external_stylesheets=external_stylesheets) 

In [24]:
@app.callback(
    [Output("scatter", "figure"), Output("result_cluster", "children")],
    [Input("axis_X", "value"),
     Input("axis_Y", "value"),
     Input('inp_Age', 'value'),
     Input('inp_TransNo', 'value'),
     Input('inp_TransAmount', 'value'),
     Input('inp_Recency', 'value'),
     Input('inp_Balance', 'value'),],
)
def update(axis_X, axis_Y, inp_Age, inp_TransNo, inp_TransAmount, inp_Recency, inp_Balance):
    """Redraw the scatter plot and assign the entered customer to a cluster.

    Args:
        axis_X, axis_Y: Column names of df plotted on the two axes.
        inp_Age, inp_TransNo, inp_TransAmount, inp_Recency, inp_Balance:
            Values of the numeric inputs; None while a field is empty.

    Returns:
        Tuple ``(figure, message)``. The message is an empty string until all
        five inputs are filled in; afterwards it names the 1-based cluster
        whose center is nearest to the entered customer.
    """
    fig_scatter = px.scatter(df, x=axis_X, y=axis_Y, color='label_str')

    entered = (inp_Age, inp_TransNo, inp_TransAmount, inp_Recency, inp_Balance)
    if any(field is None for field in entered):
        return fig_scatter, ''

    # The clustering was fitted while RECENCY still held the negated day count
    # produced by read_data(), so the entered (positive) recency is negated.
    origin_vector = np.array(
        [inp_Age, inp_TransNo, inp_TransAmount, -inp_Recency, inp_Balance],
        dtype=float,
    )
    n_entered = origin_vector.size

    # np.asarray + positional slice works for both Series and plain arrays.
    center_shift = np.asarray(mean, dtype=float)[:n_entered]
    center_scale = np.asarray(std, dtype=float)[:n_entered]
    norm_vector = (origin_vector - center_shift) / center_scale

    # Features without an input field are set to 0 in standardized space;
    # the padding length follows the width of the cluster centers.
    padding = np.zeros(kmean_center.shape[1] - n_entered)
    norm_vector = np.concatenate([norm_vector, padding])

    distance = euclidean_distances(kmean_center, [norm_vector]).flatten()
    cluster = np.argmin(distance) + 1
    result_cluster = 'Khách hàng thuộc cụm {}'.format(cluster)

    return fig_scatter, result_cluster

In [25]:
import base64
image_filename = './ibm.png'

def b64_image(image_filename):
    """Return the file's bytes as a base64 ``data:image/png`` URI string.

    Args:
        image_filename: Path of the image file to read in binary mode.

    Returns:
        The string 'data:image/png;base64,' followed by the base64 text of the
        file content. The PNG media type is fixed regardless of the file name.
    """
    uri_prefix = 'data:image/png;base64,'
    with open(image_filename, 'rb') as image_file:
        encoded = base64.b64encode(image_file.read())
    return uri_prefix + encoded.decode('utf-8')

In [26]:
# Page banner: embedded logo image, title and subtitle on a black background.
header = html.Div(
    [
        html.Img(
            src=b64_image(image_filename),
            style={
                'display': 'block',
                'margin-left': 'auto',
                'margin-right': 'auto',
                'width': '10%',
            },
        ),
        html.H1(
            'Mini hackathon',
            className='header-title',
            style={
                'color': '#FFF',
                'font-size': '48px',
                'font-weight': 'bold',
                'text-align': 'center',
            },
        ),
        html.H3(
            "Phân khúc khách hàng",
            className='header-description',
            style={
                'color': '#CFCFCF',
                'text-align': 'center',
                'margin': '4px auto',
                'max-width': '338px',
            },
        ),
    ],
    className='header',
    style={
        'background-color': '#000',
        'height': '270px',
        'padding': '16px 0',
    },
)

In [27]:
# Page layout: the banner on top, then every chart and control card stacked in
# a horizontally centred column that is at most 1024px wide.
app.layout = html.Div(
    [
        header,
        html.Div(
            [
                percentPeople,
                meanAge,
                meanTRANS_No,
                meanTRANS_Amount,
                meanRecency,
                meanBalance,
                percentBalance,
                scatter,
                predict_cluster,
            ],
            style={
                'margin-right': 'auto',
                'margin-left': 'auto',
                'max-width': '1024px',
                'padding-right': '10px',
                'padding-left': '10px',
                'margin-top': '32px',
            },
        ),
    ],
    style={
        'margin': '0',
        'font-family': 'sans-serif',
        'background-color': '#CFCFCf',
    },
)



In [28]:
# Start the Dash server in jupyter-dash's 'external' mode.
# NOTE(review): presumably this prints a link for opening the app in a separate browser tab - confirm against the jupyter-dash docs.
app.run_server(mode='external')

Dash app running on:


<IPython.core.display.Javascript object>