In [1]:
import json
import datetime
import pprint
import numpy as np
import pandas as pd
import scipy

In [2]:
%%time
# Parse the dump: every line of ./dataForGraph.txt holds one JSON-encoded record.
with open('./dataForGraph.txt', 'r') as dump:
    parsed_records = [json.loads(raw_line) for raw_line in dump]
# Order the records by their apply_time string; the sample output below shows the
# 'YYYY-MM-DD HH:MM:SS' layout, for which text order equals chronological order.
graph_data = sorted(
    parsed_records,
    key=lambda record: record.get('apply_info').get('apply_time'),
)

CPU times: user 1min 19s, sys: 23.8 s, total: 1min 43s
Wall time: 1min 48s


In [3]:
# Peek at the first record of graph_data (the earliest apply_time after the sort above).
graph_data[0]

{'apply_info': {'apply_time': '2017-10-28 23:58:16',
  'apply_device_id': '2698c32936cb43f53328d136976c1f10',
  'apply_user_id': '986a42c99fd140d980aa4e1e285ee4c4',
  'is_reject': 1,
  'num_of_applications': 2,
  'is_new_client': 1,
  'is_overdue': 0,
  'overdue_day': 0},
 'device_info': {'device_id': ['2698c32936cb43f53328d136976c1f10'],
  'imsi': ['460079293188619'],
  'phone': ['b3bc0e2f26fda4cdcad142f6ea3fe91b'],
  'mac': ['60f2a725dfb7af36a8a3d8faff9c6440'],
  'idfv': [],
  'idfa': []},
 'contacts_info': {'00fe77d9bded75a1211eb5925babad79': ['1504560963118',
   '1509157454241'],
  '03cbc814338df0af9d312b4eff3cba83': ['1478278525035', '1478278525035'],
  '0505a389c7db29da33faf485f04ff55b': ['1509089534598', '1509089534598'],
  '7451c31c54349dbefc730299955e7022': ['1509089534598', '1509089534598'],
  '06c31f94dd356901b31c6b6860cd197b': ['1504560962767', '1509157454241'],
  '0713a0233c612ebd7a4d0ce33fb9f0a1': ['1504560962886', '1509157454571'],
  '0775e689b3118607efefdfd4abd51e39': [

In [4]:
# Inspect the 'device_info' section of the second record.
graph_data[1]['device_info']

{'device_id': ['050f5f76e40c290eb013975b628f2b16',
  '6f4112922cdfa0bbfa3a7735fc5536d9',
  '570a887995ba62552a891cd6fec322e1'],
 'imsi': ['460000142543345'],
 'phone': [],
 'mac': ['0f607264fc6318a92b9e13c65db7cd3c'],
 'idfv': [],
 'idfa': []}

In [5]:
# Inspect the 'apply_info' section of the fourth record.
graph_data[3]['apply_info']

{'apply_time': '2017-10-29 00:00:21',
 'apply_device_id': '233f8ffb310e833be955ee30f4a8ee29',
 'apply_user_id': '9a91c2a58c794079b7a2f58df1ee3d70',
 'is_reject': 1,
 'num_of_applications': 1,
 'is_new_client': 1,
 'is_overdue': 0,
 'overdue_day': 0}

In [14]:
# Load a precomputed array from adjacent_matrix1.npy.
# NOTE(review): presumably the adjacency matrix of the application graph, with rows/columns
# in the same order as graph_data — its construction is not shown here; confirm.
am = np.load('adjacent_matrix1.npy')

In [15]:
# Second and third matrix powers of am (matrix product, not element-wise).
# NOTE(review): both are dense products of the full matrix; assumes am is a square
# 2-D ndarray that fits in memory three times over — confirm.
am_2 = am @ am
am_3 = am_2 @ am

In [18]:
# Build the feature matrix in one go: graph-derived columns first, then fields read
# straight from each application record. Column order matches the assignments it replaces.
am_3_diag = np.diag(am_3)
apply_infos = [record['apply_info'] for record in graph_data]
device_infos = [record['device_info'] for record in graph_data]

features = pd.DataFrame({
    # --- features from graph (i.e. adjacency matrix) ---
    # Column-wise mean of am (a mean, not a sum, despite the column name).
    'node_degree': list(am.mean(axis=0)),
    # Three times each diagonal entry of am^3, divided by the trace of am^3.
    # Original note: fraction of all possible triangles.
    'transitivity': list(3 * am_3_diag / np.trace(am_3)),
    # Diagonal of am^3 over (column sums of am^2 minus the diagonal of am^2);
    # nan_to_num replaces the NaN produced by 0/0 with 0.
    # Original note: by definition (Networks: An Introduction, M.E.J Newman);
    # fraction of friends who are friends.
    'cluster_coef': list(np.nan_to_num(am_3_diag / (am_2.sum(axis=0) - np.diag(am_2)))),
    # --- basic features extracted from data ---
    # NOTE(review): is_reject and overdue_day look like outcome fields; if is_overdue is the
    # training label they may leak it — confirm before modelling.
    'is_reject': [info['is_reject'] for info in apply_infos],
    'num_of_applications': [info['num_of_applications'] for info in apply_infos],
    'is_new_client': [info['is_new_client'] for info in apply_infos],
    'overdue_day': [info['overdue_day'] for info in apply_infos],
    'num_of_devices': [len(info['device_id']) for info in device_infos],
    'num_of_phones': [len(info['phone']) for info in device_infos],
})
def avg_call_duration(a, call_type, days):
    """
    Average duration of the calls of one type that fall inside the look-back window.

    Args:
        a: dict, application record in graph_data. Reads a['apply_info']['apply_time']
           (formatted '%Y-%m-%d %H:%M:%S') and iterates a['calls_info'], whose items
           are indexed by 'duration', 'type' and 'date'.
        call_type: str, '1' for call in and '2' for call out
        days: int, how many days call to compute
    Return: average call duration, or 0.0 when no call matches
    """
    # Loop-invariant: parse the application time once instead of once per call record.
    apply_time = datetime.datetime.strptime(a['apply_info']['apply_time'], '%Y-%m-%d %H:%M:%S')

    # c['date'] is read as epoch milliseconds; fromtimestamp() converts it to a naive
    # datetime in the machine's local timezone.
    # NOTE(review): timedelta.days is floored, so a call up to just under (days + 1) days
    # old still passes, and calls dated after apply_time (negative delta) pass as well —
    # confirm this window is intended.
    call_logs = [int(c['duration']) for c in a['calls_info']
                 if c['type'] == call_type and
                 (apply_time - datetime.datetime.fromtimestamp(int(c['date'])/1000)).days <= days]

    # Explicit empty check replaces the bare `except:` around the division.
    if not call_logs:
        return 0.0
    return sum(call_logs) / len(call_logs)
def call_counts(a, days):
    """
    Number of calls (any type) that fall inside the look-back window.

    Args:
        a: dict, application record in graph_data. Reads a['apply_info']['apply_time']
           (formatted '%Y-%m-%d %H:%M:%S') and iterates a['calls_info'], reading only
           each item's 'date'.
        days: int, how many days call to compute
    Return: call counts
    """
    # Loop-invariant: parse the application time once instead of once per call record.
    apply_time = datetime.datetime.strptime(a['apply_info']['apply_time'], '%Y-%m-%d %H:%M:%S')

    # Count lazily; the count depends only on the call date, so no throwaway list of
    # durations is built and a call without a parsable duration is still counted.
    # c['date'] is read as epoch milliseconds and converted to local time.
    # NOTE(review): timedelta.days is floored, so a call up to just under (days + 1) days
    # old still passes, and calls dated after apply_time pass as well — confirm.
    return sum(1 for c in a['calls_info']
               if (apply_time - datetime.datetime.fromtimestamp(int(c['date'])/1000)).days <= days)

# Call-log features: one column per (look-back window, call direction) pair, followed by
# the per-window call counts. The loops emit the columns in the order
# 7d in/out, 14d in/out, 1m in/out, then 7d/14d/1m counts.
lookback_windows = (('7d', 7), ('14d', 14), ('1m', 30))
call_directions = (('in', '1'), ('out', '2'))

for window_label, window_days in lookback_windows:
    for direction_label, type_code in call_directions:
        column = 'last_{}_avg_call_{}_duration'.format(window_label, direction_label)
        features[column] = [
            avg_call_duration(record, call_type=type_code, days=window_days)
            for record in graph_data
        ]

for window_label, window_days in lookback_windows:
    column = 'last_{}_call_counts'.format(window_label)
    features[column] = [call_counts(record, days=window_days) for record in graph_data]

# is_overdue goes last in the frame.
features['is_overdue'] = [record['apply_info']['is_overdue'] for record in graph_data]

  


In [19]:
# Sanity check: (number of rows, number of columns) of the feature frame.
features.shape

(35373, 19)

In [20]:
# Preview the first ten rows of the feature frame.
features.head(10)

Unnamed: 0,node_degree,transitivity,cluster_coef,is_reject,num_of_applications,is_new_client,overdue_day,num_of_devices,num_of_phones,last_7d_avg_call_in_duration,last_7d_avg_call_out_duration,last_14d_avg_call_in_duration,last_14d_avg_call_out_duration,last_1m_avg_call_in_duration,last_1m_avg_call_out_duration,last_7d_call_counts,last_14d_call_counts,last_1m_call_counts,is_overdue
0,0.0,0.0,0.0,1,2,1,0,1,1,41.765957,15.941176,50.517647,17.506494,49.715596,22.173913,88,178,246,0
1,0.106578,7.039628e-06,0.170548,1,6,1,0,3,0,47.0,9.5625,46.333333,13.675,153.306452,21.186335,27,60,263,0
2,0.322647,6.528597e-05,0.521737,1,8,1,0,1,0,58.34375,29.096386,54.592593,59.220859,58.074561,73.299099,178,368,1104,0
3,0.341306,7.199947e-05,0.555817,1,1,1,0,1,0,30.380952,24.868687,113.927273,26.720554,96.161458,22.555648,127,545,1579,0
4,0.516298,0.000147955,0.782738,1,2,1,0,4,0,91.178862,61.75,100.66,119.666667,82.756579,122.429268,215,289,553,0
5,0.051197,1.919958e-06,0.089571,1,10,1,0,1,0,40.5,7.0,40.5,30.153846,43.2,35.416667,7,16,54,0
6,0.532044,0.0001564713,0.808576,1,2,1,0,3,1,70.888889,193.0,93.055556,151.666667,101.242424,93.315789,11,22,54,0
7,0.028242,5.742416e-07,0.053124,1,1,1,0,1,1,46.444444,28.216667,59.939024,40.626168,58.434426,41.061433,112,336,904,0
8,0.009329,5.943324e-08,0.015614,1,3,1,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
9,0.461623,0.0001260879,0.722677,0,2,0,0,2,1,42.1,26.6,46.146067,34.769912,98.587302,61.0,109,228,455,0


In [21]:
# Persist the feature values with np.save, which appends '.npy' to the given name.
# NOTE(review): np.save stores the frame as a plain array, so the column labels are not
# written to the file; whoever loads it must know the column order used above.
np.save('feature_matrix1', features)