In [1]:
import pandas as pd
import time
import math
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

date_range = ("2016-01-01T00:00:00", "2016-03-31T23:59:59")
time_start = int(time.mktime(time.strptime(date_range[0], '%Y-%m-%dT%H:%M:%S')))
time_end = int(time.mktime(time.strptime(date_range[1], '%Y-%m-%dT%H:%M:%S')))

# Get logs within the time range
t = pd.read_csv("C:/Users/Administrator/timeline/timeline_event_gpapp.txt", sep='\t', header = None)
t1 = t[t[2] >= time_start]
data_all = t1[t1[2] <= time_end]


In [10]:

# Get all unique event ids
unique_event_ids = data_all[1].unique()

grouped_by_uid = data_all.groupby([0])
uids = grouped_by_uid.groups.keys()

# def foo()
# 7 days as a period
period = 6047800.0
n_period = int(math.ceil((time_end - time_start)/(period)))


In [3]:

# binning whole logs in periods as a big dict()
whole_dict = {i : {} for i in range(n_period)}
# for each log
for log in data_all.iterrows():
    # find its binned period
    idx = math.floor((log[1][2] - time_start)/period)

    if log[1][0] not in whole_dict[idx]:
        whole_dict[idx][log[1][0]] = []

    # append event id and timestamp to the list
    whole_dict[idx][log[1][0]].append((log[1][1], log[1][2]))


In [149]:
# creating labels for churn = 1, stay = 0
labels = {i : {} for i in range(n_period)}
#for each period
for i in range(n_period - 1):
    # for each uid
    for uid in whole_dict[i]:
        # check if the uid appears in the next period, if yes-> stay; no-> churn
        if uid in whole_dict[i + 1]:
            labels[i][uid] = 0
        else:
            labels[i][uid] = 1

In [19]:
# simple features?
grouped_by_eventid = data_all.groupby([1])
eventids = grouped_by_eventid.groups.keys()

#features = {i : {} for i in range(n_period)}
features = []
num_handcraft_feature = 4
for i in range(n_period - 1):
    for uid in whole_dict[i]:
        row = np.zeros(len(eventids) + num_handcraft_feature)

        times = []
        for each in whole_dict[i][uid]:
            eid = each[0]
            ts = each[1]
            idx = eventids.index(eid)
            row[idx] += 1
            times.append(ts)

        # add handcraft features here
        row[len(eventids) + 0] = len(whole_dict[i][uid])
        row[len(eventids) + 1] = max(times) - min(times)
        row[len(eventids) + 2] = (time_start + period*(i+1)) - max(times)
        
        # label
        row[len(eventids) + 3] = labels[i][uid]

        features.append(row)


In [21]:
features_df.shape[1]

203

In [25]:
features_df = pd.DataFrame(features)
features_df.to_csv('features_simple.csv')
data = features_df[range(features_df.shape[1] - 2)]
target = features_df[features_df.shape[1]-1]

In [120]:

X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, target, test_size=0.4, random_state=0)


In [121]:
clf = GradientBoostingClassifier(learning_rate = 0.4, loss='exponential', n_estimators = 300, max_features=50).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [122]:
print "Below are the accuracy, AUC, precision, and recall: "
print clf.score(X_test, y_test)
print roc_auc_score(y_test, y_pred)
print precision_score(y_test, y_pred)
print recall_score(y_test, y_pred)

Below are the accuracy, AUC, precision, and recall: 
0.886740890688
0.743590937017
0.90421529484
0.964693835757


In [90]:
print "Below are the accuracy, AUC, precision, and recall: "
print clf.score(X_test, y_test)
print roc_auc_score(y_test, y_pred)
print precision_score(y_test, y_pred)
print recall_score(y_test, y_pred)

Below are the accuracy, AUC, precision, and recall: 
0.88677462888
0.744473005501
0.903664438811
0.96520882908


In [83]:
from sklearn.metrics import confusion_matrix

In [88]:
cm = confusion_matrix(y_test, y_pred)

In [142]:
cm[0][1]

1254

In [95]:
11763./(11763+424) #false negative

0.9652088290801674

In [94]:
11763./(11763+1254) #false positive

0.9036644388107858

In [99]:
len(y_test) - sum(y_test) # total stays

2633.0

In [125]:
sw = np.array([3 if i == 0 else 1 for i in y_train])

In [126]:
clf = GradientBoostingClassifier(learning_rate = 0.4, loss='exponential', n_estimators = 300, max_features=50).fit(X_train, y_train, sample_weight = sw)
y_pred = clf.predict(X_test)

In [140]:
print "Below are the accuracy, AUC, F1 score, precision, and recall: "
print clf.score(X_test, y_test)
print roc_auc_score(y_test, y_pred)
print f1_score(y_test, y_pred)
print precision_score(y_test, y_pred)
print recall_score(y_test, y_pred)

Below are the accuracy, AUC, F1 score, precision, and recall: 
0.864777327935
0.791333196805
0.916825765751
0.929205401085
0.904771656768


In [128]:
confusion_matrix(y_test, y_pred)

array([[ 3542,  1683],
       [ 2325, 22090]])

In [131]:
(len(target) - sum(target))/len(target)

0.17641263714759983

In [133]:
np.histogram(data_all[2], bins = 13)

(array([   3965,    3567,    1602,   37406,  908535,  815228,  441544,
         604743,  560399,  819281,  968802, 1046555,  963279], dtype=int64),
 array([  1.45160384e+09,   1.45220662e+09,   1.45280940e+09,
          1.45341218e+09,   1.45401496e+09,   1.45461774e+09,
          1.45522052e+09,   1.45582330e+09,   1.45642608e+09,
          1.45702886e+09,   1.45763164e+09,   1.45823442e+09,
          1.45883720e+09,   1.45943998e+09]))

In [134]:
import matplotlib.pyplot as plt

In [137]:
plt.hist(data_all[2], bins='auto')
plt.show()

In [143]:
print "Confusion Matrix: "
cm = confusion_matrix(y_test, y_pred)
print cm
#false positive
print 'false positive' + str(cm[0][1])
#false negative
print 'false negative' + str(cm[1][0])

Confusion Matrix: 
[[ 3542  1683]
 [ 2325 22090]]
false positive1683
false negative2325


In [145]:
count = 0
for each in whole_dict:
    count += len(whole_dict[each])

In [146]:
count

123406

In [147]:
len(target)

74099

In [148]:
from sklearn import svm

In [None]:
clf = svm.SVC().fit(X_train, y_train)

In [None]:
print "Below are the accuracy, AUC, F1 score, precision, and recall: "
print clf.score(X_test, y_test)
print roc_auc_score(y_test, y_pred)
print f1_score(y_test, y_pred)
print precision_score(y_test, y_pred)
print recall_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [7]:
u0 = pd.read_csv("C:/Users/Administrator/timeline/user_base_0.txt", sep='\t', header = None)
u1 = pd.read_csv("C:/Users/Administrator/timeline/user_base_1.txt", sep='\t', header = None)
u2 = pd.read_csv("C:/Users/Administrator/timeline/user_base_2.txt", sep='\t', header = None)
u3 = pd.read_csv("C:/Users/Administrator/timeline/user_base_3.txt", sep='\t', header = None)
u4 = pd.read_csv("C:/Users/Administrator/timeline/user_base_4.txt", sep='\t', header = None)
u5 = pd.read_csv("C:/Users/Administrator/timeline/user_base_5.txt", sep='\t', header = None)
u6 = pd.read_csv("C:/Users/Administrator/timeline/user_base_6.txt", sep='\t', header = None)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
u = u0.append(u1).append(u2).append(u3).append(u4).append(u5).append(u6)

In [33]:
u[2].value_counts().head()

113.25.103.146    54690
113.25.106.228    20328
113.25.92.34      17634
27.44.168.235     15458
222.161.52.56     12996
Name: 2, dtype: int64

In [34]:
u[3].value_counts().head()

警察蜀黍就是这个人，不写签名    182
啦啦啦                 9
警察蜀黍就是这个人           9
哈哈                  7
。。。                 7
Name: 3, dtype: int64

In [35]:
u[4].value_counts().head()

None     1271728
0             15
慕凝             2
果果果果说          2
圣域天韵           2
Name: 4, dtype: int64

In [36]:
u[5].value_counts().head()

None    3439535
1         13153
0         12613
2          1997
100           8
Name: 5, dtype: int64

In [37]:
u[6].value_counts().head()

102    1643794
101    1348213
102     235911
101     192651
0        40854
Name: 6, dtype: int64

In [38]:
u[7].value_counts().head()

0       3032837
0        434436
1031         21
[]            7
1029          3
Name: 7, dtype: int64

In [39]:
u[8].value_counts().head()

100     1443495
1208     613223
100      206649
9831     101134
1208      87358
Name: 8, dtype: int64

In [40]:
u[9].value_counts().head()

2016-02-17 13:57:40    44
2016-03-08 15:17:16    42
2016-02-16 19:05:50    39
2016-03-15 22:50:20    39
2016-02-17 19:47:48    39
Name: 9, dtype: int64

In [41]:
u[10].value_counts().head()

[]                               3425729
[39]                                6766
[67]                                5595
[39,37,97,67,71,360,45,46,48]       4000
[362]                               3082
Name: 10, dtype: int64

In [42]:
u[11].value_counts().head()

None         752205
1                16
vbxrr5623         1
gp7741458         1
gp7741454         1
Name: 11, dtype: int64

In [44]:
u[0].value_counts().head()

d41d8cd98f00b204e9800998ecf8427e    11
974b90be681b2099ddc58da8adb6ef0c     1
e13efb9406c09938fe971e40530304e8     1
702f0559cda8ebea8191fd41b929d2c3     1
295dcd4b05cc490af011fc6a1d7698c6     1
Name: 0, dtype: int64

In [47]:
u[1].value_counts()

cfcd208495d565ef66e7dff9f98764da    1438817
6835f917ada02b5305d9ccd2c9e9fe00     150390
803f7f6112181135e317943563b3c3c5      28887
1465983e74faa0d3ea6cf690b4a30c40      24034
4ab5fde56b6c2cb8172be0d7b05b3a82      20979
e8fe123ecc37fde29d9a54839a19b0da      20628
4576f2bbb65f42a8baa0f42ca5ba71e1      20449
706514537266f50272ead427180253b0      18752
534a55354fed765d42201874b1c00517      17591
b7de342bffc02ce7b33ac67f24ecf19b      16788
3105781c9fce0609cad84a230fdbd561      15930
e0eaeb139a39c33bbe6cfd1a8ca81ba0      15557
ab4c45d070973301b12fa08e95124fe9      15502
a2cf41762984bfcf97dcf625b935e11b      15051
d80e193616499506d679b9997c88da1f      14570
acb872bd6a5ff1befb015b35b31a11c6      14333
7ea26a5c13b43d188088a79fe5f7a424       8527
1505ea6d6c259b52d0f740af862bc138       8142
3d9864d5194ddf1e061f3fcf1214be1f       7388
db40bc44810ec260943e0072ebc5b201       7328
aed4ff5c3d890c1e342c76aff9410afb       7109
f624203ae24080ccd8a76e28fbd2ec67       6751
e71889e112b6964e615b81e8a787aa00

In [56]:
del whole_dict

In [50]:
uid_df = pd.DataFrame(uids)

In [51]:
len(uids)

110334

In [12]:
u_by_uuid = u[u[1].isin(set(uids))]

In [13]:
u_by_uuid.shape

(1554157, 19)

In [83]:
u_by_uuid.to_csv('userbase_event.txt')

In [90]:
temp_uuid = u_by_uuid[1].unique()

In [91]:
temp_uuid

array(['cfcd208495d565ef66e7dff9f98764da',
       '70ed8c56f32faf570516b01d20260236',
       '95d67916fa1592ad9d0980666db24289', ...,
       '7da4e68a9f9013406a3e5e16227db155',
       'e25ec2ae7ebe81b4d4672bf959081178',
       'e85358f87975e58526ea52db9446b16e'], dtype=object)

In [94]:
u_by_uuid2 = u_by_uuid[u_by_uuid[1]!= 'cfcd208495d565ef66e7dff9f98764da']

In [100]:
u_by_uuid2.to_csv('userbase_event.txt')

In [99]:
len(uids)

110334

In [132]:
u_by_uuid2[11].value_counts().head()

None         14810
b225588          1
gp7646124        1
gp5682586        1
gp3251923        1
Name: 11, dtype: int64

In [114]:
grouped_by_u = u_by_uuid2.groupby([1])

In [140]:
u_by_uuid2.loc[grouped_by_u.groups['8effc629e4789156b85646698b27f943']]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
137818,0bb1ff68ba5202177cd4a82cb22e708d,8effc629e4789156b85646698b27f943,42.49.49.165,,,,102,0,10889,2016-03-13 09:23:40,[],gp7442967,1.0,1.0,1.0,1.0,1.0,0.0,1.0
111268,b975824026a6a55c463974d6128d47dc,8effc629e4789156b85646698b27f943,42.49.49.165,,,,102,0,10889,2016-03-14 06:39:42,[],gp7481248,1.0,1.0,1.0,1.0,1.0,0.0,1.0


In [59]:
def time_str2int(in_time):
    return int(time.mktime(time.strptime(in_time, '%Y-%m-%d %H:%M:%S'))) 

In [82]:
def isNone(string):
    return string == 'None'

'None'

In [32]:
u = pd.read_csv('userbase_event.txt')

In [16]:
grouped_by_u = u.groupby('1')

In [97]:
u_by_uuid2 = u

In [100]:
sum(u['11'].map(lambda x: x != 'None'))

100530

In [102]:
data_frame['11'].map(lambda x: x != 'None')

4616      True
103360    True
Name: 11, dtype: bool

In [145]:
features_uuid = {}
for each in grouped_by_u.groups:
    
    index_list = grouped_by_u.groups[each]
    data_frame = u_by_uuid2.loc[index_list]
    
    try:
        # uid
        temp = [len(index_list)]

        # reg_ip
        temp.append(data_frame['2'].nunique())

        # signature
        if sum(data_frame['3'].isnull()) > 0:
            temp.append(1)
        else:
            temp.append(0)

        # nickname
        #nn = data_frame['4'].map(lambda x: x == 'None')
        if sum(data_frame['4'].map(lambda x: x == 'None')) > 0:
            temp.append(1)
        else:
            temp.append(0)

        # sex majority
        temp.append(str(data_frame['5'].value_counts().index[0]))
        # sex unique count
        temp.append(data_frame['5'].nunique())

        # platform majority
        temp.append(str(data_frame['6'].value_counts().index[0]))
        # platform unique count
        temp.append(data_frame['6'].nunique())

        # ucid majority
        temp.append(str(data_frame['8'].value_counts().index[0]))
        # ucid unique count
        temp.append(data_frame['8'].nunique())

        # reg time max
        temp.append(max(data_frame['9'].map(time_str2int)))
        # reg time min
        temp.append(min(data_frame['9'].map(time_str2int)))
        # reg time span
        temp.append(max(data_frame['9'].map(time_str2int)) - min(data_frame['9'].map(time_str2int)))

        # group: 1, 0
        if sum(data_frame['10'].map(lambda x: x != '[]')) > 0:
            temp.append(1)
        else:
            temp.append(0)
        # group: number of groups
        temp.append(len(','.join(data_frame['10']).split(',')))

        # name
        #nnn = data_frame['11'].map(lambda x: x != 'None')
        if sum(data_frame['11'].map(lambda x: x != 'None')) > 0:
            temp.append(1)
        else:
            temp.append(0)

        if sum(data_frame['12']) > 0:
            temp.append(1)
        else: 
            temp.append(0)

        if sum(data_frame['13']) > 0:
            temp.append(1)
        else: 
            temp.append(0)

        if sum(data_frame['14']) > 0:
            temp.append(1)
        else: 
            temp.append(0)

        if sum(data_frame['15']) > 0:
            temp.append(1)
        else: 
            temp.append(0)

        if sum(data_frame['16']) > 0:
            temp.append(1)
        else: 
            temp.append(0)    

        if sum(data_frame['17']) > 0:
            temp.append(1)
        else: 
            temp.append(0)

        if sum(data_frame['18']) > 0:
            temp.append(1)
        else: 
            temp.append(0)

        features_uuid[each] = temp
    except:
        print data_frame
        continue

           -1                                 0  \
74456  240245  0252592529ded94f52ee57bc143b3b6d   

                                      1            2    3    4    5   6   7  \
74456  34d253f365546356dad11d3fba91ab21  58.46.41.39  NaN  NaN  NaN NaN NaN   

        8    9   10   11  12  13  14  15  16  17  18  
74456 NaN  NaN  NaN  NaN NaN NaN NaN NaN NaN NaN NaN  
            -1                                 0  \
103477  115131  0ce036e92175501804587603fe81140e   

                                       1               2          3    4    5  \
103477  4d40aa7f2f836307d29ad524f85beaaa  113.93.111.136  无敌是我，我是无敌  NaN  NaN   

         6   7   8    9   10   11  12  13  14  15  16  17  18  
103477 NaN NaN NaN  NaN  NaN  NaN NaN NaN NaN NaN NaN NaN NaN  
           -1                                 0  \
89285  187197  fe163c205f88c88bd83c9eeefb5cf5f7   

                                      1               2      3    4    5   6  \
89285  fb676ab5d8a34b663b8ff588e83b3cd4  153.99.1

In [117]:
u.drop(u.index[[74456]]).shape

(115339, 20)

In [118]:
u.shape

(115340, 20)

In [125]:
u = uuu

In [120]:
u = u.drop(u.index[[74456]])

In [160]:
len(features_uuid['8effc629e4789156b85646698b27f943'])

23

In [188]:
# simple features?
grouped_by_eventid = data_all.groupby([1])
eventids = grouped_by_eventid.groups.keys()

#features = {i : {} for i in range(n_period)}
features = []
num_handcraft_feature = 4
for i in range(n_period - 1):
    for uid in whole_dict[i]:
        row = np.zeros(len(eventids) + num_handcraft_feature)

        times = []
        for each in whole_dict[i][uid]:
            eid = each[0]
            ts = each[1]
            idx = eventids.index(eid)
            row[idx] += 1
            times.append(ts)

        # add handcraft features here
        row[len(eventids) + 0] = len(whole_dict[i][uid])
        row[len(eventids) + 1] = max(times) - min(times)
        row[len(eventids) + 2] = (time_start + period*(i+1)) - max(times)
        
        # label
        row[len(eventids) + 3] = labels[i][uid]
        
        row = row.tolist()
        if uid in features_uuid:
            row = features_uuid[uid] + row
        else:
            row = ([np.nan] * 23) + row
            
        features.append(row)

In [189]:
features_df = pd.DataFrame(features)

In [229]:
features_df[9].fillna(value = 1, inplace = True)

In [234]:
max(features_df[10].fillna(value = 1))

1459439926.0

In [248]:
import matplotlib.pyplot as plt

In [260]:
sum(features_df[10].fillna(value = 0)) / (len(features_df[10]) - sum(features_df[10].isnull()))

1455765418.9086359

In [261]:
1455765419 - 1455255684

509735

In [272]:
del features_df[16]

In [290]:
features_df[23].fillna(value = 0, inplace = True)

In [298]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

In [303]:
def foo(i):
    if x == 'None':
        return '-1'
    return str(x)

In [305]:
ohe = OneHotEncoder()
ohe.fit_transform(features_df[4].map(foo))

ValueError: invalid literal for long() with base 10: '[1, 2, 3]'

In [313]:
features_df.to_csv('temp.txt', header = None)

In [314]:
features_df = pd.read_csv('temp.txt', header = None)

In [320]:
features_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,210,211,212,213,214,215,216,217,218,219
0,1.0,1.0,1.0,0.0,1.0,102.0,1.0,100.0,1.0,1.455765e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2908874.0,1.0
1,1.0,1.0,1.0,0.0,1.0,102.0,1.0,100.0,1.0,1.455765e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,14.0,292.0,2945545.0,1.0
2,1.0,1.0,1.0,0.0,1.0,102.0,1.0,100.0,1.0,1.455765e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,47.0,1592721.0,1508309.0,1.0
3,1.0,1.0,1.0,0.0,1.0,102.0,1.0,100.0,1.0,1.455765e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,9284.0,1.0
4,1.0,1.0,1.0,0.0,1.0,102.0,1.0,100.0,1.0,1.455765e+09,...,0.0,0.0,0.0,1.0,2.0,0.0,43.0,2570.0,3024278.0,1.0
5,1.0,1.0,1.0,0.0,1.0,102.0,1.0,100.0,1.0,1.455765e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3010497.0,1.0
6,1.0,1.0,1.0,0.0,1.0,102.0,1.0,3597.0,1.0,1.453692e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,303069.0,1.0
7,1.0,1.0,1.0,0.0,1.0,102.0,1.0,9367.0,1.0,1.454650e+09,...,0.0,0.0,0.0,1.0,0.0,0.0,184.0,2589784.0,2341.0,0.0
8,1.0,1.0,1.0,0.0,1.0,102.0,1.0,9507.0,1.0,1.454682e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2943774.0,1.0
9,1.0,1.0,1.0,0.0,1.0,102.0,1.0,100.0,1.0,1.454137e+09,...,1.0,2.0,0.0,1.0,2.0,0.0,82.0,669674.0,2450314.0,1.0


In [326]:
data = features_df[r]

In [327]:
target = features_df[219]

In [328]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, target, test_size=0.4, random_state=0)

In [366]:
clf = GradientBoostingClassifier(learning_rate = 0.18, loss='exponential', n_estimators = 300, max_features=70).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [342]:
print "Below are the accuracy, AUC, F1 score, precision, and recall: "
print clf.score(X_test, y_test)
print roc_auc_score(y_test, y_pred)
print f1_score(y_test, y_pred)
print precision_score(y_test, y_pred)
print recall_score(y_test, y_pred)

Below are the accuracy, AUC, F1 score, precision, and recall: 
0.895209176788
0.763473274626
0.938277492945
0.911259505153
0.966946549253


In [388]:
print "Below are the accuracy, AUC, F1 score, precision, and recall: "
print clf.score(X_test, y_test)
print roc_auc_score(y_test, y_pred)
print f1_score(y_test, y_pred)
print precision_score(y_test, y_pred)
print recall_score(y_test, y_pred)

Below are the accuracy, AUC, F1 score, precision, and recall: 
0.894736842105
0.762058347141
0.938018991617
0.910735640165
0.96698750768


In [390]:
from sklearn.metrics import confusion_matrix

In [391]:
confusion_matrix(y_test, y_pred)

array([[ 2911,  2314],
       [  806, 23609]])

In [382]:
'1522b7e68452c6c8690d959d79821694' in u['0']

False

In [4]:
e = pd.read_csv("C:/Users/Administrator/timeline/expenditure_timeline.txt", sep='\t', header = None)

In [5]:
u = pd.read_csv('userbase_event.txt')

In [73]:
matched_uuids = u['1'].unique()

In [6]:
e[0].shape

(3473763L,)

In [9]:
e[e[0].isin(u['0'])].shape

(115340, 4)

In [10]:
exp = e[e[0].isin(u['0'])]

In [65]:
exp[0].nunique()

115340

In [32]:
def time_str2int(in_time):
    return int(time.mktime(time.strptime(in_time, '%Y-%m-%d %H:%M:%S'))) 

In [57]:
exp_dict = {}
for row in exp.iterrows():
    uid = row[1][0]
    
    reg_df = string2list(row[1][1])
    rec_df = string2list(row[1][2])
    pay_df = string2list(row[1][3])
    
    temp = np.zeros(21)

    # reg
    if reg_df[3].any() == '0':
        temp[0:6] = np.nan
    else:
        temp[0] = reg_df[1].nunique()
        temp[1] = reg_df[2].nunique()
        temp[2] = max(reg_df[3].map(time_str2int))
        temp[3] = min(reg_df[3].map(time_str2int))
        temp[4] = temp[2] - temp[3]
        temp[5] = reg_df[5].nunique()

    
    # pay
    if pay_df[2].any() == '0':
        temp[6:13] = np.nan
    else:
        temp[6] = pay_df[0].nunique()
        temp[7] = pay_df[1].nunique()
        temp[8] = max(pay_df[2].map(time_str2int))
        temp[9] = min(pay_df[2].map(time_str2int))
        temp[10] = temp[8] - temp[9]
        temp[11] = pay_df[4].nunique()
        temp[12] = pay_df[5].nunique()

    
    # rec
    if rec_df[2].any() == '0':
        temp[13:21] = np.nan
    else:
        temp[13] = rec_df[0].nunique()
        temp[14] = rec_df[1].nunique()
        temp[15] = max(rec_df[2].map(time_str2int))
        temp[16] = min(rec_df[2].map(time_str2int))
        temp[17] = temp[15] - temp[16]
        temp[18] = rec_df[3].nunique()
        temp[19] = rec_df[4].nunique()
        temp[20] = rec_df[6].nunique()

    
    exp_dict[uid] = temp
        

In [42]:
def string2list(in_str):
    ret = in_str.split('],[')
    temp = []
    for each in ret:
        temp.append(each.strip('[').strip(']').split(','))
    return pd.DataFrame(temp)

In [47]:
x = np.zeros(21)

In [48]:
x

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [50]:
x[1:3] = 1

In [52]:
x[3]

0.0

In [76]:
for each in uids:
    if each not in matched_uuids:
        print each 
        break

16199893d0f1ad519b0ec88752539aec


In [63]:
len(exp_dict)

115340

In [77]:
866530028607496

866530028607496L

In [None]:
exp_dict

In [2]:
import pickle
def save_obj(obj, name):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open( name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [87]:
uuid2uid = load_obj("C:/Users/Administrator/timeline/uuid2uid")

In [88]:
uuids = uuid2uid.keys()

In [90]:
uuids

2922878

In [103]:
uuids = set(uuids)

In [104]:
uuid_missing = map(lambda x: x in uuids, uids)

In [114]:
data_all[data_all[0] == '16199893d0f1ad519b0ec88752539aec']

Unnamed: 0,0,1,2
925887,16199893d0f1ad519b0ec88752539aec,1000,1454679563
926736,16199893d0f1ad519b0ec88752539aec,1000,1454679563
926737,16199893d0f1ad519b0ec88752539aec,1301,1454679568
926738,16199893d0f1ad519b0ec88752539aec,4302,1454679582
926739,16199893d0f1ad519b0ec88752539aec,4400,1454679590
926740,16199893d0f1ad519b0ec88752539aec,4302,1454679593
926741,16199893d0f1ad519b0ec88752539aec,4402,1454679711
926742,16199893d0f1ad519b0ec88752539aec,4402,1454679745
926743,16199893d0f1ad519b0ec88752539aec,4403,1454679803
926744,16199893d0f1ad519b0ec88752539aec,4408,1454679840


In [116]:
save_obj(exp_dict, 'expenditure_dict')

In [3]:
exp_dict = load_obj('expenditure_dict')

In [6]:
exp_dict

{'ec48b60747c66084051f1f405d2d3156': array([  1.00000000e+00,   1.00000000e+00,   1.45198540e+09,
          1.45198540e+09,   0.00000000e+00,   1.00000000e+00,
                     nan,              nan,              nan,
                     nan,              nan,              nan,
                     nan,              nan,              nan,
                     nan,              nan,              nan,
                     nan,              nan,              nan]),
 '75f59ced278a27fa5fe69adaddc44c5e': array([  1.00000000e+00,   1.00000000e+00,   1.45244016e+09,
          1.45244016e+09,   0.00000000e+00,   1.00000000e+00,
          1.00000000e+00,   1.00000000e+00,   1.45346781e+09,
          1.45346781e+09,   0.00000000e+00,   1.00000000e+00,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
          1.45346781e+09,   1.45346781e+09,   0.00000000e+00,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00]),
 'c4fdce435e0fcbeaec3fe9f6423ae7d5': array([  1.00000000