# import functions

In [1]:
import pandas as pd
import time
import math
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

import pickle

"""
Some global parameters to be tuned here.
"""
date_range = ("2016-01-01T00:00:00", "2016-03-31T23:59:59")

time_start = int(time.mktime(time.strptime(date_range[0], '%Y-%m-%dT%H:%M:%S')))
time_end = int(time.mktime(time.strptime(date_range[1], '%Y-%m-%dT%H:%M:%S')))

# TODO: def foo()
# 7 days as a period
period = 6047800.0
n_period = int(math.ceil((time_end - time_start)/(period)))

In [2]:
def save_obj(obj, name):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open( name + '.pkl', 'rb') as f:
        return pickle.load(f)

def time_str2int(in_time):
    return int(time.mktime(time.strptime(in_time, '%Y-%m-%d %H:%M:%S'))) 

def feature_expenditure(filename, uuid_list):
    """
    Input: filename-- expenditure_timeline
           uuid_list-- uuids to be extracted from expenditure_timeline

    Output: a dictionary of (uuid, feature) pairs

    TODO: check possible repetitions of features
    """
    e = pd.read_csv(filename, sep='\t', header = None)
    exp = e[e[0].isin(set(uuid_list))]

    exp_dict = {}
    for row in exp.iterrows():
        uid = row[1][0]
        
        reg_df = string2list(row[1][1])
        rec_df = string2list(row[1][2])
        pay_df = string2list(row[1][3])
        
        temp = np.zeros(21)

        # reg
        if reg_df[3].any() == '0':
            temp[0:6] = np.nan
        else:
            temp[0] = reg_df[1].nunique()
            temp[1] = reg_df[2].nunique()
            temp[2] = max(reg_df[3].map(time_str2int))
            temp[3] = min(reg_df[3].map(time_str2int))
            temp[4] = temp[2] - temp[3]
            temp[5] = reg_df[5].nunique()

        
        # pay
        if pay_df[2].any() == '0':
            temp[6:13] = np.nan
        else:
            temp[6] = pay_df[0].nunique()
            temp[7] = pay_df[1].nunique()
            temp[8] = max(pay_df[2].map(time_str2int))
            temp[9] = min(pay_df[2].map(time_str2int))
            temp[10] = temp[8] - temp[9]
            temp[11] = pay_df[4].nunique()
            temp[12] = pay_df[5].nunique()

        
        # rec
        if rec_df[2].any() == '0':
            temp[13:21] = np.nan
        else:
            temp[13] = rec_df[0].nunique()
            temp[14] = rec_df[1].nunique()
            temp[15] = max(rec_df[2].map(time_str2int))
            temp[16] = min(rec_df[2].map(time_str2int))
            temp[17] = temp[15] - temp[16]
            temp[18] = rec_df[3].nunique()
            temp[19] = rec_df[4].nunique()
            temp[20] = rec_df[6].nunique()

        exp_dict[uid] = temp
    return exp_dict

In [3]:
def feature_userbase(userbase_dataframe, uuid_list):
    """
    Input: userbase_dataframe-- stacked userbases by pandas.read_csv()
           uuid_list-- -- uuids to be extracted from userbase

    Output: a dictionary of (uuid, feature) pairs
    """
    u_by_uuid = userbase_dataframe[userbase_dataframe[1].isin(set(uuid_list))]

    # remove empty uuids: 'cfcd208495d565ef66e7dff9f98764da' 
    u_by_uuid = u_by_uuid[u_by_uuid[1] != 'cfcd208495d565ef66e7dff9f98764da']

    grouped_by_u = u_by_uuid.groupby([1])
    
    userbase_dict = {}
    for each in grouped_by_u.groups:
        
        index_list = grouped_by_u.groups[each]
        data_frame = u_by_uuid.loc[index_list]
        
        try:
            # uid
            temp = [len(index_list)]

            # reg_ip
            temp.append(data_frame[2].nunique())

            # signature
            if sum(data_frame[3].isnull()) > 0:
                temp.append(1)
            else:
                temp.append(0)

            # nickname
            #nn = data_frame['4'].map(lambda x: x == 'None')
            if sum(data_frame[4].map(lambda x: x == 'None')) > 0:
                temp.append(1)
            else:
                temp.append(0)

            # sex majority
            temp.append(str(data_frame[5].value_counts().index[0]))
            # sex unique count
            temp.append(data_frame[5].nunique())

            # platform majority
            temp.append(str(data_frame[6].value_counts().index[0]))
            # platform unique count
            temp.append(data_frame[6].nunique())

            # ucid majority
            temp.append(str(data_frame[8].value_counts().index[0]))
            # ucid unique count
            temp.append(data_frame[8].nunique())

            # reg time max
            temp.append(max(data_frame[9].map(time_str2int)))
            # reg time min
            temp.append(min(data_frame[9].map(time_str2int)))
            # reg time span
            temp.append(max(data_frame[9].map(time_str2int)) - min(data_frame[9].map(time_str2int)))

            # group: 1, 0
            if sum(data_frame[10].map(lambda x: x != '[]')) > 0:
                temp.append(1)
            else:
                temp.append(0)
            # group: number of groups
            temp.append(len(','.join(data_frame[10]).split(',')))

            # name
            if sum(data_frame[11].map(lambda x: x != 'None')) > 0:
                temp.append(1)
            else:
                temp.append(0)

            if sum(data_frame[12]) > 0:
                temp.append(1)
            else: 
                temp.append(0)

            if sum(data_frame[13]) > 0:
                temp.append(1)
            else: 
                temp.append(0)

            if sum(data_frame[14]) > 0:
                temp.append(1)
            else: 
                temp.append(0)

            if sum(data_frame[15]) > 0:
                temp.append(1)
            else: 
                temp.append(0)

            if sum(data_frame[16]) > 0:
                temp.append(1)
            else: 
                temp.append(0)    

            if sum(data_frame[17]) > 0:
                temp.append(1)
            else: 
                temp.append(0)

            if sum(data_frame[18]) > 0:
                temp.append(1)
            else: 
                temp.append(0)

            userbase_dict[each] = temp

        except:
            print data_frame
            del data_frame
            continue

    return userbase_dict

In [4]:
def feature_event(filename):

    # Get logs within the time range
    t = pd.read_csv(filename, sep='\t', header = None)
    t1 = t[t[2] >= time_start]
    data_all = t1[t1[2] <= time_end]

    # Get all unique event ids
    unique_event_ids = data_all[1].unique()

    grouped_by_uuid = data_all.groupby([0])
    uuids = grouped_by_uuid.groups.keys()

    # binning whole logs in periods as a big dict()
    whole_dict = {i : {} for i in range(n_period)}
    # for each log
    for log in data_all.iterrows():
        # find its binned period
        idx = int(math.floor((log[1][2] - time_start)/period))

        if log[1][0] not in whole_dict[idx]:
            whole_dict[idx][log[1][0]] = []

        # append event id and timestamp to the list
        whole_dict[idx][log[1][0]].append((log[1][1], log[1][2]))

    return whole_dict

def get_labels(whole_dict):

    # creating labels for churn = 1, stay = 0
    labels = {i : {} for i in range(n_period)}
    #for each period
    for i in range(n_period - 1):
        # for each uuid
        for uuid in whole_dict[i]:
            # check if the uuid appears in the next period, if yes-> stay; no-> churn
            if uuid in whole_dict[i + 1]:
                labels[i][uuid] = 0
            else:
                labels[i][uuid] = 1

    return labels

In [5]:
def get_uuids(filename):
    # Get logs within the time range
    t = pd.read_csv(filename, sep='\t', header = None)
    t1 = t[t[2] >= time_start]
    data_all = t1[t1[2] <= time_end]

    uuids = data_all[0].unique()

    return uuids

# get event dict

In [10]:
whole_dict = feature_event("timeline/timeline_event_gpapp.txt")

In [8]:
uuid_list = get_uuids("timeline/timeline_event_gpapp.txt")

In [9]:
len(uuid_list)

110334

In [22]:
save_obj(whole_dict, "whole_dict")

# get userbase dict

In [6]:
u0 = pd.read_csv("timeline/user_base_0.txt", sep='\t', header = None)
u1 = pd.read_csv("timeline/user_base_1.txt", sep='\t', header = None)
u2 = pd.read_csv("timeline/user_base_2.txt", sep='\t', header = None)
u3 = pd.read_csv("timeline/user_base_3.txt", sep='\t', header = None)
u4 = pd.read_csv("timeline/user_base_4.txt", sep='\t', header = None)
u5 = pd.read_csv("timeline/user_base_5.txt", sep='\t', header = None)
u6 = pd.read_csv("timeline/user_base_6.txt", sep='\t', header = None)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
u = u0.append(u1).append(u2).append(u3).append(u4).append(u5).append(u6)

In [20]:
userbase_dict = feature_userbase(u, uuid_list)

                                      0                                 1   \
240245  0252592529ded94f52ee57bc143b3b6d  34d253f365546356dad11d3fba91ab21   

                 2    3    4    5    6    7    8    9    10   11  12  13  14  \
240245  58.46.41.39  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN NaN NaN NaN   

        15  16  17  18  
240245 NaN NaN NaN NaN  
                                      0                                 1   \
115131  1b86c7940a229b6f7defd5b0058b4a21  6042e177e50a64a42eec091cddd617d9   
115131  0ce036e92175501804587603fe81140e  4d40aa7f2f836307d29ad524f85beaaa   

                    2          3     4     5    6    7    8   \
115131    36.45.13.103        NaN  None  None  102    0  144   
115131  113.93.111.136  无敌是我，我是无敌   NaN   NaN  NaN  NaN  NaN   

                         9    10         11   12   13   14   15   16   17   18  
115131  2016-03-27 21:30:22   []  gp7895353  1.0  1.0  1.0  1.0  1.0  0.0  1.0  
115131                  NaN  NaN        Na

In [21]:
save_obj(userbase_dict, "userbase_dict")

In [98]:
uid_list = uid_list[0].unique()

In [107]:
len(uid_list)

115340

# get expenditure dict

In [21]:
def get_uids(userbase_dataframe, uuid_list):
    # TODO
    u_by_uuid = userbase_dataframe[userbase_dataframe[1].isin(set(uuid_list))]
    u_by_uuid = u_by_uuid[u_by_uuid[1]!= 'cfcd208495d565ef66e7dff9f98764da']
    return u_by_uuid[0].unique()

In [65]:
uid_list = get_uids(u, uuid_list)

In [78]:
def string2list(in_str):
    ret = in_str.split('],[')
    temp = []
    for each in ret:
        temp.append(each.strip('[').strip(']').split(','))
    return pd.DataFrame(temp)

In [108]:
exp_dict = feature_expenditure("C:/Users/Administrator/timeline/expenditure_timeline.txt", uid_list)

In [109]:
save_obj(exp_dict, "expenditure_dict")

In [66]:
len(uid_list)

115340

# start with whole_dict, exp_dict, userbase_dict

In [3]:
whole_dict = load_obj('whole_dict')

In [4]:
exp_dict = load_obj('expenditure_dict')

In [5]:
userbase_dict = load_obj('userbase_dict')

In [11]:
labels = get_labels(whole_dict)

In [12]:
def get_eventids(filename):
    # Get logs within the time range
    t = pd.read_csv(filename, sep='\t', header = None)
    t1 = t[t[2] >= time_start]
    data_all = t1[t1[2] <= time_end]

    grouped_by_eventid = data_all.groupby([1])
    eventids = grouped_by_eventid.groups.keys()
    return eventids

In [13]:
eventids = get_eventids('timeline/timeline_event_gpapp.txt')

# userbase feature v2

In [8]:
import sys

In [13]:
def cleaning_filter(input_element, filter_list, replacement):
    if input_element not in filter_list:
        return replacement
    return input_element

def ucid_cleaner(input_element):
    try:
        ret = eval(input_element)
    except:
        ret = []
    if type(ret) is list:
        return ret
    elif type(ret) is dict:
        return map(lambda x: int(x), ret.values())
    elif type(ret) is int:
        return [ret]
    else:
        print "Error in ucid_cleaner"

def feature_userbase(userbase_dataframe, uuid_list):
    """
    Input: userbase_dataframe-- stacked userbases by pandas.read_csv()
           uuid_list-- -- uuids to be extracted from userbase

    Output: a dictionary of (uuid, feature) pairs
    """

    u_by_uuid = userbase_dataframe[userbase_dataframe[1].isin(set(uuid_list))]
    # remove empty uuids: 'cfcd208495d565ef66e7dff9f98764da' 
    u_by_uuid = u_by_uuid[u_by_uuid[1] != 'cfcd208495d565ef66e7dff9f98764da']
    # remove NaNs
    u_by_uuid.drop(u_by_uuid[u_by_uuid[9].isnull()].index, inplace=True)

    """Some data cleaning"""
    # clean sex
    value_counts = u_by_uuid[5].value_counts()
    u_by_uuid[5] = u_by_uuid[5].map(lambda x: cleaning_filter(x, value_counts.index[:4], value_counts.index[0]))

    # clean platform
    u_by_uuid[6] = u_by_uuid[6].map(lambda x: str(x))
    value_counts = u_by_uuid[6].value_counts()
    u_by_uuid[6] = u_by_uuid[6].map(lambda x: cleaning_filter(x, value_counts.index[:3], '0'))

    # clean status
    u_by_uuid[7] = u_by_uuid[7].map(lambda x: str(x))
    u_by_uuid[7] = u_by_uuid[7].map(lambda x: cleaning_filter(x, ['0'], '1'))

    # clean ucid
    u_by_uuid[8] = u_by_uuid[8].map(lambda x: str(x))

    """some processing about ucid"""
    # clean ucid
    u_by_uuid[10] = u_by_uuid[10].map(ucid_cleaner)
    ucids = []
    for each in u_by_uuid[10]:
        ucids += each
    dictinct_ucids = list(set(ucids))

    # Begin feature engineering
    grouped_by_u = u_by_uuid.groupby([1])
    
    userbase_dict = {}
    for each in grouped_by_u.groups:
        
        index_list = grouped_by_u.groups[each]
        data_frame = u_by_uuid.loc[index_list]
        
#         try:
        #[0] uid
        temp = [len(index_list)]

        #[1] reg_ip
        temp.append(data_frame[2].nunique())

        #[2] has signature or not
        if sum(data_frame[3].isnull()) > 0:
            temp.append(1)
        else:
            temp.append(0)

        #[3] has nickname or not
        #nn = data_frame['4'].map(lambda x: x == 'None')
        if sum(data_frame[4].map(lambda x: x == 'None')) > 0:
            temp.append(1)
        else:
            temp.append(0)

        #[4] sex majority -- One-Hot needed
        temp.append(str(data_frame[5].value_counts().index[0]))
        #[5] sex unique count
        temp.append(data_frame[5].nunique())

        #[6] platform majority -- One-Hot needed
        temp.append(str(data_frame[6].value_counts().index[0]))
        #[7] platform unique count
        temp.append(data_frame[6].nunique())

        #[8] ucid majority
        temp.append(str(data_frame[8].value_counts().index[0]))
        #[9] ucid unique count
        temp.append(data_frame[8].nunique())

        #[10] reg time max
        temp.append(max(data_frame[9].map(time_str2int)))
        #[11] reg time min
        temp.append(min(data_frame[9].map(time_str2int)))
        #[12] reg time span
        temp.append(max(data_frame[9].map(time_str2int)) - min(data_frame[9].map(time_str2int)))
        #[13] reg time mean
        temp.append(data_frame[9].map(time_str2int).quantile(0.5))
        #[14] reg time std
        temp.append(np.std(data_frame[9].map(time_str2int)))


        #[15] group: number of groups
        temp.append( sum(data_frame[10].map(lambda x: len(x))) )
        #[16] group: dummy
        temp.append(int(temp[len(temp)-1] > 0))

        """TODO: a huge feature line of ucid down in the bottom"""

        #[18] name
        if sum(data_frame[11].map(lambda x: x != 'None')) > 0:
            temp.append(1)
        else:
            temp.append(0)

        #[19-25]
        if sum(data_frame[12]) > 0:
            temp.append(1)
        else: 
            temp.append(0)

        if sum(data_frame[13]) > 0:
            temp.append(1)
        else: 
            temp.append(0)

        if sum(data_frame[14]) > 0:
            temp.append(1)
        else: 
            temp.append(0)

        if sum(data_frame[15]) > 0:
            temp.append(1)
        else: 
            temp.append(0)

        if sum(data_frame[16]) > 0:
            temp.append(1)
        else: 
            temp.append(0)    

        if sum(data_frame[17]) > 0:
            temp.append(1)
        else: 
            temp.append(0)

        if sum(data_frame[18]) > 0:
            temp.append(1)
        else: 
            temp.append(0)


        """the huge line promised above"""
        df_ucids = []
        for ucids_list in data_frame[10]:
            df_ucids += ucids_list
        ucid_feature = [0] * len(dictinct_ucids)
        for i, e in enumerate(dictinct_ucids):
            if e in df_ucids:
                ucid_feature[i] += 1

        temp += ucid_feature

        userbase_dict[each] = temp

#         except:
#             e = sys.exc_info()[0]
#             print data_frame
#             print e
#             break
        #    del data_frame
        #    continue

    return userbase_dict

In [14]:
ub_feature = feature_userbase(u, uuid_list)

In [15]:
save_obj(ub_feature, "userbase_dict_2")

In [19]:
len(ub_feature)

0

In [25]:
print each

8effc629e4789156b85646698b27f943


In [27]:
ucid_feature

NameError: name 'ucid_feature' is not defined

In [31]:
hi = {}
hi[each] = [1, 2, 3]
hi

{'8effc629e4789156b85646698b27f943': [1, 2, 3]}

# debug

In [11]:
    u_by_uuid = u[u[1].isin(set(uuid_list))]
    # remove empty uuids: 'cfcd208495d565ef66e7dff9f98764da' 
    u_by_uuid = u_by_uuid[u_by_uuid[1] != 'cfcd208495d565ef66e7dff9f98764da']
    # remove NaNs
    u_by_uuid.drop(u_by_uuid[u_by_uuid[9].isnull()].index, inplace=True)

    """Some data cleaning"""
    # clean sex
    value_counts = u_by_uuid[5].value_counts()
    u_by_uuid[5] = u_by_uuid[5].map(lambda x: cleaning_filter(x, value_counts.index[:4], value_counts.index[0]))

    # clean platform
    u_by_uuid[6] = u_by_uuid[6].map(lambda x: str(x))
    value_counts = u_by_uuid[6].value_counts()
    u_by_uuid[6] = u_by_uuid[6].map(lambda x: cleaning_filter(x, value_counts.index[:3], '0'))

    # clean status
    u_by_uuid[7] = u_by_uuid[7].map(lambda x: str(x))
    u_by_uuid[7] = u_by_uuid[7].map(lambda x: cleaning_filter(x, ['0'], '1'))

    # clean ucid
    u_by_uuid[8] = u_by_uuid[8].map(lambda x: str(x))

    """some processing about ucid"""
    # clean ucid
    u_by_uuid[10] = u_by_uuid[10].map(ucid_cleaner)
    ucids = []
    for each in u_by_uuid[10]:
        ucids += each
    dictinct_ucids = list(set(ucids))

In [13]:
dictinct_ucids[1]

36

In [14]:
    grouped_by_u = u_by_uuid.groupby([1])
    
    for each in grouped_by_u.groups:
        
        index_list = grouped_by_u.groups[each]
        data_frame = u_by_uuid.loc[index_list]
        break

In [16]:
data_frame[10]

137818    []
111268    []
Name: 10, dtype: object

In [None]:
ucid_feature = [0] * len(dictinct_ucids)
for i, e in enumerate(dictinct_ucids):
    if e in data_frame[10]:
        ucid_feature[i] += 1

temp += ucid_feature

userbase_dict[each] = temp

# expenditure v2

In [11]:
uuid_list = get_uuids("timeline/timeline_event_gpapp.txt")

In [9]:
def get_uuid2uid(userbase_dataframe, uuid_list):
    # TODO
    u_by_uuid = userbase_dataframe[userbase_dataframe[1].isin(set(uuid_list))]
    u_by_uuid = u_by_uuid[u_by_uuid[1]!= 'cfcd208495d565ef66e7dff9f98764da']
    uuid2uid = {}
    for each in u_by_uuid.iterrows():
        if each[1][1] not in uuid2uid:
            uuid2uid[each[1][1]] = []
        uuid2uid[each[1][1]].append(each[1][0])
    return uuid2uid

In [12]:
uuid2uid = get_uuid2uid(u, uuid_list)

In [15]:
u_by_uuid = u[u[1].isin(set(uuid_list))]
u_by_uuid = u_by_uuid[u_by_uuid[1]!= 'cfcd208495d565ef66e7dff9f98764da']
uid2uuid = {}
for each in u_by_uuid.iterrows():
    if each[1][0] not in uid2uuid:
        uid2uuid[each[1][0]] = []
    uid2uuid[each[1][0]].append(each[1][1])

In [16]:
len(uuid2uid)

45122

In [117]:
len(uid2uuid)

115340

In [None]:
def feature_expenditure(filename, uid_list, uuid2uid):
    """
    Input: filename-- expenditure_timeline
           uid_list-- uids to be extracted from expenditure_timeline

    Output: a dictionary of (uuid, feature) pairs

    TODO: check possible repetitions of features
    """
    e = pd.read_csv(filename, sep='\t', header = None)
    exp = e[e[0].isin(set(uid_list))]

    exp_dict = {}
    for row in exp.iterrows():
        uid = row[1][0]
        
        reg_df = string2list(row[1][1])
        rec_df = string2list(row[1][2])
        pay_df = string2list(row[1][3])
        
        temp = np.zeros(21)

        # reg
        if reg_df[3].any() == '0':
            temp[0:6] = np.nan
        else:
            temp[0] = reg_df[1].nunique()
            temp[1] = reg_df[2].nunique()
            temp[2] = max(reg_df[3].map(time_str2int))
            temp[3] = min(reg_df[3].map(time_str2int))
            temp[4] = temp[2] - temp[3]
            temp[5] = reg_df[5].nunique()

        
        # pay
        if pay_df[2].any() == '0':
            temp[6:13] = np.nan
        else:
            temp[6] = pay_df[0].nunique()
            temp[7] = pay_df[1].nunique()
            temp[8] = max(pay_df[2].map(time_str2int))
            temp[9] = min(pay_df[2].map(time_str2int))
            temp[10] = temp[8] - temp[9]
            temp[11] = pay_df[4].nunique()
            temp[12] = pay_df[5].nunique()

        
        # rec
        if rec_df[2].any() == '0':
            temp[13:21] = np.nan
        else:
            temp[13] = rec_df[0].nunique()
            temp[14] = rec_df[1].nunique()
            temp[15] = max(rec_df[2].map(time_str2int))
            temp[16] = min(rec_df[2].map(time_str2int))
            temp[17] = temp[15] - temp[16]
            temp[18] = rec_df[3].nunique()
            temp[19] = rec_df[4].nunique()
            temp[20] = rec_df[6].nunique()

        exp_dict[uid] = temp

    return exp_dict

In [25]:
len(uid2uuid)

115340

In [26]:
e = pd.read_csv('timeline/expenditure_timeline.txt', sep='\t', header = None)
exp = e[e[0].isin(set(uid2uuid.keys()))]

In [27]:
exp[-1] = exp[0].map(lambda x: uid2uuid[x][0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [57]:
print exp[2][10]

[航海王强者之路,512,2016-03-16 23:00:44,100,金币,101147,0],[航海王强者之路,512,2016-03-31 15:37:52,144,金币,101147,0],[航海王强者之路,512,2016-04-01 08:12:13,144,金币,101147,0]


In [30]:
exp_by_uuid = exp.groupby([-1])

In [32]:
len(exp_by_uuid)

45122

In [39]:
string2list(exp[1][10])

Unnamed: 0,0,1,2,3,4,5
0,航海王强者之路,101147,100,2016-02-26 14:44:13,ZS4ADN4UDRE8XVE5,100
1,部落冲突-2,101728,100,2016-02-26 10:25:13,1803607402,100


In [58]:
len(exp)

115340

In [70]:
ret = {}
for each in exp_by_uuid.groups:
    index_list = exp_by_uuid.groups[each]
    data_frame = exp.loc[index_list]
    
    reg = []
    rec = []
    pay = []
    for row in data_frame.iterrows():
        reg += string2list(row[1][1])
        rec += string2list(row[1][2])
        pay += string2list(row[1][3])
    reg = pd.DataFrame(reg)
    rec = pd.DataFrame(rec)
    pay = pd.DataFrame(pay)
    
    
    """reg"""
    reg = reg[reg[3]!='0']
    try:
        #[0] #uid
        temp = [len(index_list)]
        #[1] #games
        temp.append(len(reg[0]))
        #[2] # unique games
        temp.append(len(reg[1].unique()))
        #[3] # unique cid
        temp.append(len(reg[2].unique()))
        #[4] # unique game_uid
        temp.append(len(reg[4].unique()))
        #[5] # unique ucid
        temp.append(len(reg[5].unique()))

        reg_time = reg[3].map(time_str2int)
        #[6] max reg time
        temp.append(max(reg_time))
        #[7] min reg time
        temp.append(min(reg_time))
        #[8] reg time span
        temp.append(max(reg_time) - min(reg_time))
        #[9] reg time mean
        temp.append(reg_time.quantile(0.5))
        #[10] reg time std
        temp.append(np.std(reg_time))
    except:
        continue
    
    """rec"""
    rec = rec[rec[2]!='0']
    if len(rec) == 0:
        temp += [np.nan] * 12
    else:
        #[11] # rec
        temp.append(len(rec[0]))
        #[12] # unique games
        temp.append(len(rec[0].unique()))
        #[13] # unique types
        temp.append(len(rec[1].unique()))
        #[14] # unique cid
        temp.append(len(rec[3].unique()))
        #[15] # unique item
        temp.append(len(rec[4].unique()))    
        #[16] # unique appid
        temp.append(len(rec[5].unique()))  
        #[17] # unique pay_from
        temp.append(len(rec[6].unique()))  

        rec_time = rec[2].map(time_str2int)
        #[18] max reg time
        temp.append(max(rec_time))
        #[19] min reg time
        temp.append(min(rec_time))
        #[20] reg time span
        temp.append(max(rec_time) - min(rec_time))
        #[21] reg time mean
        temp.append(rec_time.quantile(0.5))
        #[22] reg time std
        temp.append(np.std(rec_time))
    
    pay = pay[pay[2]!='0']
    if len(pay) == 0:
        temp += [np.nan] * 11
    else:
        #[23] # payments
        temp.append(len(pay[0]))
        #[24] # unique games
        temp.append(len(pay[0].unique()))
        #[25] # unique cid
        temp.append(len(pay[1].unique()))
        #[26] # unique appid
        temp.append(len(pay[3].unique()))
        #[27] # unique item
        temp.append(len(pay[4].unique()))    
        #[28] # unique porder
        temp.append(len(pay[5].unique()))  


        pay_time = pay[2].map(time_str2int)
        #[29] max reg time
        temp.append(max(pay_time))
        #[30] min reg time
        temp.append(min(pay_time))
        #[31] reg time span
        temp.append(max(pay_time) - min(pay_time))
        #[32] reg time mean
        temp.append(pay_time.quantile(0.5))
        #[33] reg time std
        temp.append(np.std(pay_time))
    
    ret[each] = temp

        


In [49]:
def string2list(in_str):
    ret = in_str.split('],[')
    temp = []
    for each in ret:
        temp.append(each.strip('[').strip(']').split(','))
    return temp