# Imports

In [None]:
import ast
import gc
import pickle
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from nltk.probability import ConditionalProbDist, MLEProbDist
from nltk.util import ngrams, everygrams
from tqdm import tqdm

# Fetch the NLTK 'punkt' tokenizer data (downloads on first run).
nltk.download('punkt')
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas chained-assignment and deprecation warnings.
warnings.filterwarnings('ignore')
# Show all columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load a pickled `dtypes` object (not referenced again in the visible cells).
# pickle.load can execute arbitrary code -- only use with trusted files.
with open("../Data/dtypes.pkl", 'rb') as f:
    dtypes = pickle.load(f)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def viewinfo(df:pd.DataFrame) -> None:
    """Print a quick overview of ``df``.

    Writes the shape, a blank line, the ``DataFrame.info()`` summary and
    another blank line to stdout, then renders the first rows with
    ``display``.
    """
    # Shape followed by an empty separator line.
    print(df.shape, end="\n\n")
    df.info()
    print()
    preview = df.head()
    display(preview)

# Pivot Table Generation

In [None]:
# Read the raw app event log, parsing `timestamp` and `date_cd` as datetimes,
# then print its shape / info / first rows.
log = pd.read_csv("../Data/2022빅콘테스트_데이터분석리그_데이터분석분야_퓨처스부문_데이터셋_220908/log_data.csv", engine='c', low_memory=True, parse_dates=['timestamp', 'date_cd'])
viewinfo(log)

(17843993, 6)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17843993 entries, 0 to 17843992
Data columns (total 6 columns):
 #   Column          Dtype         
---  ------          -----         
 0   user_id         int64         
 1   event           object        
 2   timestamp       datetime64[ns]
 3   mp_os           object        
 4   mp_app_version  object        
 5   date_cd         datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(3)
memory usage: 816.8+ MB



Unnamed: 0,user_id,event,timestamp,mp_os,mp_app_version,date_cd
0,576409,StartLoanApply,2022-03-25 11:12:09,Android,3.8.2,2022-03-25
1,576409,ViewLoanApplyIntro,2022-03-25 11:12:09,Android,3.8.2,2022-03-25
2,72878,EndLoanApply,2022-03-25 11:14:44,Android,3.8.4,2022-03-25
3,645317,OpenApp,2022-03-25 11:15:09,iOS,3.6.1,2022-03-25
4,645317,UseLoanManage,2022-03-25 11:15:11,iOS,3.6.1,2022-03-25


* Main events: ViewLoanApplyIntro & StartLoanApply / EndLoanApply / UseLoanManage / UsePrepayCalc / UseDSRCalc / GetCreditInfo

In [None]:
# Order the log chronologically within each user (sorts `log` in place),
# so that per-user event sequences come out in time order.
log.sort_values(by=['user_id', 'timestamp'], inplace=True)
log.head(10)

Unnamed: 0,user_id,event,timestamp,mp_os,mp_app_version,date_cd
11709372,1,GetCreditInfo,2022-05-03 14:52:28,android,464,2022-05-03
11709374,1,GetCreditInfo,2022-05-03 14:52:35,android,464,2022-05-03
2451691,1,UseLoanManage,2022-06-16 23:58:41,Android,3.12.1,2022-06-16
2451693,1,Login,2022-06-16 23:58:41,Android,3.12.1,2022-06-16
7071607,1,GetCreditInfo,2022-06-16 23:58:42,android,464,2022-06-16
10428909,7,GetCreditInfo,2022-05-22 16:39:49,android,465,2022-05-22
9627339,9,GetCreditInfo,2022-05-21 23:37:58,android,465,2022-05-21
9627368,9,GetCreditInfo,2022-05-21 23:43:33,android,465,2022-05-21
9627370,9,GetCreditInfo,2022-05-21 23:43:52,android,465,2022-05-21
9505105,11,OpenApp,2022-03-24 10:53:59,iOS,3.6.1,2022-03-24


In [None]:
# Event names treated as "main" user actions. They are matched verbatim against
# the `event` column, so spelling must be exact ('UseLoanManage', not 'UserLoanManage').
# NOTE(review): the markdown list above also names EndLoanApply, which is absent
# here -- confirm whether it should be included.
main_events = ['ViewLoanApplyIntro', 'StartLoanApply', 'UseLoanManage', 'UsePrepayCalc', 'UseDSRCalc', 'GetCreditInfo']

In [None]:
def user_separate(df:pd.DataFrame, key:str) -> list:
    """Split ``df`` into one sub-frame per distinct value of column ``key``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to split.
    key : str
        Name of the column whose distinct values define the groups.

    Returns
    -------
    list
        Sub-frames in order of first appearance of each key value; each keeps
        the original row order and index labels. Rows whose key is NaN are not
        returned (``groupby`` drops them by default).
    """
    # A single groupby pass replaces one full boolean-mask scan per key value,
    # which was quadratic in practice (rows x distinct keys).
    groups = df.groupby(key, sort=False)
    return [subdf for _, subdf in tqdm(groups, total=groups.ngroups)]

In [None]:
def get_event(array:list, events=None) -> pd.DataFrame:
    """Build one event sequence per user from per-user log frames.

    For every distinct ``date_cd`` of a user (in order of first appearance),
    the distinct main events seen that day are appended in order of first
    occurrence; a day without any main event contributes a single ``'ETC'``.

    Parameters
    ----------
    array : list
        Per-user DataFrames with ``user_id``, ``event`` and ``date_cd`` columns.
        Each frame must contain exactly one distinct ``user_id``.
    events : list-like, optional
        Event names considered "main". Defaults to the module-level
        ``main_events``.

    Returns
    -------
    pd.DataFrame
        Columns ``user_id`` (cast to ``np.uint32``) and ``events`` (list of str),
        one row per input frame.

    Raises
    ------
    ValueError
        If a frame holds more than one distinct ``user_id``.
    """
    wanted = main_events if events is None else events

    # Collect rows in a plain list and build the frame once: DataFrame.append
    # in a loop copies the whole frame every iteration and no longer exists in
    # pandas >= 2.0.
    records = []
    for item in tqdm(array):
        user_id = item['user_id'].unique().item()
        # Evaluate the main-event mask once per user instead of twice per day.
        is_main = item['event'].isin(wanted)
        main_ = []
        for date_cd in item['date_cd'].unique():
            hits = item.loc[is_main & (item['date_cd'] == date_cd), 'event']
            if hits.empty:
                main_.append('ETC')
            else:
                main_ += list(hits.unique())

        records.append({'user_id': np.uint32(user_id), 'events': main_})

    return pd.DataFrame(records, columns=['user_id', 'events'])

In [None]:
# One sub-frame per user_id; `log` was sorted above, so each keeps its rows in time order.
user_arrays = user_separate(log, key='user_id')

100%|██████████| 584636/584636 [2:23:41<00:00, 67.81it/s]


In [None]:
# Cut the per-user frames into ten consecutive chunks: nine of equal size
# `threshold`, plus a final chunk that also takes the remainder.
threshold = len(user_arrays) // 10
(listA, listB, listC, listD, listE,
 listF, listG, listH, listI) = (
    user_arrays[k * threshold:(k + 1) * threshold] for k in range(9)
)
listJ = user_arrays[9 * threshold:]

In [None]:
# Extract the event sequences for the first chunk and inspect the result.
resultA = get_event(listA)
viewinfo(resultA)

100%|██████████| 58463/58463 [12:48<00:00, 76.10it/s]

(58463, 2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58463 entries, 0 to 58462
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  58463 non-null  object
 1   events   58463 non-null  object
dtypes: object(2)
memory usage: 913.6+ KB






Unnamed: 0,user_id,events
0,1,"[GetCreditInfo, GetCreditInfo]"
1,7,[GetCreditInfo]
2,9,[GetCreditInfo]
3,11,"[GetCreditInfo, UsePrepayCalc, StartLoanApply,..."
4,12,"[StartLoanApply, ViewLoanApplyIntro, StartLoan..."


In [None]:
# Process the remaining nine chunks one statement at a time, so results already
# computed stay bound if a later chunk fails.
resultB = get_event(listB)
resultC = get_event(listC)
resultD = get_event(listD)
resultE = get_event(listE)
resultF = get_event(listF)
resultG = get_event(listG)
resultH = get_event(listH)
resultI = get_event(listI)
resultJ = get_event(listJ)

100%|██████████| 58463/58463 [12:42<00:00, 76.72it/s]
100%|██████████| 58463/58463 [12:20<00:00, 78.92it/s]
100%|██████████| 58463/58463 [12:37<00:00, 77.16it/s]
100%|██████████| 58463/58463 [12:45<00:00, 76.35it/s]
100%|██████████| 58463/58463 [11:45<00:00, 82.90it/s]
100%|██████████| 58463/58463 [12:17<00:00, 79.29it/s]
100%|██████████| 58463/58463 [12:13<00:00, 79.74it/s]
100%|██████████| 58463/58463 [12:09<00:00, 80.12it/s]
100%|██████████| 58469/58469 [12:15<00:00, 79.54it/s]


In [None]:
# Stack the ten partial results row-wise and renumber the index from 0.
chunk_results = [
    resultA, resultB, resultC, resultD, resultE,
    resultF, resultG, resultH, resultI, resultJ,
]
result = pd.concat(chunk_results, axis=0, ignore_index=True)

In [None]:
# Sanity check: expect one row per user.
viewinfo(result)

(584636, 2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584636 entries, 0 to 584635
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  584636 non-null  object
 1   events   584636 non-null  object
dtypes: object(2)
memory usage: 8.9+ MB



Unnamed: 0,user_id,events
0,1,"[GetCreditInfo, GetCreditInfo]"
1,7,[GetCreditInfo]
2,9,[GetCreditInfo]
3,11,"[GetCreditInfo, UsePrepayCalc, StartLoanApply,..."
4,12,"[StartLoanApply, ViewLoanApplyIntro, StartLoan..."


In [None]:
# Persist the per-user sequences. CSV stores each `events` list as its string
# repr, so it has to be parsed back into a list after reloading.
result.to_csv("../Data/main_events.csv", index=False)

# N-grams

* References
    - [N-gram 모델 구현하기 기초 | Python, NLTK](https://seanpark11.tistory.com/89?category=962465)
    - [확률론적 언어 모형](https://datascienceschool.net/03%20machine%20learning/03.01.05%20%ED%99%95%EB%A5%A0%EB%A1%A0%EC%A0%81%20%EC%96%B8%EC%96%B4%20%EB%AA%A8%ED%98%95.html)

In [None]:
# Human-readable phrase for every raw event name in the log.
event_dict = {
    'SignUp': 'sign up',
    'OpenApp': 'open app',
    'Login': 'login',
    'ViewLoanApplyIntro': 'view loan apply intro',
    'StartLoanApply': 'start loan apply',
    'CompleteIDCertification': 'complete id certification',
    'EndLoanApply': 'end loan apply',
    'UseLoanManage': 'use loan manage',
    'UsePrepayCalc': 'use prepay calculator',
    'UseDSRCalc': 'use debt service ratio calculator',
    'GetCreditInfo': 'get credit information'
}

# Candidate "main" events. Names must match the raw event names exactly
# ('UseLoanManage', not 'UserLoanManage'), otherwise the event can never match.
main_events = ['ViewLoanApplyIntro', 'StartLoanApply', 'UseLoanManage', 'UsePrepayCalc', 'UseDSRCalc', 'GetCreditInfo']

In [None]:
# Reload the saved per-user sequences; `events` comes back as plain strings.
df = pd.read_csv("../Data/main_events.csv")
df.head()

Unnamed: 0,user_id,events
0,1,"['GetCreditInfo', 'GetCreditInfo']"
1,7,['GetCreditInfo']
2,9,['GetCreditInfo']
3,11,"['GetCreditInfo', 'UsePrepayCalc', 'StartLoanA..."
4,12,"['StartLoanApply', 'ViewLoanApplyIntro', 'EndL..."


In [None]:
# Confirm the element type of `events` after the CSV round-trip.
type(df['events'].iloc[0])

str

In [None]:
# The CSV round-trip turned every list into its string repr (e.g. "['A', 'B']").
# Parse it back with ast.literal_eval rather than slicing off brackets/quotes and
# splitting on ', ' by hand. The isinstance guard leaves already-parsed values
# untouched, so re-running this cell is harmless.
df['events'] = df['events'].map(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [None]:
# Preview the parsed sequences.
df.head()

Unnamed: 0,user_id,events
0,1,"[GetCreditInfo, GetCreditInfo]"
1,7,[GetCreditInfo]
2,9,[GetCreditInfo]
3,11,"[GetCreditInfo, UsePrepayCalc, StartLoanApply,..."
4,12,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp..."


In [None]:
# Confirm that `events` now holds real lists.
type(df['events'].iloc[0])

list

In [None]:
# Sequence length per user.
df['num_events'] = df['events'].map(len)
df.head()

Unnamed: 0,user_id,events,num_events
0,1,"[GetCreditInfo, GetCreditInfo]",2
1,7,[GetCreditInfo],1
2,9,[GetCreditInfo],1
3,11,"[GetCreditInfo, UsePrepayCalc, StartLoanApply,...",15
4,12,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",46


In [None]:
# Distribution of sequence lengths across users.
df['num_events'].value_counts()

1      143969
3       61223
4       53599
2       50047
5       32489
        ...  
310         1
231         1
402         1
240         1
355         1
Name: num_events, Length: 328, dtype: int64

## Users with 3 or more events

In [None]:
# Space-separated "sentence" form of each user's event sequence.
df['event_sent'] = df['events'].map(' '.join)

In [None]:
# A trigram needs at least 3 events, so users with shorter sequences are set aside.
# Both subsets are explicit copies: columns are added to `to_ngram` later, and
# assigning into a filtered slice of `df` is the chained-assignment pattern that
# pandas warns about (the warning is hidden by the global warnings filter).
to_ngram = df[df['num_events'] >= 3].copy()
no_ngram = df[df['num_events'] < 3].copy()

In [None]:
# Sanity check on the split: sequence lengths in `no_ngram` should stay below 3.
print(no_ngram['num_events'].max())
print(no_ngram['num_events'].min())

2
1


In [None]:
# Renumber rows 0..n-1 (in place), discarding the index labels inherited from `df`.
to_ngram.reset_index(drop=True, inplace=True)

In [None]:
# Trigrams of each user's event sequence.
# NOTE(review): `ngrams` returns a lazy iterator (displayed as `<zip object ...>`),
# so every cell of this column can be iterated only once and is empty afterwards;
# materialize with list(...) wherever the trigrams must be reused.
to_ngram['trigrams'] = to_ngram['events'].map(lambda x: ngrams(x, n=3))
to_ngram.head(10)

Unnamed: 0,user_id,events,num_events,event_sent,trigrams
0,11,"[GetCreditInfo, UsePrepayCalc, StartLoanApply,...",15,GetCreditInfo UsePrepayCalc StartLoanApply Vie...,<zip object at 0x7f741d105b90>
1,12,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",46,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105a50>
2,17,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",7,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105820>
3,19,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",3,StartLoanApply ViewLoanApplyIntro EndLoanApply,<zip object at 0x7f741d105550>
4,20,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",25,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105370>
5,21,"[ETC, GetCreditInfo, GetCreditInfo, StartLoanA...",8,ETC GetCreditInfo GetCreditInfo StartLoanApply...,<zip object at 0x7f741d1050a0>
6,24,"[GetCreditInfo, StartLoanApply, ViewLoanApplyI...",5,GetCreditInfo StartLoanApply ViewLoanApplyIntr...,<zip object at 0x7f741d172eb0>
7,25,"[GetCreditInfo, StartLoanApply, ViewLoanApplyI...",13,GetCreditInfo StartLoanApply ViewLoanApplyIntr...,<zip object at 0x7f741d172d20>
8,26,"[ETC, ETC, ViewLoanApplyIntro, StartLoanApply,...",7,ETC ETC ViewLoanApplyIntro StartLoanApply View...,<zip object at 0x7f741d172a50>
9,27,"[GetCreditInfo, GetCreditInfo, GetCreditInfo, ...",35,GetCreditInfo GetCreditInfo GetCreditInfo GetC...,<zip object at 0x7f741d172640>


In [None]:
# Materialized trigrams: a reusable list of 3-tuples per user.
to_ngram['trigrams_list'] = to_ngram['events'].map(lambda x: list(ngrams(x, n=3)))
to_ngram.head(10)

Unnamed: 0,user_id,events,num_events,event_sent,trigrams,trigrams_list
0,11,"[GetCreditInfo, UsePrepayCalc, StartLoanApply,...",15,GetCreditInfo UsePrepayCalc StartLoanApply Vie...,<zip object at 0x7f741d105b90>,"[(GetCreditInfo, UsePrepayCalc, StartLoanApply..."
1,12,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",46,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105a50>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA..."
2,17,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",7,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105820>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA..."
3,19,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",3,StartLoanApply ViewLoanApplyIntro EndLoanApply,<zip object at 0x7f741d105550>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA..."
4,20,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",25,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105370>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA..."
5,21,"[ETC, GetCreditInfo, GetCreditInfo, StartLoanA...",8,ETC GetCreditInfo GetCreditInfo StartLoanApply...,<zip object at 0x7f741d1050a0>,"[(ETC, GetCreditInfo, GetCreditInfo), (GetCred..."
6,24,"[GetCreditInfo, StartLoanApply, ViewLoanApplyI...",5,GetCreditInfo StartLoanApply ViewLoanApplyIntr...,<zip object at 0x7f741d172eb0>,"[(GetCreditInfo, StartLoanApply, ViewLoanApply..."
7,25,"[GetCreditInfo, StartLoanApply, ViewLoanApplyI...",13,GetCreditInfo StartLoanApply ViewLoanApplyIntr...,<zip object at 0x7f741d172d20>,"[(GetCreditInfo, StartLoanApply, ViewLoanApply..."
8,26,"[ETC, ETC, ViewLoanApplyIntro, StartLoanApply,...",7,ETC ETC ViewLoanApplyIntro StartLoanApply View...,<zip object at 0x7f741d172a50>,"[(ETC, ETC, ViewLoanApplyIntro), (ETC, ViewLoa..."
9,27,"[GetCreditInfo, GetCreditInfo, GetCreditInfo, ...",35,GetCreditInfo GetCreditInfo GetCreditInfo GetC...,<zip object at 0x7f741d172640>,"[(GetCreditInfo, GetCreditInfo, GetCreditInfo)..."


In [None]:
# Per-user conditional frequency table: condition = (event[i], event[i+1]),
# sample = event[i+2].
# Built from the materialized `trigrams_list` column. The `trigrams` column holds
# one-shot iterators that are exhausted after a single pass, so building from it
# yields empty tables whenever this cell is run a second time.
to_ngram['dist'] = to_ngram['trigrams_list'].map(lambda x: nltk.ConditionalFreqDist([((t[0], t[1]), t[2]) for t in x]))
to_ngram

Unnamed: 0,user_id,events,num_events,event_sent,trigrams,trigrams_list,dist
0,11,"[GetCreditInfo, UsePrepayCalc, StartLoanApply,...",15,GetCreditInfo UsePrepayCalc StartLoanApply Vie...,<zip object at 0x7f741d105b90>,"[(GetCreditInfo, UsePrepayCalc, StartLoanApply...","{('GetCreditInfo', 'UsePrepayCalc'): {'StartLo..."
1,12,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",46,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105a50>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E..."
2,17,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",7,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105820>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E..."
3,19,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",3,StartLoanApply ViewLoanApplyIntro EndLoanApply,<zip object at 0x7f741d105550>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E..."
4,20,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",25,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105370>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E..."
...,...,...,...,...,...,...,...
390615,879692,"[GetCreditInfo, StartLoanApply, ViewLoanApplyI...",16,GetCreditInfo StartLoanApply ViewLoanApplyIntr...,<zip object at 0x7f72a2bdc4b0>,"[(GetCreditInfo, StartLoanApply, ViewLoanApply...","{('GetCreditInfo', 'StartLoanApply'): {'ViewLo..."
390616,879693,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",19,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f72a2bdc690>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E..."
390617,879694,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",3,StartLoanApply ViewLoanApplyIntro EndLoanApply,<zip object at 0x7f72a2bdc870>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E..."
390618,879695,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",3,StartLoanApply ViewLoanApplyIntro EndLoanApply,<zip object at 0x7f72a2bdca50>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E..."


In [None]:
# Turn each frequency table into maximum-likelihood conditional probabilities:
# one MLEProbDist per (event[i], event[i+1]) condition.
to_ngram['prob'] = to_ngram['dist'].map(lambda x: ConditionalProbDist(x, MLEProbDist))
to_ngram

Unnamed: 0,user_id,events,num_events,event_sent,trigrams,trigrams_list,dist,prob
0,11,"[GetCreditInfo, UsePrepayCalc, StartLoanApply,...",15,GetCreditInfo UsePrepayCalc StartLoanApply Vie...,<zip object at 0x7f741d105b90>,"[(GetCreditInfo, UsePrepayCalc, StartLoanApply...","{('GetCreditInfo', 'UsePrepayCalc'): {'StartLo...","{('GetCreditInfo', 'UsePrepayCalc'): <MLEProbD..."
1,12,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",46,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105a50>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML..."
2,17,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",7,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105820>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML..."
3,19,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",3,StartLoanApply ViewLoanApplyIntro EndLoanApply,<zip object at 0x7f741d105550>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML..."
4,20,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",25,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105370>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML..."
...,...,...,...,...,...,...,...,...
390615,879692,"[GetCreditInfo, StartLoanApply, ViewLoanApplyI...",16,GetCreditInfo StartLoanApply ViewLoanApplyIntr...,<zip object at 0x7f72a2bdc4b0>,"[(GetCreditInfo, StartLoanApply, ViewLoanApply...","{('GetCreditInfo', 'StartLoanApply'): {'ViewLo...","{('GetCreditInfo', 'StartLoanApply'): <MLEProb..."
390616,879693,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",19,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f72a2bdc690>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML..."
390617,879694,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",3,StartLoanApply ViewLoanApplyIntro EndLoanApply,<zip object at 0x7f72a2bdc870>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML..."
390618,879695,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",3,StartLoanApply ViewLoanApplyIntro EndLoanApply,<zip object at 0x7f72a2bdca50>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML..."


In [None]:
# Inspect the model of a single user (row `i`): probability of each main event
# given that user's two most recent events.
i = 0
dist = to_ngram['dist'].iloc[i]
# Use the same row `i` for the context as for the distribution (previously a
# hard-coded 0, which silently mixed users for any other `i`).
last_prev, last = to_ngram['events'].iloc[i][-2:]

cpd = ConditionalProbDist(dist, MLEProbDist)
print(cpd)
print(cpd[(last_prev, last)])

for item in main_events:
    print(f"Prob of {item} given {last_prev, last} -> {cpd[(last_prev, last)].prob(item)}")

<ConditionalProbDist with 8 conditions>
<MLEProbDist based on 3 samples>
Prob of ViewLoanApplyIntro given ('ViewLoanApplyIntro', 'EndLoanApply') -> 0.0
Prob of StartLoanApply given ('ViewLoanApplyIntro', 'EndLoanApply') -> 0.6666666666666666
Prob of UserLoanManage given ('ViewLoanApplyIntro', 'EndLoanApply') -> 0.0
Prob of UsePrepayCalc given ('ViewLoanApplyIntro', 'EndLoanApply') -> 0.0
Prob of UseDSRCalc given ('ViewLoanApplyIntro', 'EndLoanApply') -> 0.0
Prob of GetCreditInfo given ('ViewLoanApplyIntro', 'EndLoanApply') -> 0.3333333333333333


In [None]:
# For every user, pick the most and the least likely next main event given the
# user's last two events. Ties are broken at random, so the generator is seeded
# to make the saved results reproducible.
SEED = 42
rng = np.random.default_rng(SEED)
candidates = np.array(main_events)  # loop-invariant, built once

maximal_action = []
minimal_action = []
for events, cpd in tqdm(zip(to_ngram['events'], to_ngram['prob']), total=to_ngram.shape[0]):
    last_prev, last = events[-2:]
    next_dist = cpd[(last_prev, last)]
    scores = np.array([next_dist.prob(item) for item in main_events])

    # Most likely event; when all scores tie (e.g. all zero) this is a uniformly
    # random pick among the tied candidates.
    max_item = rng.choice(candidates[scores == scores.max()])

    # Least likely event: candidates at the minimum score, restricted to non-zero
    # scores. If that leaves nothing, fall back to all candidates at the minimum.
    # (Explicit check instead of a bare `except:` around the random choice.)
    # NOTE(review): the non-zero restriction only survives when *no* candidate has
    # probability 0 -- confirm whether "smallest positive probability" was intended.
    min_mask = (scores == scores.min()) & (scores > 0)
    if not min_mask.any():
        min_mask = scores == scores.min()
    min_item = rng.choice(candidates[min_mask])

    maximal_action.append(max_item)
    minimal_action.append(min_item)

to_ngram['maximal_action'] = maximal_action
to_ngram['minimal_action'] = minimal_action

100%|██████████| 390620/390620 [01:04<00:00, 6014.14it/s]


In [None]:
# Show the frame with the two new action columns.
to_ngram

Unnamed: 0,user_id,events,num_events,event_sent,trigrams,trigrams_list,dist,prob,maximal_action,minimal_action
0,11,"[GetCreditInfo, UsePrepayCalc, StartLoanApply,...",15,GetCreditInfo UsePrepayCalc StartLoanApply Vie...,<zip object at 0x7f741d105b90>,"[(GetCreditInfo, UsePrepayCalc, StartLoanApply...","{('GetCreditInfo', 'UsePrepayCalc'): {'StartLo...","{('GetCreditInfo', 'UsePrepayCalc'): <MLEProbD...",StartLoanApply,ViewLoanApplyIntro
1,12,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",46,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105a50>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML...",StartLoanApply,UserLoanManage
2,17,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",7,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105820>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML...",GetCreditInfo,ViewLoanApplyIntro
3,19,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",3,StartLoanApply ViewLoanApplyIntro EndLoanApply,<zip object at 0x7f741d105550>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML...",GetCreditInfo,UseDSRCalc
4,20,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",25,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f741d105370>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML...",StartLoanApply,ViewLoanApplyIntro
...,...,...,...,...,...,...,...,...,...,...
390615,879692,"[GetCreditInfo, StartLoanApply, ViewLoanApplyI...",16,GetCreditInfo StartLoanApply ViewLoanApplyIntr...,<zip object at 0x7f72a2bdc4b0>,"[(GetCreditInfo, StartLoanApply, ViewLoanApply...","{('GetCreditInfo', 'StartLoanApply'): {'ViewLo...","{('GetCreditInfo', 'StartLoanApply'): <MLEProb...",UsePrepayCalc,GetCreditInfo
390616,879693,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",19,StartLoanApply ViewLoanApplyIntro EndLoanApply...,<zip object at 0x7f72a2bdc690>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML...",UsePrepayCalc,GetCreditInfo
390617,879694,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",3,StartLoanApply ViewLoanApplyIntro EndLoanApply,<zip object at 0x7f72a2bdc870>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML...",UserLoanManage,UsePrepayCalc
390618,879695,"[StartLoanApply, ViewLoanApplyIntro, EndLoanAp...",3,StartLoanApply ViewLoanApplyIntro EndLoanApply,<zip object at 0x7f72a2bdca50>,"[(StartLoanApply, ViewLoanApplyIntro, EndLoanA...","{('StartLoanApply', 'ViewLoanApplyIntro'): {'E...","{('StartLoanApply', 'ViewLoanApplyIntro'): <ML...",UseDSRCalc,UseDSRCalc


In [None]:
# How often each main event was selected as the most likely next action.
to_ngram['maximal_action'].value_counts()

GetCreditInfo         113675
StartLoanApply         96258
ViewLoanApplyIntro     52329
UsePrepayCalc          42829
UserLoanManage         42812
UseDSRCalc             42717
Name: maximal_action, dtype: int64

In [None]:
# How often each main event was selected as the least likely next action.
to_ngram['minimal_action'].value_counts()

UserLoanManage        73086
UsePrepayCalc         72983
UseDSRCalc            72864
ViewLoanApplyIntro    66554
StartLoanApply        54347
GetCreditInfo         50786
Name: minimal_action, dtype: int64

In [None]:
# Persist the enriched frame.
# NOTE(review): `trigrams`, `dist` and `prob` hold Python objects, so the CSV only
# contains their string repr (e.g. "<zip object at 0x...>") -- confirm whether
# these columns are needed downstream.
to_ngram.to_csv("../Data/ngram_.csv", index=False)

In [None]:
# Persist the users with fewer than 3 events (the frame is named `no_ngram`;
# `Datano_ngram` was an undefined name and raised NameError).
no_ngram.to_csv("../Data/no_ngram_.csv", index=False)