In [1]:
import json
import pandas as pd
import numpy as np
import os

In [2]:
### define functions here

# mongodb needs encoding
def open_mongojson(jsonpath):
    with open(jsonpath, 'rb') as infile:    
        json_block = [json.loads(line.decode('utf-8')) for line in infile]
    return json_block
# remove trashkeys from df
def refine_df(df, disposing_keys_list):
    return df.drop(disposing_keys_list, axis=1)

In [3]:
# subkeys extraction with refinement 
def get_subkey(txtpath):
    with open(txtpath, 'r', encoding= 'utf-8') as f: #if encoding error, run fix_encoding.sh
        subkeys = f.readlines()
    subkeys_refined = [subkeys[i].rstrip('\n') for i in range(len(subkeys))]
    return subkeys_refined

# list only .(ext) files 
def list_only_ext(target_path, fext):
    extf_list = []
    for entity in os.listdir(target_path):
        if entity.split('.')[-1]==fext:
            extf_list.append(entity)
        else: pass
    return extf_list

def extract_model_user( fname, port2modelname):
    #there is no _ in email addresses 
    user = fname.split('_')[-1].split('.')[0]
    port = fname.split('_')[0]
    modelname = port2modelname[port]
    return modelname, user

def wrap_by_user(target_path, port2modelname):
    res_wrap = {}
    txt_list = list_only_ext(target_path, 'txt')
    for file in txt_list:
        model, user = extract_model_user(file, port2modelname)
        sub_keys = get_subkey(file)
        res_wrap[user] = [user, model, sub_keys] 
    return res_wrap

In [4]:
### declare vars here
models = ['GT', 'glacnet', 'glac_no_cas', 'glac_no_glob','glac_no_loc','crcn5ep', 'srt_mmonly']
ports = [str(entry) for entry in [33003,33007,33011,33015,33019,33023,33027] ]
port2modelname = dict(zip(ports, models))
jsonfiles_list = list_only_ext('./jsons2245', 'json')
jsonfiles_list

['crcn_2251.json',
 'glac_noloc2245.json',
 'glac_noglob2245.json',
 'srt_mmonly2251.json',
 'GT2245.json',
 'glac_nocas2245.json',
 'glac2245.json']

In [5]:
json_dir = 'jsons2245/'
jsonfiles_list = list_only_ext(json_dir, 'json')
txt_files_list = list_only_ext('./', 'txt')
jsons_dfs = [refine_df(pd.DataFrame(open_mongojson(json_dir+file)), ['__v', '_id']) for file in jsonfiles_list]

print(len(jsons_dfs))
for i,df in enumerate(jsons_dfs):
    df['original_file'] = jsonfiles_list[i].split('.')[0] 


7


In [6]:
whole_models_df = pd.concat(jsons_dfs)


In [7]:
# add empty cols for recording
whole_models_df['model'] = ""
whole_models_df['user'] = ""

len(whole_models_df)

990

In [8]:
user_info = wrap_by_user('./', port2modelname)

In [9]:
### fill the empty column with username, models from port

# username == email
for username in user_info.keys():
    sub_keys = user_info[username][2]
    for sub in sub_keys:
        subkey_filter = (whole_models_df['submission_key'] == sub)
        whole_models_df.loc[subkey_filter, 'user'] = username
        whole_models_df.loc[subkey_filter, 'model'] = user_info[username][1] #modelname
empty_user_filter = (whole_models_df['user']=="")
whole_models_df.loc[empty_user_filter,'user'] = "N/A"


In [10]:
whole_models_df[['time_spent','user','model', 'original_file']][600:700]

Unnamed: 0,time_spent,user,model,original_file
67,247.134,,,GT2245
68,248.647,,,GT2245
69,250.430,,,GT2245
70,252.090,,,GT2245
71,255.646,,,GT2245
72,257.180,,,GT2245
73,258.478,,,GT2245
74,73.595,,,GT2245
75,1.038,,,GT2245
76,109.528,,,GT2245


In [11]:
### users, columns
user_info.keys(), whole_models_df.columns

(dict_keys(['hanseongyou', 'wschoi', 'sungjy80', 'rlawjdghks', 'bona', 'mkjh', 'lillysong31', 'hckim6077', 'jinyoungwoo', 'jisunkoh', 'plcmj', 'mwg', 'yoonjiheo', 'nohrosa', 'kbkim', 'leahpark0928', 'yunsiksung', 'rlaalsdn567', 'yunjiheo', 'jhjun', 'sorayoon', 'dai712', 'woochankim', 'seheehan', 'sonminji', 'eulkim', 'ymt']),
 Index(['coherent0', 'coherent1', 'coherent2', 'coherent3', 'coherent4',
        'detailed0', 'detailed1', 'detailed2', 'detailed3', 'detailed4',
        'focused0', 'focused1', 'focused2', 'focused3', 'focused4', 'grounded0',
        'grounded1', 'grounded2', 'grounded3', 'grounded4', 'human0', 'human1',
        'human2', 'human3', 'human4', 'share0', 'share1', 'share2', 'share3',
        'share4', 'story_id0', 'story_id1', 'story_id2', 'story_id3',
        'story_id4', 'submission_key', 'time_spent', 'original_file', 'model',
        'user'],
       dtype='object'))

In [46]:
### scanning

def filter_df_by_user(df,user): #
    filter_ = whole_models_df['user'] == user
    return whole_models_df[filter_][['user', 'time_spent','submission_key', 'original_file', 'model']]

def filter_df_by_model(df, original_file_model):
    filter_ = whole_models_df['original_file'] == original_file_model
    return whole_models_df[filter_]

def user_with_0_record(df,user): #need to track what's happening with this user.
    filtered_df = filter_df_by_user(df,user)
    if len(filtered_df)>0: 
        print(user, "\t\tdata seemed ok.\t", len(filtered_df), "\trecords in df")
    else:
        print(user, " has no record!!!!")
        return user

    
    
    
### filtrate the df

def remove_row(df):
    

# 아래껄 볼 column으로 할 수도 있겠다. 그래서 remove_row에 집어넣는거지
#1. remove duplicated by repeat-post

def remove_repeated_post(df):
    # for repeated post, it's uncertain whether keys recorded are regenerated or first generated (exclude 'user')
    # it is possible that quite long delay btw re-post (exclude 'time_spent' )
    tobe_inspected = ['coherent0', 'coherent1', 'coherent2', 'coherent3', 'coherent4',
        'detailed0', 'detailed1', 'detailed2', 'detailed3', 'detailed4',
        'focused0', 'focused1', 'focused2', 'focused3', 'focused4', 'grounded0',
        'grounded1', 'grounded2', 'grounded3', 'grounded4', 'human0', 'human1',
        'human2', 'human3', 'human4', 'share0', 'share1', 'share2', 'share3',
        'share4', 'story_id0', 'story_id1', 'story_id2', 'story_id3',
        'story_id4']
    target_df = df
    for idx, row in df.iterrows():
        refrow = row[tobe_inspected][index]
        for idx_, row_ in df.iterrows():
            if (refrow == row_[tobe_inspected]).all(): # if the same row detected
                df = remove_row(df, row)
    return df

                
#2. remove_repeated_pattern for the user
def remove_repeated_pattern(df):
    tobe_inspected = ['coherent0', 'coherent1', 'coherent2', 'coherent3', 'coherent4',
    'detailed0', 'detailed1', 'detailed2', 'detailed3', 'detailed4',
    'focused0', 'focused1', 'focused2', 'focused3', 'focused4', 'grounded0',
    'grounded1', 'grounded2', 'grounded3', 'grounded4', 'human0', 'human1',
    'human2', 'human3', 'human4', 'share0', 'share1', 'share2', 'share3',
    'share4', 'user']
    for idx,row in df.iterrows():
        refrow = row[tobe_inspected]
        if (refrow == row_[tobe_inspected]).all():
            for idx_, row_ in df.iterrows():
                if (refrow == row_[tobe_inspected]).all(): # if the same row detected
                    df = remove_row(row_, df)
    return df

def remove_too_rushed_survey(df, model):
    models_list = list(set(df['original_file'].values))
    for model in models_list:
        df = filter_df_by_model(df, model)
    
    

SyntaxError: invalid syntax (<ipython-input-46-47dce2a541ee>, line 40)

In [48]:
problem = []
for user in user_info.keys():
    user = user_with_0_record(whole_models_df, user)
    if user ==None: pass
    else: problem.append(user)
problem

hanseongyou 		data seemed ok.	 8 	records in df
wschoi 		data seemed ok.	 13 	records in df
sungjy80  has no record!!!!
rlawjdghks 		data seemed ok.	 12 	records in df
bona 		data seemed ok.	 18 	records in df
mkjh 		data seemed ok.	 8 	records in df
lillysong31 		data seemed ok.	 12 	records in df
hckim6077 		data seemed ok.	 8 	records in df
jinyoungwoo 		data seemed ok.	 18 	records in df
jisunkoh  has no record!!!!
plcmj 		data seemed ok.	 18 	records in df
mwg 		data seemed ok.	 17 	records in df
yoonjiheo 		data seemed ok.	 8 	records in df
nohrosa 		data seemed ok.	 7 	records in df
kbkim 		data seemed ok.	 30 	records in df
leahpark0928  has no record!!!!
yunsiksung 		data seemed ok.	 18 	records in df
rlaalsdn567 		data seemed ok.	 18 	records in df
yunjiheo 		data seemed ok.	 18 	records in df
jhjun 		data seemed ok.	 20 	records in df
sorayoon 		data seemed ok.	 18 	records in df
dai712 		data seemed ok.	 40 	records in df
woochankim 		data seemed ok.	 21 	records in df
sehe

['sungjy80', 'jisunkoh', 'leahpark0928']

In [45]:
remove_repeated_post(whole_models_df)

# From here we need to fill column with email, modelname from filename

1. from file extract portnumber and email_id
2. read sub_keys
3. find matching keys to fill in the empty column

In [15]:
mmtest_df.columns

NameError: name 'mmtest_df' is not defined

In [52]:
user_info = wrap_by_email(os.getcwd(), port2modelname)
# user_info is instance of users
# user_info['email(or username)'] = [email, model, [listofkeys... ]]

Unnamed: 0,email,modelname
40,hanseongyou,crcn5ep
43,hanseongyou,crcn5ep
45,hanseongyou,crcn5ep
48,hanseongyou,crcn5ep
49,hanseongyou,crcn5ep
51,hanseongyou,crcn5ep
53,hanseongyou,crcn5ep
55,hanseongyou,crcn5ep
