In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Load the raw Slack export.
df = pd.read_csv('../data/ods_slack_all.csv')

# Key every message as "<user>_<ts rounded to 0 decimals>"; the key is used
# both as the frame index and as an ordinary column for later lookups.
message_key = df['user'] + '_' + df['ts'].round(0).astype(str)
df.index = message_key
df['new_ind'] = df.index

def get_pos_score(reactions):
    """Sum the ``count`` of every reaction whose ``name`` is a positive emoji.

    Args:
        reactions: iterable of mappings; each is read through its ``'name'``
            key and, when the name is positive, its ``'count'`` key.

    Returns:
        The summed counts of the positive reactions, or 0 if none match.
    """
    positive_reactions = frozenset((
        'heavy_plus_sign',
        '+1',
        'tnx',
        'omgtnx',
        'muscle',
        'fireball',
        'shaka',
        'fire',
        'heart',
    ))
    return sum(
        item['count']
        for item in reactions
        if item['name'] in positive_reactions
    )


# Build a lookup: message key (index label) -> total positive-reaction count,
# for every message whose `reactions` cell is not null.
reactions_raw = df['reactions'].dropna()
pos_score_dict = {}
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
# Passing `total` lets tqdm render a real progress bar for the generator.
for row_ind, row in tqdm(reactions_raw.items(), total=len(reactions_raw)):
    # NOTE(security): eval() executes whatever expression the CSV cell holds.
    # Only run this on a trusted export; ast.literal_eval is the safer choice
    # if the cells are plain Python literals — TODO confirm and switch.
    pos_score = get_pos_score(eval(row))
    pos_score_dict[row_ind] = pos_score

  interactivity=interactivity, compiler=compiler, result=result)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [2]:
# Lookup: message key -> message text (a repeated key keeps its last value).
text_dict = dict(zip(df['new_ind'].values, df['text'].values))

In [3]:
# Lookup: message key -> channel value (a repeated key keeps its last value).
channel_dict = dict(zip(df['new_ind'].values, df['channel'].values))

In [4]:
from collections import defaultdict

# How many of the highest-scored replies are merged into one answer text.
TOP_K_REPLIES = 10

all_index = set(df.index)
replies_dict = defaultdict(list)  # thread key -> keys of its replies found in df
# NOTE(review): best_ans, best_ans_pos_score and best_ans_replies are created
# here but nothing in this cell writes to them (the code that did was
# commented out), so any later cell reading them sees empty dicts.
best_ans = {}
best_ans_pos_score = {}
success_count = 0    # replies whose key exists in df
fail_count = 0       # replies whose key is missing from df
eval_fail_count = 0  # rows whose `replies` cell could not be evaluated
best_ans_replies = {}

lines = []
threads = df[df['replies'].notna()]
for row_ind, row in tqdm(threads.iterrows(), total=len(threads)):
    try:
        # NOTE(security): eval() executes whatever expression the CSV cell
        # holds. Only run this on a trusted export; ast.literal_eval is the
        # safer choice if the cells are plain Python literals — TODO confirm.
        replies = eval(row['replies'])
    except Exception:
        eval_fail_count += 1
        # Reset so the loop below does not reuse the previous row's replies
        # (or hit a NameError when the very first row fails).
        replies = []

    for reply in replies:
        # Same key format as the df index: "<user>_<ts rounded to 0 decimals>".
        new_ind = f'{reply["user"]}_{round(float(reply["ts"]), 0)}'

        if new_ind in all_index:
            success_count += 1
            replies_dict[row_ind].append(new_ind)
        else:
            fail_count += 1

    # Indexing the defaultdict registers row_ind as a key even when no reply
    # matched, so every processed thread ends up as a key of replies_dict.
    if replies_dict[row_ind]:
        row_replies = [(x, pos_score_dict.get(x, 0)) for x in replies_dict[row_ind]]
        # sorted() is stable, so replies with equal scores keep thread order.
        best_k_replies = sorted(row_replies, key=lambda x: x[1], reverse=True)[:TOP_K_REPLIES]
        replies_id_list = [k for k, v in best_k_replies]
        replies_pos_score_list = [v for k, v in best_k_replies]

        best_k_ans_text = ' '.join(text_dict[k] for k in replies_id_list)
        best_k_score = sum(replies_pos_score_list)
        line = dict(
            new_ind=row_ind,
            text=text_dict[row_ind],
            answer_text=best_k_ans_text,
            ans_score=best_k_score,
            pos_score=pos_score_dict.get(row_ind, 0),
            channel=channel_dict.get(row_ind),
        )
        lines.append(line)

# One row per thread that has at least one reply present in df.
df_export = pd.DataFrame(lines)
df_export.head()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,new_ind,text,answer_text,ans_score,pos_score,channel
0,U1NPQ2WPN_1548139899.0,"Добрый день, коллеги. Ищу человека, который см...","Еще накину: лучше не переводи 1:1, а переписыв...",21,8,_call_4_collaboration
1,UK188NRKM_1585138381.0,Всем привет! Кто-нибудь нуждается в услугах се...,если вы занимались разметкой дорожного траффик...,0,3,_call_4_collaboration
2,U2LF67NRH_1585152478.0,пытаюсь добыть исторический датасет с <https:/...,"""предиктит"" - звучит как хроническое заболеван...",0,0,_call_4_collaboration
3,U2X57QH7X_1490439541.0,Всем привет! Ищу R-гуру для моего проекта. Нуж...,"в <#C044227RA|lang_r> спрашивали? Нет, решил п...",0,0,_call_4_collaboration
4,U2KFFGDT8_1490461140.0,"День добрый! \nИщу Ментора/Наставника, с котор...",(продолжая общую мысль про спрашивать в публич...,22,2,_call_4_collaboration


In [5]:
# Write the per-thread question/answer table to CSV; index=False leaves the
# DataFrame index out of the file.
df_export.to_csv('../data/ods_answers_eval.csv', index=False)

In [None]:
# Count the rows whose `replies` cell is not null (typo in the label fixed).
print('total questions with answers:', df['replies'].notna().sum())

In [None]:
# Flag every row whose key is currently a key of `replies_dict`.
# NOTE(review): replies_dict is keyed by the thread row's key, not by reply
# keys, so `is_answer` looks like it marks threads rather than answers —
# confirm the intent.
df['is_answer'] = df['new_ind'].map(lambda key: key in replies_dict)
print(df['is_answer'].value_counts())

In [None]:
# Refresh the key column from the index, then attach per-message lookups.
df['new_ind'] = df.index
# np.nan is used because the `np.NaN` alias was removed in NumPy 2.0.
# NOTE(review): best_ans and best_ans_pos_score are initialised as empty dicts
# and no visible code fills them, so these two columns are presumably all-NaN
# — confirm before relying on them.
df['best_ans'] = df['new_ind'].apply(lambda x: best_ans.get(x, np.nan))
df['best_ans_score'] = df['new_ind'].apply(lambda x: best_ans_pos_score.get(x, np.nan))
# Messages without an entry in pos_score_dict get a score of 0.
df['pos_score'] = df['new_ind'].apply(lambda x: pos_score_dict.get(x, 0))

In [None]:
# One record per entry of best_ans_replies: the thread key plus the
# space-joined texts of the reply keys stored for it.
ans_text_lines = [
    {
        'new_ind': thread_key,
        'best_k_ans_text': ' '.join(text_dict[reply_key] for reply_key in reply_keys),
    }
    for thread_key, reply_keys in best_ans_replies.items()
]
df_replies_text = pd.DataFrame(ans_text_lines)
df_replies_text.head()

In [None]:
# Key/text pairs renamed so they can be joined onto df through `best_ans`.
# NOTE(review): `is_answer` is assigned from a membership test, so it is
# presumably never null and this mask keeps every row — confirm the intent.
has_flag = df['is_answer'].notna()
df_new = df.loc[has_flag, ['new_ind', 'text']].rename(
    columns={'new_ind': 'best_ans', 'text': 'answer_text'}
)

In [None]:
# Size check: (rows, columns) of the per-thread answer-text frame.
df_replies_text.shape

In [None]:
# Size check: (rows, columns) of the full message frame.
df.shape

In [None]:
# Size check: (rows, columns) of the renamed key/text frame.
df_new.shape

In [None]:
# Left-join the answer texts onto every message via the `best_ans` key, so
# messages without a match keep their row with a missing answer_text.
df_prepared = df.merge(df_new, on='best_ans', how='left')
df_prepared.shape