In [1]:
import pandas as pd
import json
import base64

from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load the raw chat export. The encoding is given explicitly because the
# default used by open() depends on the platform locale, while JSON text is
# UTF-8; without it, non-ASCII chat messages can fail to decode or be garbled
# on systems whose default encoding is not UTF-8.
with open('data/chat.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Flatten the loaded JSON into a DataFrame; nested keys become dot-separated
# column names (e.g. 'video.created_at', 'embeddedData.firstParty').
chatdf = pd.json_normalize(data)

In [4]:
# All payloads are read from the row labelled 0 of the flattened export.
# The three nested collections are expanded into their own frames; the
# video creation timestamp is kept as the raw value stored in the export.
starttime = chatdf.at[0, 'video.created_at']
chat = pd.json_normalize(chatdf.at[0, 'comments'])
fp = pd.json_normalize(chatdf.at[0, 'embeddedData.firstParty'])
tp = pd.json_normalize(chatdf.at[0, 'embeddedData.thirdParty'])

In [5]:
def f(x):
    """Return ``(emoticon_id, text)`` for a message fragment that carries an
    emoticon, or ``None`` when the fragment's 'emoticon' entry is ``None``."""
    emoticon = x['emoticon']
    if emoticon is None:
        return None
    return (emoticon['emoticon_id'], x['text'])


# Collect the distinct (emoticon_id, text) pairs seen in any message
# fragment, then turn them into an id -> text lookup.
emotedict = dict({
    pair
    for fragments in chat['message.fragments']
    for pair in map(f, fragments)
    if pair is not None
})

# Name each first-party emote after the text it appeared as in chat.
# A first-party id that never occurs in a fragment raises KeyError.
fp['name'] = fp['id'].map(lambda emote_id: emotedict[emote_id])

In [6]:
# Columns that identify a message (everything else becomes an emote flag).
META_COLS = ['time', '_id', 'commenter._id']
# Number of most-used emotes kept as indicator columns.
TOP_N_EMOTES = 250

# Third-party emote name -> id lookup, built once. The previous version
# filtered the whole `tp` frame for every word of every message, which is
# needlessly slow and raised ValueError from `.item()` if a name occurred
# more than once (with a dict the last occurrence wins). An empty `tp`
# frame (no 'name'/'id' columns) simply yields no third-party matches.
_tp_ids = dict(zip(tp['name'], tp['id'])) if {'name', 'id'} <= set(tp.columns) else {}


def tpf(x):
    """Return the third-party emote id whose name equals the word ``x``,
    or ``None`` when ``x`` is not a known third-party emote name."""
    return _tp_ids.get(x)


chat = (chat
        # Offset of each message from the video start, in whole seconds
        # (truncated). `Timedelta.seconds` must NOT be used here: it is only
        # the seconds *component* (0..86399), so it wraps for offsets of a
        # day or more and is wrong for negative offsets.
        .assign(time = lambda df: (pd.to_datetime(df['created_at']) - pd.to_datetime(starttime))
                                  .dt.total_seconds()
                                  .astype(int))
        # Third-party emotes are detected by matching space-separated words
        # of the message body against the known emote names.
        .assign(tpset = lambda df: df['message.body'].apply(
            lambda body: [emote_id for word in body.split(' ') if (emote_id := tpf(word)) is not None]))
        # First-party emotes are read from the message's 'emoticons' entries.
        .assign(fpset = lambda df: df['message.emoticons'].apply(lambda x: [d.get('_id') for d in x]))
        # Distinct emote ids used in the message, regardless of source.
        .assign(emoteset = lambda df: (df['fpset'] + df['tpset']).apply(set))
        [META_COLS + ['emoteset']]
)

# One 0/1 indicator column per emote id.
mlb = MultiLabelBinarizer()
emote_flags = pd.DataFrame(
    mlb.fit_transform(chat['emoteset']),
    index=chat.index,
    columns=mlb.classes_
)
chat = chat.drop(columns='emoteset').join(emote_flags)

# Emote ids ordered by the number of messages using them, most used first.
topemotes = list(chat.drop(columns=META_COLS).sum().sort_values(ascending=False).iloc[:TOP_N_EMOTES].index)

chat = chat[META_COLS + topemotes]

In [7]:
# Record which table each emote came from before stacking them.
fp['source'] = 'fp'
tp['source'] = 'tp'

# Single emote table with a fresh 0..n-1 index, limited to the top emotes.
emotes = pd.concat([fp, tp], ignore_index=True)
emotes = emotes[emotes['id'].isin(topemotes)]

# Decode every emote's base64 payload to a PNG file. Columns are addressed
# by position, as before: position 0 supplies the file stem and position 2
# the base64 text.
for row in emotes.itertuples(index=False, name=None):
    file_stem, encoded_image = row[0], row[2]
    with open(f'data/emotes/{file_stem}.png', 'wb') as fh:
        fh.write(base64.b64decode(encoded_image))

# Per-emote usage totals taken from the indicator columns of `chat`.
emote_counts = (chat
                .drop(columns=['time', '_id', 'commenter._id'])
                .sum()
                .sort_values(ascending=False)
                .to_frame('count'))

# Keep the descriptive columns and attach each emote's count via its id.
emotes = emotes[['name', 'id', 'source']].join(emote_counts, on='id')

In [8]:
# Persist the emote table (name, id, source, count) without the row index.
emotes.to_csv('data/tables/emotes.csv', index=False)

In [10]:
# Reshape the wide 0/1 emote indicator table into one row per message with
# the list of emote ids used in that message.
id_cols = ['time', '_id', 'commenter._id']

# Long format: one row per (message, emote column) with its 0/1 flag.
meltdf = pd.melt(chat, id_vars=id_cols)

# Blank out the emote id where the emote was not used, so every message
# still forms a group below even when it contains no emotes at all.
meltdf.loc[meltdf['value'] == 0, 'variable'] = None
meltdf = meltdf.groupby(id_cols)['variable'].apply(set).reset_index()

# Drop the "unused" placeholder and emit the ids in sorted order.
# - pd.notna filters the placeholder whether it is stored as None or was
#   coerced to NaN, and never raises. The previous `set.remove(None)` raised
#   KeyError whenever the placeholder was absent from a group.
# - Sorting makes the written CSV reproducible; iterating a set of strings
#   has no stable order between runs.
meltdf['emotes'] = meltdf['variable'].apply(
    lambda ids: sorted(emote_id for emote_id in ids if pd.notna(emote_id))
)
meltdf = meltdf[id_cols + ['emotes']]

meltdf.to_csv('data/tables/chat.csv', index=False)

In [11]:
# Display the messages whose 'emotes' list contains the emote id '505187'.
meltdf.loc[meltdf['emotes'].apply(lambda x: '505187' in x)]

Unnamed: 0,time,_id,commenter._id,emotes
287,376,0d20e82d-4e34-47d4-bd2e-978db0135280,764740029,[505187]
502,693,dc49bbcd-f848-4a2f-875e-cf5c94ceffb8,647158624,[505187]
594,824,72ddd9bc-75ee-4bc2-80af-a1a88ca9990a,49208995,[505187]
669,929,b1a31c20-ae1c-4f93-ba15-a9648aae8594,445428595,[505187]
858,1190,13f62d28-e059-45b9-a7be-3b7281a494c5,616849753,[505187]
...,...,...,...,...
148946,24086,233f0f78-3e84-442c-b8c2-a335bc75f839,44071662,[505187]
149437,24137,e0604007-3d1c-4073-8683-c1e021822879,64375413,[505187]
150264,24403,5a7d3c8a-40a6-400a-ab82-2d06d7995eea,153597785,[505187]
150356,24464,2e92eb61-50d7-4add-994f-3b892ad7caeb,610779468,[505187]


In [4]:
import cv2

# Export one 1280x720 JPEG every SAMPLE_EVERY_SECONDS of video, named after
# the second it was taken at, and print a progress line every
# PROGRESS_EVERY_SECONDS.
FALLBACK_FPS = 60             # used when the container reports no frame rate
SAMPLE_EVERY_SECONDS = 5
PROGRESS_EVERY_SECONDS = 500

vidcap = cv2.VideoCapture('data/video.mp4')
try:
    # A capture that failed to open would otherwise produce no frames and
    # the cell would silently do nothing.
    if not vidcap.isOpened():
        raise IOError("could not open video file 'data/video.mp4'")

    # Frame counts are converted to seconds with the video's own frame rate
    # rather than assuming 60 fps; the rounded rate keeps the arithmetic
    # integral (e.g. 59.94 -> 60).
    reported_fps = vidcap.get(cv2.CAP_PROP_FPS)
    fps = max(1, int(round(reported_fps))) if reported_fps > 0 else FALLBACK_FPS

    count = 0
    # grab() advances to the next frame; the frame is only decoded with
    # retrieve() for the few frames that are actually written out.
    success = vidcap.grab()
    while success:
        if count % (fps * SAMPLE_EVERY_SECONDS) == 0:
            success, image = vidcap.retrieve()
            if not success:
                break
            image = cv2.resize(image, (1280, 720))
            out_path = f'data/images/{count // fps:05d}.jpg'
            # imwrite reports failure (e.g. missing output directory) only
            # through its return value.
            if not cv2.imwrite(out_path, image, [cv2.IMWRITE_JPEG_QUALITY, 85]):
                raise IOError(f'could not write image file {out_path!r}')
            if count % (fps * PROGRESS_EVERY_SECONDS) == 0:
                print(f'output frame for second: {count // fps}')
        success = vidcap.grab()
        count += 1
finally:
    # Always free the decoder/file handle, including on error or interrupt.
    vidcap.release()

output frame for second: 0
output frame for second: 500
output frame for second: 1000
output frame for second: 1500
output frame for second: 2000
output frame for second: 2500
output frame for second: 3000
output frame for second: 3500
output frame for second: 4000
output frame for second: 4500
output frame for second: 5000
output frame for second: 5500
output frame for second: 6000
output frame for second: 6500
output frame for second: 7000
output frame for second: 7500
output frame for second: 8000
output frame for second: 8500
output frame for second: 9000
output frame for second: 9500
output frame for second: 10000
output frame for second: 10500
output frame for second: 11000
output frame for second: 11500
output frame for second: 12000
output frame for second: 12500
output frame for second: 13000
output frame for second: 13500
output frame for second: 14000
output frame for second: 14500
output frame for second: 15000
output frame for second: 15500
output frame for second: 16000
o