In [197]:
from analysis import load_chat, load_highlights, remove_missing_matches, cut_same_length
import re
import numpy as np
import pandas as pd

from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


## Data Loading

In [99]:
def chat_hl_to_documents(ch_match, hl_match, frames_per_doc=210, hl_threshold=0.5, offset=0):
    # make non-overlapping windows of frames_per_doc/30 seconds
    # TODO: implement offset to adjust for comment lag
    chm = [ch_match[i:i+frames_per_doc] for i in range(0,len(ch_match),frames_per_doc)]
    hlm = [hl_match[i:i+frames_per_doc] for i in range(0,len(hl_match),frames_per_doc)]
    
    chm_docs = list()
    hlm_docs = list()
    
    for m_doc, h_doc in zip(chm, hlm):
        chm_docs.append(re.sub("\n+", " ", "".join(m_doc)).strip())
        
        # chm_docs.append("".join(m_doc))
        hlm_docs.append(int(sum(h_doc) >= hl_threshold)
    
    return chm_docs, hlm_docs

In [159]:
def shuffle_two_iters(a1, a2):
    # shuffles two iterables in the same way and turns them into numpy arrays
    shuffle_inds = list(range(len(a1)))
    np.random.default_rng(42).shuffle(shuffle_inds)
    
    ret1 = np.asarray(a1)[shuffle_inds]
    ret2 = np.asarray(a2)[shuffle_inds]
    return ret1, ret2

In [160]:
def load_train_val_docs(mode="train", shuffle=True, frames_per_doc=120, hl_threshold=0.5, offset=0):    
    if mode == "train":
        file_regex = "nalcs_w[134579]*_g[13]" # weeks 1,3,4,5,7,9 games 1 and 3 of each match for training
    if mode == "val":
        file_regex = "nalcs_w[268]*_g[13]" # weeks 2,6,8 games 1 and 3 of each match for validation

    chat = load_chat("data/final_data", load_random=5, random_state=42, file_identifier=file_regex)
    highlights = load_highlights("data/gt", file_identifier=file_regex)

    remove_missing_matches(chat, highlights)

    chat_docs = list()
    hl_docs = list()

    for match in chat.keys():
        ch_match, hl_match = cut_same_length(chat[match], highlights[match])
        cd, hd  = chat_hl_to_documents(ch_match, hl_match, frames_per_doc=frames_per_doc, hl_threshold=hl_threshold, offset=offset)
        # put all documents together
        chat_docs.extend(cd)
        hl_docs.extend(hd)

    # garbage collect chat and highlights    
    
    if shuffle:
        chat_docs, hl_docs = shuffle_two_iters(chat_docs, hl_docs)
        
    return chat_docs, hl_docs

In [161]:
ch_train, hl_train = load_train_val_docs(mode="train")

missing in highlights:	 {'nalcs_w5d2_C9_TMS_g1'}
missing in chat:	 {'nalcs_w4d3_CLG_P1_g3', 'nalcs_w3d2_C9_NV_g1', 'nalcs_w5d3_FLY_TSM_g3', 'nalcs_w7d2_NV_TSM_g3', 'nalcs_w5d2_CLG_IMT_g1', 'nalcs_w1d3_NV_CLG_g1', 'nalcs_w4d2_TL_IMT_g1', 'nalcs_w9d2_TL_TSM_g3', 'nalcs_w5d2_NV_FOX_g1', 'nalcs_w5d3_FLY_TSM_g1', 'nalcs_w7d1_TL_C9_g1', 'nalcs_w3d3_DIG_CLG_g1', 'nalcs_w9d2_TL_TSM_g1', 'nalcs_w1d3_TL_FLY_g3', 'nalcs_w9d3_CLG_NV_g3', 'nalcs_w9d3_CLG_NV_g1', 'nalcs_w3d2_TL_FOX_g1', 'nalcs_w7d1_FLY_P1_g1', 'nalcs_w9d2_FOX_NV_g1', 'nalcs_w9d2_C9_P1_g1', 'nalcs_w3d2_CLG_TSM_g1', 'nalcs_w1d2_CLG_TL_g1', 'nalcs_w1d2_DIG_P1_g3', 'nalcs_w9d2_IMT_CLG_g1', 'nalcs_w7d3_FOX_DIG_g3', 'nalcs_w4d3_FLY_C9_g3', 'nalcs_w5d3_P1_C9_g1', 'nalcs_w4d2_P1_FLY_g1', 'nalcs_w3d3_IMT_FLY_g1', 'nalcs_w1d2_FOX_IMT_g3', 'nalcs_w4d1_C9_TL_g3', 'nalcs_w4d3_CLG_P1_g1', 'nalcs_w3d2_FLY_DIG_g1', 'nalcs_w7d2_P1_FOX_g1', 'nalcs_w7d3_TL_NV_g3', 'nalcs_w1d3_TSM_IMT_g1', 'nalcs_w3d1_NV_TL_g1', 'nalcs_w7d2_CLG_DIG_g1', 'nalcs_w7d3_TL_

### Oversampling for balancing data

In [167]:
# over-sample
ch_pos = ch_train[hl_train == 1]
hl_pos = hl_train[hl_train == 1]
ch_neg = ch_train[hl_train == 0]
hl_neg = hl_train[hl_train == 0]

df_sample_numbers = pd.DataFrame({"step": "before_over_sampling", "total_train_samples": [len(hl_train)], "num_highlight_samples": [len(hl_pos)],  "num_non_highlight_samples": [len(hl_neg)]})
df_sample_numbers

Unnamed: 0,step,total_train_samples,num_highlight_samples,num_non_highlight_samples
0,before_over_sampling,2768,355,2413


In [168]:
hl_pos_rep = np.repeat(hl_pos, int(len(hl_neg) / len(hl_pos)))
ch_pos_rep = np.repeat(ch_pos, int(len(hl_neg) / len(hl_pos)))

hl_train_balanced = np.concatenate([hl_pos_rep, hl_neg])
ch_train_balanced = np.concatenate([ch_pos_rep, ch_neg])

ch_train_balanced_shuffled, hl_train_balanced_shuffled = shuffle_two_iters(ch_train_balanced, hl_train_balanced)

df_sample_numbers = df_sample_numbers.append({"step": "after_over_sampling", "total_train_samples": len(hl_train_balanced_shuffled), "num_highlight_samples": len(hl_pos_rep),  "num_non_highlight_samples": len(hl_neg)}, ignore_index=True)
df_sample_numbers

Unnamed: 0,step,total_train_samples,num_highlight_samples,num_non_highlight_samples
0,before_over_sampling,2768,355,2413
1,after_over_sampling,4543,2130,2413


In [133]:
ch_train[hl_train == 1][:10]

array(['FeelsBad YOUR TIME TO SHINE FeelsBadMan WHAT HAPPENED TO FLYQUEST fly feelsbad FREEQUEST LUL CLOWN FIESTA KreyGasm easy lmao DIG PogChamp NA SO BAD 4Head j4 mid no more Jebaited DIGNITAS DIGNITAS PogChamp DIG PogChamp PogChamp PogChamp PogChamp gg POG lose vs dig = disband LUL DIG PogChamp WutFace FAILED QUEST LUL fly trolling DIG PogChamp NV LUL PLOWED haHAA m DIG PogChamp (poolparty) WutFace nv 9>2 Kappa DIG PogChamp',
       'QUEM É BRASIL, DIGITE 1 NICOLE ANISTON Kreygasm LUL LUL WTF WTF LOLOL balls? SSUMDAY PogChamp sumday 1 v9 FLYQUEST LL ??????? ????? HELLO??!? WutFace ???? ???? DIG PogChamp altec? kleane Kreygasm PogChamp team? KARTHUS YOOOO DOG keaneW keaneW keaneW keaneW keaneW LUL triple TAUNT Keane PogChamp ????? Throw LUL DIG PogChamp FLY????????? KEANE PogChamp ??????? BALLS LUL FIESTA BALLS ? KEANE PogChamp DIG IN 2017 PogChamp PogChamp LUL DIG PogChamp DIG PogChamp DIG PogChamp report balls LUL D2 flyquest wtf PogChamp PogChamp PogChamp PogChamp PogChamp PogCham

## Testing text classification with TFIDF and NaiveBayes
https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

### In individual steps


In [183]:
count_vect = CountVectorizer() # TODO add parameters
X_train_counts = count_vect.fit_transform(ch_train)
#X_train_counts = count_vect.fit_transform(ch_train_balanced_shuffled)

X_train_counts.shape

(2768, 6608)

In [184]:
count_vect.vocabulary_.get(u'pogchamp')

4541

In [185]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2768, 6608)

In [186]:
# y_train = hl_train
y_train = hl_train_balanced_shuffled

In [187]:
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [188]:
# load validation data
ch_val, hl_val = load_train_val_docs(mode="val")

missing in highlights:	 {'nalcs_w6d2_FOX_C9_g1'}
missing in chat:	 {'nalcs_w2d3_FOX_FLY_g3', 'nalcs_w8d2_TSM_FLY_g1', 'nalcs_w2d2_IMT_C9_g1', 'nalcs_w8d2_NV_DIG_g1', 'nalcs_w8d2_C9_IMT_g3', 'nalcs_w2d2_TSM_TL_g3', 'nalcs_w6d1_IMT_TL_g1', 'nalcs_w8d2_NV_DIG_g3', 'nalcs_w2d2_DIG_FOX_g3', 'nalcs_w8d2_TL_P1_g1', 'nalcs_w6d1_P1_CLG_g3', 'nalcs_w2d3_C9_CLG_g3', 'nalcs_w2d1_TSM_DIG_g1', 'nalcs_w6d1_P1_CLG_g1', 'nalcs_w2d3_C9_CLG_g1', 'nalcs_w8d2_C9_IMT_g1', 'nalcs_w2d3_NV_IMT_g3', 'nalcs_w8d2_TSM_FLY_g3', 'nalcs_w6d2_FLY_NV_g3', 'nalcs_w2d2_P1_NV_g1', 'nalcs_w2d2_DIG_FOX_g1', 'nalcs_w2d1_FLY_CLG_g1', 'nalcs_w2d1_TSM_DIG_g3', 'nalcs_w6d3_C9_FLY_g1', 'nalcs_w8d1_DIG_TL_g1', 'nalcs_w8d1_FOX_CLG_g1', 'nalcs_w6d2_TSM_CLG_g1', 'nalcs_w2d2_TSM_TL_g1', 'nalcs_w6d3_FOX_TL_g3', 'nalcs_w8d3_CLG_FLY_g1', 'nalcs_w8d3_P1_IMT_g1', 'nalcs_w2d3_P1_TL_g1', 'nalcs_w8d3_NV_C9_g1', 'nalcs_w6d2_P1_DIG_g3', 'nalcs_w6d1_IMT_TL_g3', 'nalcs_w8d3_TSM_FOX_g1', 'nalcs_w6d3_DIG_TSM_g1', 'nalcs_w2d3_FOX_FLY_g1', 'nalcs_w6d

In [189]:
y_true = hl_val

In [196]:
docs_new = ch_val
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

y_pred = clf.predict(X_new_tfidf)

"""
for doc, pred, true in zip(docs_new, y_pred, y_true):
        if pred == true == 1:
             print(doc, pred, true)
"""
print(precision_recall_fscore_support(y_true, y_pred, average="binary"))
print(accuracy_score(y_true, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))


TypeError: 'numpy.float64' object is not iterable

### In a pipeline

In [193]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

In [None]:
# load validation data
ch_val, hl_val = load_train_val_docs(mode="val")


In [200]:
x_train = ch_train_balanced_shuffled
y_train = hl_train_balanced_shuffled

x_val = ch_val
y_val = hl_val

In [201]:
text_clf.fit(x_train, y_train)
y_pred = text_clf.predict(x_val)

print(precision_recall_fscore_support(y_val, y_pred, average="binary"))
print(accuracy_score(y_true, y_pred))

(0.2006079027355623, 0.3656509695290859, 0.25907752698724235, None)
0.7488356620093147
