In [1]:


import os
import re
import string
import glob
import nltk
import pickle
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
from nltk.corpus import stopwords
from collections import Counter
from nltk import word_tokenize
from simpletransformers.classification import ClassificationModel

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score



In [2]:
text_dir = "../data/raw/text"
labels_dir =  "../data/raw/labels"
audio_dir = "../data/raw/audio"



In [3]:
text_files = glob.glob(f"{text_dir}/*.txt")


def remove_stamps_str(line)->str:
    #clip_num = re.search('.+___\d\d?\d?___.+', line).group(0)
    stamp = re.search('.+___', line).group(0)
    new_line = line.strip(stamp)
    return new_line

In [4]:

## Pour python < 3.9, sinon str.removeprefix() de base
def removeprefix(self: str, prefix: str, /) -> str:
    if self.startswith(prefix):
        return self[len(prefix):]
    else:
        return self[:]



#Retire chaque stamps, chaque texte devien 1 seul str

def text_list_generator(files_list):
    text_list = []
    for filename in files_list:
        with open(file = filename, encoding = 'utf-8') as f:

            ##WINDOWS SPECIFIC
            if sys.platform == 'win32':
           # videoid = filename.removeprefix(text_dir + '\\').rstrip('.txt')
                videoid = removeprefix(filename, text_dir + '\\').rstrip('.txt')
            else :
                videoid = removeprefix(filename, text_dir + '/').rstrip('.txt')
            lines = f.readlines()
            for line_number, text_line in enumerate(lines):
                clean_line = remove_stamps_str(text_line)
                clip_id = videoid +'_'+ text_line.split('___')[1]
                #clip_id = videoid +'_' +str(line_number)
                yield (clip_id, clean_line.rstrip())

"""
corpus = []
text = []
text_list = []
for filename in text_files:
    text_str = ''
    with open(file=filename, encoding='utf-8') as f:
        lines = f.readlines()
        for count, line in enumerate(lines):
            clean_line = remove_stamps_str(line)
            text.append(clean_line)
    text_list.append(text)
    """

"\ncorpus = []\ntext = []\ntext_list = []\nfor filename in text_files:\n    text_str = ''\n    with open(file=filename, encoding='utf-8') as f:\n        lines = f.readlines()\n        for count, line in enumerate(lines):\n            clean_line = remove_stamps_str(line)\n            text.append(clean_line)\n    text_list.append(text)\n    "

In [5]:
stop_words = stopwords.words('English')

#création d'un dict pour lookup en O(1)
stopwords_dict = Counter(stop_words)


#Retire tous les timestamps en début de ligne, présents dans chaque transcript
def remove_stamps_str(line)->str:
    stamp = re.search('.+___', line).group(0)
    new_line = line.strip(stamp)
    return new_line

#Retire les charactères non-ascii 
def remove_nonascii(line)->str:
    ascii_line = line.encode(encoding = 'ascii', errors = 'ignore').decode()
    return ascii_line

#met tout en minuscules, retire les nombres et stopwords
def clean_stopwords_digits(line)->str:
    new_line = line.translate(str.maketrans('', '', string.punctuation))
    new_line = ' '.join([word.lower() for word in new_line.split() if (len(word) >=2 and word.isalpha() and word not in stopwords_dict)])
    return new_line



In [6]:
label_df = pd.read_csv("../data/interim/labels/labels.csv")
display(label_df)

Unnamed: 0,id,anger,disgust,fear,happiness,sadness,surprise,sentiment
0,--qXJuDtHPw_5,0,0,0,1,0,0,1
1,-3g5yACwYnA_10,0,0,0,0,0,0,1
2,-3g5yACwYnA_13,0,0,0,0,0,0,1
3,-3g5yACwYnA_2,0,0,0,1,0,0,0
4,-3g5yACwYnA_3,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
21813,zwTrXwi54us_6,0,0,0,0,0,0,0
21814,zwTrXwi54us_7,0,0,0,0,0,0,0
21815,zwTrXwi54us_8,0,0,0,0,0,0,0
21816,zwTrXwi54us_9,0,0,0,0,0,0,1


In [7]:
corpus = (text for text in text_list_generator(text_files))
df_text = pd.DataFrame(corpus)
df_text.columns = ['id', 'text']

In [8]:
display(df_text.head(5))

Unnamed: 0,id,text
0,--qXJuDtHPw_0,I see that there are three category of writers.
1,--qXJuDtHPw_1,"I define them as being an author, a writer, an..."
2,--qXJuDtHPw_2,"An author, I like to classify as somebody who ..."
3,--qXJuDtHPw_3,These are the well-known authors of our time a...
4,--qXJuDtHPw_4,"Then, there is the writer."


In [9]:

df_text['text'] =  df_text.text.apply(remove_nonascii)


df_text['clean_text'] = df_text.text.apply(lambda s : clean_stopwords_digits(s))
df_text['tokens'] = df_text.clean_text.apply(lambda x : x.split(' '))

#df_text['lines'] = df_text.groupby('unique_id')
df_text['count'] = df_text['text'].apply(lambda x : len(x.split(' ')))




display(df_text)

Unnamed: 0,id,text,clean_text,tokens,count
0,--qXJuDtHPw_0,I see that there are three category of writers.,see three category writers,"[see, three, category, writers]",9
1,--qXJuDtHPw_1,"I define them as being an author, a writer, an...",define author writer story teller,"[define, author, writer, story, teller]",13
2,--qXJuDtHPw_2,"An author, I like to classify as somebody who ...",an author like classify somebody writes great ...,"[an, author, like, classify, somebody, writes,...",14
3,--qXJuDtHPw_3,These are the well-known authors of our time a...,these wellknown authors time past,"[these, wellknown, authors, time, past]",11
4,--qXJuDtHPw_4,"Then, there is the writer.",then writer,"[then, writer]",5
...,...,...,...,...,...
44972,_ZlF5Q9W1DM_1,Clearly because everything is data driven toda...,clearly everything data driven today measurabl...,"[clearly, everything, data, driven, today, mea...",17
44973,_ZlF5Q9W1DM_2,"But the conceptual drivers, what sort of drive...",but conceptual drivers sort drives passions th...,"[but, conceptual, drivers, sort, drives, passi...",28
44974,_ZlF5Q9W1DM_3,So those two things actually I think rub up ag...,so two things actually think rub collide reall...,"[so, two, things, actually, think, rub, collid...",21
44975,_ZlF5Q9W1DM_4,So the idea of numbers telling the story up ag...,so idea numbers telling story idea explaining ...,"[so, idea, numbers, telling, story, idea, expl...",40


In [10]:
#frame = frame[~frame[target_features].sum(axis = 1) == 0]

In [11]:
frame = pd.merge(df_text, label_df, on = 'id', how = 'inner')
frame

Unnamed: 0,id,text,clean_text,tokens,count,anger,disgust,fear,happiness,sadness,surprise,sentiment
0,--qXJuDtHPw_5,I see that a writer is somebody who has an inc...,see writer somebody incredible command mechani...,"[see, writer, somebody, incredible, command, m...",18,0,0,0,1,0,0,1
1,-3g5yACwYnA_2,Key Polymer brings a technical aspect to our o...,key polymer brings technical aspect operation ...,"[key, polymer, brings, technical, aspect, oper...",14,0,0,0,1,0,0,0
2,-3g5yACwYnA_3,We're a huge user of adhesives for our operati...,were huge user adhesives operation called floc...,"[were, huge, user, adhesives, operation, calle...",29,0,0,0,0,0,0,0
3,-3g5yACwYnA_4,Key brings those types of aspects to a busines...,key brings types aspects business new markets ...,"[key, brings, types, aspects, business, new, m...",34,0,0,0,0,0,0,0
4,-3g5yACwYnA_9,We have many new opportunities through the way...,we many new opportunities way things changed y...,"[we, many, new, opportunities, way, things, ch...",21,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
21593,_xjBqNfiEY4_6,"And once again, U of I students are so well pr...",and students well prepared asset student,"[and, students, well, prepared, asset, student]",24,0,0,0,0,0,0,0
21594,_XWUKMkxa-Q_4,So its become more of an iterative process wit...,so become iterative process fast results fast ...,"[so, become, iterative, process, fast, results...",13,0,0,0,1,0,0,1
21595,_YzIvwvyIq4_5,"Secondly, using social, you know, things like ...",secondly using social know things like twitter...,"[secondly, using, social, know, things, like, ...",30,0,0,0,1,0,0,1
21596,_ZlF5Q9W1DM_0,JOHN GERZEMA: When I think about finding insig...,john gerzema when think finding insights consu...,"[john, gerzema, when, think, finding, insights...",24,0,0,0,0,0,0,0


In [12]:
target_emotions = ['anger', 'disgust', 'fear','happiness', 'sadness','surprise']

In [13]:
frame['len'] = frame.clean_text.apply(lambda x : len(x.split()))


In [14]:
frame = frame[frame[target_emotions].sum(axis=1) >0]

frame = frame[~(frame['len'] > 30)]


Positive_negative_frame : split all emotions into just 2 values, positive and negative.

happiness + surprise = positive

anger, disgust, fear, sadness = negative

In [15]:
positive_negative_frame = frame.copy()
positive_negative_frame = positive_negative_frame[positive_negative_frame[target_emotions].sum(axis=1) >= 1]

positive_negative_frame = positive_negative_frame.drop(columns=['text','tokens', 'count', 'len', 'sentiment'])


In [16]:
def pos_or_neg(row):
    if row['happiness'] or row['surprise']:
        return 1
    else :
        return 0

In [17]:
positive_negative_frame['pos'] = positive_negative_frame[target_emotions].apply(lambda row : pos_or_neg(row), axis= 1)

In [18]:
positive_negative_frame['pos'].value_counts()

1    6304
0    2340
Name: pos, dtype: int64

In [43]:
positive_negative_frame = positive_negative_frame.groupby(by=['pos']).sample(1600, random_state = 28)

In [45]:
#nltk.download('averaged_perceptron_tagger')
#df_text['tagged'] = df_text['tokens'].apply(nltk.tag.pos_tag )

In [46]:
"""
data = tts(frame.clean_text, frame[target_emotions], test_size = 0.2)
X_train, X_test, y_train, y_test = data
"""

'\ndata = tts(frame.clean_text, frame[target_emotions], test_size = 0.2)\nX_train, X_test, y_train, y_test = data\n'

Bag of words

In [47]:
"""

count_vector = CountVectorizer(lowercase=False, ngram_range=(1,3), max_features=250)
count_vector.fit(frame.clean_text)
counts = count_vector.transform(frame.clean_text)

"""

'\n\ncount_vector = CountVectorizer(lowercase=False, ngram_range=(1,3), max_features=250)\ncount_vector.fit(frame.clean_text)\ncounts = count_vector.transform(frame.clean_text)\n\n'

#### TFIDF

In [48]:
"""
tfidf_vector = TfidfVectorizer(lowercase=False, ngram_range=(1,3), max_features = 250)
tfidf_vector.fit(data[0], data[2])
tfidfs = tfidf_vector.transform(frame.clean_text)

"""

'\ntfidf_vector = TfidfVectorizer(lowercase=False, ngram_range=(1,3), max_features = 250)\ntfidf_vector.fit(data[0], data[2])\ntfidfs = tfidf_vector.transform(frame.clean_text)\n\n'

In [49]:
"""
nb_pipeline = Pipeline([
    ('BoW', count_vector),
    ('classifier', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior = None)))
])
for feature in target_emotions:
    print(f'Fitting for {feature}')
    nb_pipeline.fit(X_train, y_train[feature])
    preds = nb_pipeline.predict(X_test)
    print(f'Test accuracy : {accuracy_score(y_test[feature], preds)}')


mod1 = OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior = None))

mod1.fit(CountVectorizer.transform(X_train), y_train['happiness'])


"""

"\nnb_pipeline = Pipeline([\n    ('BoW', count_vector),\n    ('classifier', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior = None)))\n])\nfor feature in target_emotions:\n    print(f'Fitting for {feature}')\n    nb_pipeline.fit(X_train, y_train[feature])\n    preds = nb_pipeline.predict(X_test)\n    print(f'Test accuracy : {accuracy_score(y_test[feature], preds)}')\n\n\nmod1 = OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior = None))\n\nmod1.fit(CountVectorizer.transform(X_train), y_train['happiness'])\n\n\n"

In [50]:
positive_negative_frame

Unnamed: 0,id,clean_text,anger,disgust,fear,happiness,sadness,surprise,pos
13788,JX-mwjSw0dk_0,work office lot paperwork administrative work ...,0,0,0,0,1,0,0
11190,EO_5o9Gup6g_8,will congress convene lameduck session winter ...,1,0,0,0,0,0,0
5468,28006_14,its horrible stupid movie,1,1,0,0,0,0,0
18053,SZRxuS6fn9s_2,first people id like first address,1,1,0,0,0,0,0
11941,g8Cl74tS3oY_3,its true federal support part funding necessar...,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
15627,nw6Kf3AtCz4_26,be sure get questions using askfirebase social...,0,0,0,1,0,0,1
12039,gC_wF3uKFW0_4,so call us see wonderful,0,0,0,1,0,0,1
12824,hThsntvCk2s_8,eck soon check often look forward hearing,0,0,0,1,0,0,1
5754,298774_5,and impressed film,0,0,0,1,0,0,1


In [51]:
X_train, X_test, y_train, y_test = tts(positive_negative_frame.clean_text.values, positive_negative_frame['pos'].values, test_size = 0.2)


# BERT MODEL

In [52]:
import tensorflow as tf
from transformers import BertTokenizerFast, TFBertModel

In [53]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [54]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [55]:
MAX_LEN = 60

def tokenize(data,max_len=MAX_LEN) :
    input_ids = []
    attention_masks = []
    for i in range(len(data)):
        encoded = tokenizer.encode_plus(
            data[i],
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)


In [56]:
train_ids, train_masks = tokenize(X_train)
test_ids, test_masks = tokenize(X_test)

In [73]:
def create_model(bert_model, max_len=MAX_LEN):
    
    ##params###
    opt = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-7)
    loss = tf.keras.losses.binary_crossentropy

    input_ids = tf.keras.Input(shape=(max_len,),dtype='int32')
    
    attention_masks = tf.keras.Input(shape=(max_len,),dtype='int32')
    
    embeddings = bert_model([input_ids,attention_masks])[1]
    
    output = tf.keras.layers.Dense(1, activation="sigmoid")(embeddings)
    
    model = tf.keras.models.Model(inputs = [input_ids,attention_masks], outputs = output)
    
    model.compile(opt, loss=loss, metrics=['binary_accuracy'])
    
    
    return model

In [74]:
model = create_model(bert_model, MAX_LEN)
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_9 (InputLayer)           [(None, 60)]         0           []                               
                                                                                                  
 input_10 (InputLayer)          [(None, 60)]         0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  109482240   ['input_9[0][0]',                
                                thPoolingAndCrossAt               'input_10[0][0]']               
                                tentions(last_hidde                                               
                                n_state=(None, 60,                                          

In [97]:
max(bert_results)

array([0.9908405], dtype=float32)

In [75]:
history_bert = model.fit([train_ids,train_masks], y_train, validation_data=([test_ids,test_masks], y_test), epochs=3, batch_size=32)



Epoch 1/3
Epoch 2/3
Epoch 3/3


In [77]:
bert_results = model.predict([test_ids, test_masks])











In [None]:
B = np.where(A > 0.5, 1, 0)

In [94]:
bert_results_enc = np.where(bert_results > 0.5, 1, 0)

bert_results_enc

array([[1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
    

In [None]:
from sklearn.metrics import confusion_matrix

In [95]:
confusion_matrix(y_true= y_test, y_pred=bert_results_enc)

array([[225,  92],
       [ 55, 268]], dtype=int64)