<a href="https://colab.research.google.com/github/ojw92/NLP-for-Text-Classification/blob/main/BERT_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
#@title Preprocessing (not needed for BERT)

# preprocessor.py

# Preprocessing steps for RNN & CNN

# Model for S22 VOC only, but with added preprocessing steps

# https://www.geeksforgeeks.org/python-call-function-from-another-file/

from timeit import default_timer as timer

import pandas as pd
import numpy as np
import re
import nltk
nltk.download('wordnet')    # for lemmatization function; don't need this line if package installed
nltk.download('omw-1.4')    # for lemmatization function; don't need this line if package installed
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer     # based on The Porter Stemming Algorithm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datetime import datetime, timedelta
import os
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


# https://towardsdatascience.com/an-easy-tutorial-about-sentiment-analysis-with-deep-learning-and-keras-2bf52b9cba91

# 0 - split & prepare data for preprocessing                              <- add a preliminary stopword removal step (RNN & CNN)?
# 0.5 - remove duplicate data
# 1 - remove hyperlinks, emails & pricetags
# 2 - remove emoji
# 3 - remove punctuations
# 4 - apply lower case

# ===== use min-df to find low-frequency words =====

# 5 - fix misspellings                                                    <- (need to implement!)
# 6 - replace words with generic nouns                                    <- (need to implement!)
# 7 - remove stopwords       # better to replace then remove?             <- final stopword clean-up step (separate list for RNN & CNN)
# 8 - lemmatize
# (8.5 - stemming)

# 9 - tokenization & padding

# ============================================================================================================= 0                 <- add a preliminary stopword removal step

def data_split(df22, R_known=True):

    # Split the input dataframe into Text (Title + Contents) and Classes dataframes
    # input must have 3 columns of string entries (Title, Contents, and Classes)

    
    # print(df22.describe())     # get an overview of the dataset

    # check if every row entry of each column is string type (some are NaN, so no)
    # print(df22.applymap(lambda x : type(x).__name__).eq({'Title': 'str', 'Content': 'str', 'Class':'str'}))

    # convert NaN to empty strings (NaN -> str)
        # df22.apply(str) converts all columns to str, as well
    df22 = df22.replace(float('nan'), '', regex=True)

    # concatenate strings of title & content with a " " in between (1 body of text)
    text22 = df22['Title'] + " " + df22['Content']      # slicing DataFrame via .iloc[:,0] makes it a Series
    text22 = pd.DataFrame(text22, columns= ['Text'])    # so initialize it as a DataFrame. pd.DataFrame(some_Series) works
    classes22 = df22['Class']

    if R_known == True:
    # R, r, YR = 1;     N, n, YN = 0
        R_cases = re.compile('R|YR', re.IGNORECASE)
        N_cases = re.compile('N|YN', re.IGNORECASE)
        classes22 = classes22.replace(to_replace=R_cases, value=1)
        classes22 = classes22.replace(to_replace=N_cases, value=0)
    else:
        # R_known == False; prepping not yet classified data
        Y_N_cases = re.compile('Y|N', re.IGNORECASE)
        classes22 = classes22.replace(to_replace=Y_N_cases, value=0)     # all N's for simplicity

    classes22 = pd.DataFrame(classes22, columns=['Class']).astype('int32')
    # classes22.columns = ['Class']    # classes22=pd.DataFrame(classes22) causes an error cus   pd.DataFrame(some_DataFrame) makes no sense - should pass a list

    #print(text22.head(10))
    #print(classes22.head(10))
    print('==================================================')

    return text22, classes22



# ============================================================================================================= 0.5

def drop_dupe_text(text22, classes22):

    # df22.drop_duplicates() will remove extra rows that have the same values in all columns (redundant)
    # As a result only duplicate rows left are pairs of datapoints with 0 & 1 as their labels
    # This function removes all of rows from those pairs with 0 as their label, as they can be disruptive
    # in training the model. Label 1 should take priority

    textdupeindex = text22[text22.duplicated(keep=False)==True].index
    dupeclasses = classes22.filter(items = textdupeindex, axis=0)
    redundantclass0 = dupeclasses[dupeclasses.Class==0]
    text22.drop(index=redundantclass0.index, inplace=True)
    classes22.drop(index=redundantclass0.index, inplace=True)

    return text22, classes22



# ============================================================================================================= 1

def hep_remove(text22):

    # Remove hyperlinks, e-mails, and pricetags from a dataframe of strings
    ### It would be nice to instead of removing them, replace e-mails and hyperlinks with "url"
    ### and replace price tags with "price"

    # algorithm
    # regex that matches all non-whitespace text before and after '.com', '.org', '.gov', '.edu'
    # "" before and after '@', but the above line should take care of this automatically
    # "" after '$£€₱₽¥₩'; number, comma, and period should suffice, but account for pricetags written in text

    # list of hyperlinks, e-mails, and price tags
    # use this step specifically to remove words that have punctuation mixed in
    heplist = ['.com','.edu','.org','.gov', '.co', 'https:', 'http:',
                '$','£','€','₱','₽','¥','₩', ',000', '.00']   # include '-'? need to think
    # remove hyperlinks (& e-mails, as side effect!)
    text22 = [' '.join(y for y in x.split() if not any(ele in y for ele in heplist)) 
                for x in text22.Text]
    text22 = pd.DataFrame(text22, columns= ['Text'])

    return text22



# ============================================================================================================= 2

def emoji_remove(text22):

    # Remove image & textual imoji

    # Remove emoji
    # https://towardsdatascience.com/text-preprocessing-for-data-scientist-3d2419c8199d
    def emoji_rem(string):
        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', string)
    
    text22.Text = text22['Text'].apply(emoji_rem)
    # text22 = list(map(emoji_rem, text22))     # for a list, map the function

    """
    # Remove texual emoji and other colon  (":fire:", ":hundred_points",)
    def textualemoji(string):
        return re.sub(r'[^:\w:$]', ' ', string)     # starts & ends with :, matches text, number and _ between :'s

    text22.Text = text22['Text'].apply(textualemoji)
    """
    # Too many 'letter emojis' - need to use regex
    """letteremoji = ['Q_Q','QQ','T_T','UWU','uwu','UwU','uWu', 'orz','OTL',
                   'o_o','O_O','o_O','O_o', 'o-o','O-O', '==','=_=','-_-','--',
                   '*_*','$_$','@_@','?_?','+_+','>_>','<_<', '~_~', "'-'",
                   ':x',';x', ':D',';D','D:','D;', ':)',';)', ':(',';(', '(:','(;','):',');',
                   ':s',':S']"""
    
    #text22 = [' '.join(y for y in x.split() if not any(ele in y for ele in letteremoji)) 
    #            for x in text22]
    # text22 = pd.DataFrame(text22, columns=['Text'])
    
    return text22

# ===================== Implement regex in letteremoji later ========================



# ============================================================================================================= 3

def punc_remove(text22):
    
    # Remove punctuation
    
    # using RegexpTokenizer might lose words like "Mr."
    # need to remove just the punctuation, not the whole word attached to it!
    # what about words like, 'work-life-balance' ?

    # double backslash for a string of single backslash
    # punclist = ['!','@','#','$','%','^','&','*','(',')','-','_','=','+','\\','|','`','~',',','.','<','>','/','?',';',':','"',"'"]


    def puncrem(string):
        return re.sub(r'[^\w\s]', '', string)

    text22 = text22['Text'].apply(puncrem)
    text22 = pd.DataFrame(text22, columns= ['Text'])

    return text22

# If we get rid of punctuation in things like "it's" or ":S", we're gonna get
# single letter words or misspelled words (it's -> its)
# for CNN, we can just remove them, but RNN we need a fix...



# ============================================================================================================= 4

def lower_case(text22):

    # Take a data frame of strings and make them lower case characters
    
    # First replace any NaNs generated from other preprocessing steps with empty strings
    text22 = text22.replace(np.nan, '', regex=True)

    # Apply lower case
    text22 = text22['Text'].str.lower()                 # Series
    text22 = pd.DataFrame(text22, columns= ['Text'])

    return text22



# ============================================================================================================= 5               <- (need to implement!)

def fix_spelling(text22):
    
    # Take a data frame of strings and fix all misspelled words

    
    # logic here



    text22 = pd.DataFrame(text22, columns= ['Text'])

    return text22




# ============================================================================================================= 6               <- (need to implement!)

def pn_replace(text22):

    # Take a data frame of strings and replace proper nouns or certain nouns with a common, generic noun

    # Texas, New York City, NYC, LA, Los Angeles, NJ --> usloc
    # China, Hong Kong, New Zealand, UK, Canada, India --> nonusloc
    # Max, Mark, Johnny, Adam, Obama, Kanye --> person
    # bees, duck, my dog, dog, cat, falcon, eagle --> animal      (for a stopword list, make sure "my dog" appears before "dog"
    # he, she, their, us, it pronouns --> pronoun
    # use this for ideas https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/
    # exynos vs snapdragon

    #querywords = query.split()
    #print(querywords)
    #resultwords  = [word for word in querywords if word.lower() not in stopwords]
    #result = ' '.join(resultwords)


    text22 = pd.DataFrame(text22, columns= ['Text'])

    return text22
    


# ============================================================================================================= 7

# Stopword will need a lot of work! Expect performance to be bad/worse with stopwords
# For the low-freq words the stopwords can't catch, use min-df to clean up

def stopword_remove(text22, model="RNN"):

    # Take a data frame of strings and remove all preselected stopwords from them

    # Remove stopwords (including my choice of words)
    # may need to keep words like "not", "doesn't", "does"
    #from nltk.corpus import stopwords
    #nltk.download('stopwords')     # don't need this line if package installed
    #stop = stopwords.words('english')
    """i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself,
    yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their,
    theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, 
    being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of,
    at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down,
    in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each,
    few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don,
    don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn,
    doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mustn't, needn,
    needn't, shan, shan't, shouldn, shouldn't, wasn, wasn't, weren, weren't, won, won't, wouldn, wouldn't 
    """
    # lots of words in nltk stopwords list have significance in VOC context, so make my own list:
    # if using CNN, use stopwords. If using LSTM(RNN), may need to think about it. lemmatizing LSTM can get tricky too
    """
    stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
            'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
            'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
            'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
            'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
            'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
            'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
            'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
            'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
            'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'will', 'just',
            "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 
            'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't",
            'weren', "weren't"]
    """
    # use this basic list for now - study data closely and add more later
    if model == "RNN":
        # to/from, up/down, in/out opposite meanings -> significant? "be" verb not significant
        # who/what/when/where/why/how, which, only
        stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
            'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
            'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
            'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
            'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if',
            'or', 'as', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between',
            'into', 'through', 'during',
            'to', 'from', 'in', 'out',     # to/from, in/out,      on/off, up/down, above/below, over/under
            'again', 'further', 'then', 'once', 'here', 'there',
            'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
            'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'will', 'just',
            'd', 'll', 'm', 'o', 're', 've', 'y', 'ma']
    elif model == "CNN":
        stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
            'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
            'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
            'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
            'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
            'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between',
            'into', 'through', 'during', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
            'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
            'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
            'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'will', 'just',
            'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'ma']
    
    # create my own list of stopwords and concatenate to existing list of 179 NLTK stopwords
    my_stopwords = ['lol', 'lmao', 'lmfao', 'rofl', 'wtf','fuck','shit','jesus','christ','wth','what the fuck',
                    'what the hell','samsung','apple','lg','xiaomi',
                    'cus','cuz',
                    '0','1','2','3','4','5','6','7','8','9',
                    'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',
                    'hi','hey','hello','hiya','greetings','all','sincerely','bye','farewell',
                    'thanks','thank','appreciate','think','believe',
                    'probably','almost','likely', 'sometimes','frequently','occasionally','gradually','occasion','gradual',
                    'expensive','cheap','price','pricey',
                    'really'
                    ]
    # concatenate RNN/CNN stopwords with my stopwords
    stop = [*stop, *my_stopwords]

    #text22 = text22.apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop)])) # joins all rows for some reason
    removeit = np.vectorize(lambda x: ' '.join([word for word in str(x).split() if word not in (stop)]))
    text22 = pd.DataFrame(removeit(text22), columns= ['Text'])

    return text22



# ============================================================================================================= 8

# Lemmatization - 'caring' --> 'care'

def lemmatize_df(text22):

    # Take a data frame of strings and lemmatize its texts
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    wnl = WordNetLemmatizer()   # plural --> singular, but verb tense unchanged

    # This is my own, long (high-CPU-usage?) way to tokenize & lemmatize words!
    tempdf = pd.DataFrame(columns=['Text'])
    for i in text22['Text']:
        jol = []
        for j in w_tokenizer.tokenize(i):
            jol.append(wnl.lemmatize(j))
        tempdf.loc[len(tempdf.index)] = [jol]
        # text22['Text'].loc[len(text22.index)] = [jol]   # find a way to append directly to existing df and trim it
    # print(tempdf.head(10))
    # print("length of tempdf: " , len(tempdf))

    text22 = pd.DataFrame(tempdf, columns=['Text'])

    return text22

# ===== A much shorter way to lemmatize! (more CPU-efficient?) =====
"""
def lemmatize_text(some_string):
    return [wnl.lemmatize(w) for w in w_tokenizer.tokenize(some_string)]
df22['cont_sw_lem'] = df22['cont_sw'].apply(lemmatize_text)    # .apply works on DataFrame or Series
print('\nAfter lower case & lemmatization: \n')
print(df22.head())
"""



# ============================================================================================================= 8.5

# Stemming - 'caring' --> 'car'
# No need to use both lemmatization & stemming - just use lemmatization

def stemming_df(text22):

    # Take a data frame of strings and do stemming on its texts
    
    # nature, natures, natural --> natur
    snowball_stemmer = SnowballStemmer('english')
    # kol = []     # This is my way of stemming on one Series
    # for i in text22['Text']:
    #     kol.append([snowball_stemmer.stem(word) for word in i])
    # stemmed_word = [snowball_stemmer.stem(word) for word in word_tokens]
    # text22['Text'] = kol
    
    def stemit(some_string):
        return [snowball_stemmer.stem(word) for word in some_string]
    
    text22 = text22.Text.apply(stemit)
    text22 = pd.DataFrame(text22, columns= ['Text'])

    # text22['Text'] = text22.Text.apply(stemit)

    #print(text22['Text'][2715])

    return text22






# ============================================================================================================= 9


# Word tokenization

def tokenize_pad_words(text22, min_df=0, max_xref=0):
    
    # Take a data frame of strings to tokenize its words, then assign an index to each word

    #      tokenization
    # =========================
    # https://www.kaggle.com/code/hamishdickson/using-keras-oov-tokens
    # https://stackoverflow.com/questions/49073673/include-punctuation-in-keras-tokenizer
        # should I include punctuations? Only like 30 more features...
    token22 = Tokenizer(num_words=11000, lower=True, oov_token='OOV')  #20221012 data 10920 words w freq>2
    token22.fit_on_texts(text22['Text'])
    print('\nIndex determined by word freq(hi->lo) : \n', token22.word_index)    # Dict object - index determined by word frequency highest-lowest

    # Convert each voc entry to list of tokenized words, then find longest list (most words in entry)
    x22 = token22.texts_to_sequences(text22['Text'])
    max_x22 = max(len(elem) for elem in x22)
    print('\nVOC entries represented as tokenized indices : \n', x22[0:2])      # List of lists - too long to print in full
    print('\nFrequency of each word : \n', token22.word_counts)                 # Dict
    print('\nNumber of sentences each word appears in : \n', sorted(token22.word_docs.items(), key=lambda z: z[1], reverse=True))  # [('the', 3632), ('i', 3010), ('to', 2989), ...]
        # sorted(token22.word_docs.items())    [('0': 17), ('00': 1), ('000a': 1), ...] 
    print('\nTotal number of VOC : \n', token22.document_count)
    print('\nLongest VOC entry has %d (tokenized) words.' % max_x22)

    word_size22 = len(token22.word_index) + 1


    # Take a list of lists of tokenized words and do padding on them

    #      padding
    # =========================
    # each VOC has different number of words, so x has a list of lists, which is why to_categorical(x) caused error
    # normalize every voc to vectors of same length - max_x
    if max_x22 >= max_xref:
        max_x22 = max_x22
    else:
        max_x22 = max_xref
    padded_x22 = pad_sequences(x22 , max_x22)
    print('\nPadded results : \n', padded_x22)
    print('type of padded_x22 is array of lists/matrix?  ', type(padded_x22))            # <---------- delete this later after finding out what it is
    print('Length of padded_x22: ', len(padded_x22))
    print('word_size22: ', word_size22, ' (number of unique words + 1)')
    print('max_x22: ', max_x22)


    # Feature to add later - minimum word frequency to simplify our tokenized list
    # Simple line below can filter by word count > 1. But need to apply to padded_x22 & max_x22, too
        # For now, use Tokenizer(num_words=11000) since 20221012 data has 10920 words with frequency > 2
    # mindf1 = {k:v for (k,v) in token22.word_counts.items() if v > 1}
    # print(len(mindf1))


    # print(padded_x22.dtype)    # int32
    # print(classes22.dtype)    # int64 


    return padded_x22, word_size22, max_x22



# ============================================================================================================= 9


# Visualizing Word2Vec Embeddings with t-SNE
import gensim
from sklearn.manifold import TSNE
import random

def semantic_similarity(X_train):

    # X_train should be a single-column DataFrame of preprocessed Title + Content (text22 or df22['Text'])
    X_train = X_train.apply(lambda x: gensim.utils.simple_preprocess(str(x)))
    # Train the word2vec model
    w2v_model = gensim.models.Word2Vec(X_train,
                                    vector_size=100,   # size of vectors desired
                                    window=5,  # # of words before & after target word to use as context
                                    min_count=2        # min_df
                                    )
    w2v_model.build_vocab(X_train)  # prepare the model vocabulary
    w2v_model.train(X_train, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)
    vocab_size, embedding_size = w2v_model.wv.vectors.shape

    n_samples = 250
    # Sample random words from model dictionary
    random_i = random.sample(range(vocab_size), n_samples)
    # def token2word(token):
    #     return w2v_model.wv.index_to_key[token]
    random_w = [w2v_model.wv.index_to_key[i] for i in random_i]

    # Generate Word2Vec embeddings of each word
    word_vecs = np.array([w2v_model.wv[w] for w in random_w])   # 'Word2Vec' object not subscriptable; use .wv

    # Apply t-SNE to Word2Vec embeddings, reducing to 2 dims
    tsne = TSNE()
    tsne_e = tsne.fit_transform(word_vecs)

    # Plot t-SNE result
    plt.figure(figsize=(32, 32))
    plt.scatter(tsne_e[:, 0], tsne_e[:, 1], marker='o', c=range(len(random_w)), cmap=plt.get_cmap('Spectral'))

    for label, x, y, in zip(random_w, tsne_e[:, 0], tsne_e[:, 1]):
        plt.annotate(label,
                    xy=(x, y), xytext=(0, 15),
                    textcoords='offset points', ha='right', va='bottom',
                    bbox=dict(boxstyle='round, pad=0.2', fc='yellow', alpha=0.1))



# ============================================================================================================= 9


import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
def scatterplot_VOC(x, y, number_of_R, threshold, filename=None, directory=None):
    # x = array of len(y)                      y = array of prediction probabilities
    # number_of_R = number of actual R + 1     threshold = prediction threshold in float
    plt.figure()
    if filename is None:
        filename = ""
    plt.title(f'{filename[10:-4]} Scatter Plot')     # test_data/test_1012.csv --> test_1012
    plt.scatter(x, y_prob3, c=(np.where(x<number_of_R+1,'g', 'r')))      # plots VOC (R & N)
    plt.plot([0, len(y)],[threshold, threshold], c='k', linestyle='--')  # plots threshold

    ###handles, labels = plt.gca().get_legend_handles_labels()    # *use this to add new to existing handle
    R_points = mpatches.Patch(color='g', label='R')
    N_points = mpatches.Patch(color='r', label='N')
    threshline = Line2D([0], [0], color='k', linestyle='--', label='threshold')
    ###handles.extend([R_points, N_points, threshline])           # *also plt.legend(handles=handles)

    plt.legend(handles=[R_points, N_points, threshline], loc="best")
    plt.xlabel('VOC #')
    plt.ylabel('Predictions')

    if directory is not None:
        f = os.path.join(directory, filename[10:-4])
        plt.savefig(f'{f}-scatter.png')



# ============================================================================================================= 9


def add_category(text22):
    # Look for keywords in text that indicate issue category and input 1 (yes) for that category
    # vectorize this function on consolidated text (Title + Content) prior to text cleaning steps

    # 20221107 For now, add only 5 major categories that comprise top 10 issue category in raw data:
    # 3rd Party, Display, Battery, HW, Camera, Notification, Connectivity, Messages, Biometrics, Audio, Bluetooth
            # note that Biometrics is mostly watch issues & Audio is mostly buds issues
            # Checked each word from raw data file & only added if several rows existed

    thirdparty_list = ['facebook', 'snapchat', 'youtube', 'instagram', 'reddit', 'amazon prime', 'cod',
                        'fb', 'spotify', 'netflix', 'zoom', 'discord', 'tik', 'whatsapp', 'twitter',
                        'genshin', 'game', 'dropbox', 'onedrive', 'twitch']
    display_list = ['display', 'screen', 'crack', 'scratch', 'protector', 'hz', 'scroll', 'touch', 'hdr',
                    'refresh rate', 'flicker', 'pixel', 'burn-in', 'burn in', 'tint', 'jump']
    battery_list = ['SoT', 'battery', 'usage', 'consum', 'drain', 'lasts', 'lasting', 'dies']
    camera_list = ['camera', 'shot', 'focus', 'blur', 'astrophotography', 'photo', 'video', 'saturat',
                    'shutter', 'selfie', 'record', 'lens', 'ultrawide', 'ultra wide', 'flash', 'slow-mo']
    noti_list = ['notif', 'vibrat', 'incoming', 'pop-up']
    connect_list = ['connect','network','mobile data','hotspot','esim','sim card','5g','4g','3g',
                    'signal', 'speed', 'internet', 'cellular', 'dual', 'reception', 'coverage']
        # 'data', 'service', 'sim' might catch wrong VOC
    messages_list = ['text', 'messag', 'whatsapp', 'RCS', 'MMS', 'chat', 'send', 'WhatsApp']


    text22['thirdparty'] = text22['Text'].str.contains('|'.join(thirdparty_list), case=False)
    text22['display'] = text22['Text'].str.contains('|'.join(display_list), case=False)
    text22['battery'] = text22['Text'].str.contains('|'.join(battery_list), case=False)
    # add for HW (what are S22 HW topics that span 70+% of S22 HW VOC?)
    text22['camera'] = text22['Text'].str.contains('|'.join(camera_list), case=False)
    text22['notification'] = text22['Text'].str.contains('|'.join(noti_list), case=False)
    text22['connectivity'] = text22['Text'].str.contains('|'.join(connect_list), case=False)
    text22['messages'] = text22['Text'].str.contains('|'.join(messages_list), case=False)
    
    # consider other categories of issues that S22 frequently suffer & topics that are discussed about S22



    # Should check whether each category contains a good chunk of healthily distributed R & N VOC
        # For each category, isolate all the 'True' rows & count up R & N
        # Use the category for CART if a good chunk of S22 is accurately represented with the keywords
            # assigned for that category. Also keep it even for small chunk, if it helps seclude tricky
            # VOC's and make the rest of the data more "pure" of outlier patterns
                # ex: if biometrics issues for S22 largely occur post-update, it'll help remove update-
                # related VOCs, which exhibit strong, temporary pattern. Can check correlation between
                # biometrics and updates as well, using PCA... there's no end to how complex I can go...
    # To improve quality of this data, I should manually determine the category of each VOC instead of
        # categorizing by the words that are mentioned. This will take FOREVER but will lend better data
            # Keep in mind I have to re-open all of old Reddit VOC to record the "flair" of each S22 user
            # Since I'm doing quick concept test for now, just go with keywords mentioned
    # Other than major categories, also super worth categorizing by update/non-update
        # this will help better predict issues around update release times


    return text22




# ============================================================================================================= 9


def shift_values_over(df22, index):
    # Shift columns of values over to the right by one unit from 2nd column
    # Takes DataFrame index object like below:
    # df22[df22.Class.apply(lambda x: x == 'None')].index

    shift_rows = df22.loc[index]   # rows where columns shifted to left
    shift_values = shift_rows[df22.columns[1:-1]]
    shift_rows[df22.columns[2:]] = shift_rows[df22.columns[1:-1]]
    shift_rows.Content = ''
    df22.loc[df22[df22.Class.apply(lambda x: x == 'None')].index] = shift_rows
    
    return df22



# ============================================================================================================= 9


def ROC_AUC(y_test, y_score, fpr_tpr=None, filename=None, directory=None):
    # Creates an ROC curve & computes AUC of the test data based on how it performs on the trained model
    # Also calculates precision, recall & accuracy, and plots the position on ROC based on performance
    # y_test, y_score, tp_fp need to be lists

    # Create ROC curve and compute AUC
    fpr, tpr, thresholds = roc_curve(y_test[:], y_score[:], pos_label=1)    # 'thresholds' go in '_'
    roc_auc = auc(fpr, tpr)
    #print('fpr is', fpr)
    #print('tpr is', tpr)
    #print('thresholds', thresholds)
    print('AUC : ', roc_auc)

    # ROC curve
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color="darkorange",
        lw=lw, label="ROC curve (area = %0.2f)" % roc_auc,)
    plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
    if fpr_tpr is not None:
        plt.plot(fpr_tpr[0], fpr_tpr[1], color="green", lw = 3, marker='x')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    if filename is None:
        filename = ""
    plt.title(f'{filename[10:-4]} ROC Curve')     # test_data/test_1012.csv --> test_1012
    plt.legend(loc="lower right")
    plt.show()

    if directory is not None:
        # save plot to directory
        f = os.path.join(directory, filename[10:-4])
        plt.savefig(f'{f}-roc.png')

    # Need to polish this



def confusion_matrix_display(y_test, y_pred, filename=None, directory=None):
    # inputs: actual labels & prediction labels
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    cm = confusion_matrix(y_test, y_pred)     # labels=clf.classes_
    disp = ConfusionMatrixDisplay(confusion_matrix = cm)     # display_labels=clf.classes_
    disp.plot()
    if filename is None:
        filename = ""
    plt.title(f'{filename[10:-4]} Confusion Matrix')     # test_data/test_1012.csv --> test_1012
    plt.show()

    if directory is not None:
        # save plot to directory
        f = os.path.join(directory, filename[10:-4])
        plt.savefig(f'{f}-confmat.png')

    return tn, fp, fn, tp



def prec_rec_accu(y_test, y_pred):
    # y_test & y_pred need to be 1-D arrays

    precision3 = precision_score(y_test, y_pred)
    recall3 = recall_score(y_test, y_pred)
    accuracy3 = (y_pred3==y_test.Class).sum()/len(y_pred3)
    print('For test data: ')
    print('Precision: {} / Recall: {} / Accuracy: {}'.format(
        round(precision3, 3), round(recall3, 3), round(accuracy3, 3)))
    
    return precision3, recall3, accuracy3

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### BERT Practice
#### v. 20230118

In [57]:
# pip3 install torch torchvision torchaudio     # for PyTorch without GPU, just CPU


In [58]:
!pip install transformers -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [59]:
# Check if the GPU can be detected
import tensorflow as tf

device_name = tf.test.gpu_device_name()              # '/device:GPU:0' means GPU is enabled
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [60]:
import torch
# torch.cuda.is_available = lambda : False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla T4'

In [61]:
!cat /proc/meminfo            # check memory resources available

MemTotal:       13297192 kB
MemFree:          565624 kB
MemAvailable:    7973144 kB
Buffers:          362748 kB
Cached:          7137612 kB
SwapCached:            0 kB
Active:          1187008 kB
Inactive:       11033440 kB
Active(anon):        916 kB
Inactive(anon):  4708412 kB
Active(file):    1186092 kB
Inactive(file):  6325028 kB
Unevictable:           0 kB
Mlocked:               0 kB
SwapTotal:             0 kB
SwapFree:              0 kB
Dirty:              1552 kB
Writeback:             0 kB
AnonPages:       4720160 kB
Mapped:          1103328 kB
Shmem:             15584 kB
KReclaimable:     222452 kB
Slab:             277096 kB
SReclaimable:     222452 kB
SUnreclaim:        54644 kB
KernelStack:        5184 kB
PageTables:        41860 kB
NFS_Unstable:          0 kB
Bounce:                0 kB
WritebackTmp:          0 kB
CommitLimit:     6648596 kB
Committed_AS:    6739124 kB
VmallocTotal:   34359738367 kB
VmallocUsed:       53756 kB
VmallocChunk:          0 kB
Percpu:          

In [62]:
# install wandb for tracking data on dashboard
!pip install datasets wandb evaluate -qU
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/text-classification/run_glue.py

--2023-01-19 21:22:55--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/text-classification/run_glue.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27264 (27K) [text/plain]
Saving to: ‘run_glue.py.1’


2023-01-19 21:22:55 (77.9 MB/s) - ‘run_glue.py.1’ saved [27264/27264]



In [63]:
# the run_glue.py script requires transformers dev
!pip install -q git+https://github.com/huggingface/transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [64]:
import wandb

# log in to have data synced to account
# wandb.login()

# log every trained model
%env WANDB_LOG_MODEL=true

env: WANDB_LOG_MODEL=true


In [65]:

# https://github.com/PradipNichite/Youtube-Tutorials/blob/main/FineTune_BERT_Model_Youtube.ipynb



import pandas as pd

df22 = pd.read_csv('/S22_train20221011.csv', sep='\t', index_col=0).drop_duplicates()

# repair rows where values are shifted by 1 column
shift_rows = df22.loc[df22[df22.Class.apply(lambda x: x == 'None')].index]   # rows where columns shifted to left
shift_values = shift_rows[df22.columns[1:-1]]
shift_rows[df22.columns[2:]] = shift_rows[df22.columns[1:-1]]
shift_rows.Content = ''
df22.loc[df22[df22.Class.apply(lambda x: x == 'None')].index] = shift_rows

# filter training data to desired dates
df22 = df22[df22.Date.apply(lambda x: datetime.strptime(x,'%m/%d/%Y %H:%M').date() > datetime(2022,2,21).date())]

# For class imbalance, use roughly same ratio of R & N
######################################################### BERT doesn't need class imbalance addressed
# df22 = pd.concat([df22[df22.Class=='R'], df22[df22.Class=='N'].iloc[::3, :]])

# combine title & content as text22, clean the text, then combine it with labels to a single df  
text22, classes22 = data_split(df22)
df22, classes22 = drop_dupe_text(text22, classes22)

# Preprocessing & data cleaning not required; BERT uses all info in sentence (punctuation, stopwords, etc)   
df22['Class'] = classes22['Class']

df22.head()




################################ DELETE THIS LINE LATER (purpose: reduce data to reduce training time to test features)
# df22 = df22.sample(frac=0.01, random_state=5)






Unnamed: 0,Text,Class
452,Zfold 3 or S22 ultra Trying to decide between ...,0
453,S22 video cam Anyone tried out the video camer...,0
454,Thinking about trading my S21Ultra for S22+ Ev...,0
455,S21 Ultra vs S22+ Both phones are currently at...,0
456,"S21 or S22 Base/Standard Model Hey All,\n\nLoo...",0


In [66]:
df22['Class'].value_counts()

0    13926
1     3658
Name: Class, dtype: int64

In [67]:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer


from transformers import BertTokenizer, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)


loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_positio

In [68]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [69]:
# model = model.to( torch.device('cuda') )     # need NVIDIA driver for 'cuda'; currently have AMD on work laptop
# model = model.to('cpu')         # train on CPU

model = model.to('cuda')          # or  model.cuda()


In [70]:

test_data = ["This is possibly the worst battery I have ever seen on a mobile device",
            "How is my device running so smoothly?"]
tokenizer(test_data, padding=True, truncation=True, max_length=512)

{'input_ids': [[101, 2023, 2003, 4298, 1996, 5409, 6046, 1045, 2031, 2412, 2464, 2006, 1037, 4684, 5080, 102], [101, 2129, 2003, 2026, 5080, 2770, 2061, 15299, 1029, 102, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]}

In [71]:
X = list(df22["Text"])
y = list(df22["Class"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,stratify=y)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [72]:
X_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [73]:
print(X_train_tokenized['attention_mask'][0])

[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [74]:
len(X_train),len(X_val)


(14067, 3517)

In [75]:
# Create torch dataset
class VOC_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [76]:
train_dataset = VOC_Dataset(X_train_tokenized, y_train)
val_dataset = VOC_Dataset(X_val_tokenized, y_val)

In [77]:
train_dataset[5]


{'input_ids': tensor([  101,  1055, 19317, 11087,  2417,  7262,  3475,  1005,  1056,  1012,
          1012,  1012,  2417,  1029,  2633,  2288,  2026,  1055, 19317, 11087,
          5359,  2651,  1998,  2009,  1005,  1055,  2025,  2130,  2485,  2000,
          2417,  1012,  2009,  1005,  1055, 11034,  2012,  2190,  1012,  2428,
          1010,  2428,  9364,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [78]:
def compute_metrics(m):
    print(type(m))
    pred, labels = m
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
     

In [79]:
# Define Trainer
args = TrainingArguments(
    report_to = 'wandb',                     # enable logging to W&B
    output_dir="output",                     # output directory
    num_train_epochs=1,
    per_device_train_batch_size=8
    # overwrite_output_dir = True,
    # evaluation_strategy = 'steps',          # check evaluation metrics at each epoch
    # learning_rate = 5e-5,                   # we can customize learning rate
    # max_steps = 30000,
    # logging_steps = 100,                    # we will log every 100 steps
    # eval_steps = 5000,                      # we will perform evaluation every 500 steps
    # save_steps = 10000,
    # load_best_model_at_end = True,
    # metric_for_best_model = 'accuracy',
    # run_name = 'custom_training'            # name of the W&B run

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,            # for padding batched data
    compute_metrics=compute_metrics
)

PyTorch: setting up devices


In [80]:
from timeit import default_timer as timer
start = timer()

trainer.train()


end = timer()
print("%4f seconds, %4f minutes elapsed" % (float(end-start), float((end-start)/60)))

***** Running training *****
  Num examples = 14067
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1759
  Number of trainable parameters = 109483778
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
500,0.3708
1000,0.3079
1500,0.2796


Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /tmp/tmp7qc0kwc5
Configuration saved in /tmp/tmp7qc0kwc5/config.json
Model weights saved in /tmp/tmp7qc0kwc5/pytorch_model.bin


1308.139700 seconds, 21.802328 minutes elapsed


In [81]:
trainer.evaluate()


***** Running Evaluation *****
  Num examples = 3517
  Batch size = 8


<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.2499202936887741,
 'eval_accuracy': 0.9033266988911004,
 'eval_precision': 0.7768361581920904,
 'eval_recall': 0.7513661202185792,
 'eval_f1': 0.763888888888889,
 'eval_runtime': 112.6355,
 'eval_samples_per_second': 31.225,
 'eval_steps_per_second': 3.906,
 'epoch': 1.0}

In [82]:
np.set_printoptions(suppress=True)


In [105]:
text = "Super charging is working very well."
# text = "so many issues with this phone."
inputs = tokenizer(text,padding = True, truncation = True, return_tensors='pt').to('cuda')
outputs = model(**inputs)
print(outputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)    # also gelu(), silu()
print(predictions)
predictions = predictions.cpu().detach().numpy()
predictions

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.8727, -1.0109]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[ 0.6155, -0.2697]], device='cuda:0', grad_fn=<SiluBackward0>)


array([[ 0.61548567, -0.26971212]], dtype=float32)

In [84]:
trainer.save_model('BertPractice')

Saving model checkpoint to BertPractice
Configuration saved in BertPractice/config.json
Model weights saved in BertPractice/pytorch_model.bin


In [85]:
# trainer.save_model('/content/drive/MyDrive/Youtube Tutorials/toxic')
# model_2 = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/Youtube Tutorials/toxic")
# model_2.to('cuda')

In [86]:
model_2 = BertForSequenceClassification.from_pretrained('BertPractice')
model_2.to('cuda')

loading configuration file BertPractice/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file BertPractice/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were i

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [115]:
# text = "Super charging is working very well."
text = "so many issues with this phone."
inputs = tokenizer(text,padding = True, truncation = True, return_tensors='pt').to('cuda')
outputs = model_2(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().detach().numpy()
predictions

array([[0.09049978, 0.90950024]], dtype=float32)

In [140]:

#today's VOC
# import file and combine title & content
voctoday = pd.read_csv('/0119.csv', sep='\t', index_col=None).drop_duplicates()
voctoday['title'] = voctoday['title'].astype(str) + " " + voctoday['content'].astype(str)
voctoday = voctoday[['title', 'class']]

voclist = list(voctoday['title'])
predlist = []

for i in range(len(voclist)):
  inputs = tokenizer(voclist[i], padding = True, truncation = True, return_tensors='pt').to('cuda')
  outputs = model_2(**inputs)
  predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
  predictions = predictions.cpu().detach().numpy()
  predlist.append(predictions)

organizedzip = zip([x[:25] for x in voclist],
                   predlist,
                   voctoday['class'])
    
# results as voc text, predictions, and actual value
print(list(organizedzip))


[('2nd January update I just', array([[0.9971807 , 0.00281931]], dtype=float32), 'N'), ('No Questions just wanted ', array([[0.9981193 , 0.00188064]], dtype=float32), 'N'), ('S22u vs S23u Having time ', array([[0.9957015 , 0.00429845]], dtype=float32), 'N'), ('No more lag and stutter a', array([[0.7561109, 0.2438891]], dtype=float32), 'N'), ('Qualcomm Bluetooth Codecs', array([[0.99762386, 0.00237617]], dtype=float32), 'N'), ('Sound Issues Hey everyone', array([[0.07707566, 0.92292434]], dtype=float32), 'R'), ('Samsung Leather Case vs O', array([[0.9981981, 0.0018019]], dtype=float32), 'N'), ('Weather widget help nan', array([[0.97075576, 0.02924431]], dtype=float32), 'R'), ('battery concern I can squ', array([[0.9247212 , 0.07527879]], dtype=float32), 'R'), ('S22 Ultra Camera Issued H', array([[0.06688055, 0.9331195 ]], dtype=float32), 'R'), ('Single Take Functionality', array([[0.97552943, 0.02447058]], dtype=float32), 'N'), ('So I already had the Janu', array([[0.9965773 , 0.0034226

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e5caba9f-cd36-4d50-aaa3-2cf59957a2f4' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>