In [34]:
import pandas as pd
import os
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from sklearn import preprocessing
import os
from gensim.models import KeyedVectors
import nlpaug.augmenter.word as naw

In [35]:
def get_cats(df):
    import pandas as pd
    cats = []
    for i in range(0,len(df)):
        row = df.iloc[i,:]
        cats.append(list(row[~pd.isna(row.values)].index))
    return pd.Series(cats)

def get_lang_detector(nlp, name):
    """Adds language detection to spaCy pipeline"""
    return LanguageDetector()

def fix_ShortComments(df, col = 'comments'):
    """Replaces comments with three or less characters with 'no comments found'"""
    m = df[col].str.len().values <= 3
    df[col] = df[col].mask(m, 'no comment found')
    return df

def remove_CharRepetition(string):
    """Reduces 3 or more same character repetition to one: pppleeeease -> please"""
    import re
    def extract(match_obj):
        if match_obj.group() is not None:
            char = match_obj.group(0)[0]
        return char
    return re.sub(r"([aeiou])\1\1+|([b-df-hj-np-tv-z])\2\2+|(\W)\3+", extract, string)

def remove_Punctuation(text):
    """Removes all punctuation symbols but apostrophes, commas and periods"""
    import string, re
    remove = string.punctuation
    remove = remove.replace(".", "")
    remove = remove.replace(",", "")
    remove = remove.replace("'","")
    pattern = r"[{}]|[{}]$|^[{}]".format(re.escape(remove), re.escape(string.punctuation), re.escape(string.punctuation))
    
    return re.sub(pattern, "", text)

def text_cleaning(df, col = 'comments'):
    """
    1. Replaces any character representing a whitespace
    2. Removes extra white spaces
    3. Unescapes unicode and html characters
    4. Removes web sites and emails
    5. Removes white spaces at the begining/end of the comment
    6. Removes not word characters
    """
    import html, string
    from unidecode import unidecode
    return (df[col]
            .str.translate(str.maketrans(string.whitespace, "      "))# Replaces whitespace characters ' \t\n\r\x0b\x0c'
            .map(unidecode, na_action = 'ignore')# Decodes characters with unicode encoding
            .map(html.unescape, na_action = 'ignore')# Unescapes HTML characters
            .replace('[Ff]rom case [a-zA-Z]{4,}\S*:\s?|[Tt]o case [a-zA-Z]{4,}\S*:\s?','', regex = True) # Removes a web classificator 'From case:'
            .replace("(https\W*\S*|www\W*\S*)|(\S*@\S*\s?)", "", regex = True)# Removes websites and emais###(https\W*\S*|www\W+\S+)|(\S*@\S*\s?)
            .apply(remove_CharRepetition)# Removes repetition of 3 or more characters, i.e., pllleeeease --> please
            .replace("^\s+|\s+$", "", regex = True)# Removes extra spaces at the begining/end of the comment
            .apply(remove_Punctuation)# Removes every punctuation symbol but apostrophes, commas and periods
            .replace("\s{2,}", " ", regex = True)# Removes extra whitespaces
            .str.lower()
            )

def generate_Doc(df, col = 'comments'):
    import swifter
    """Generates a spaCy Doc per comment"""
    return df[col].swifter.apply(nlp)

def count_Sents(df, col = 'Doc'):
    """Counts the number of sentences per comment"""
    return [len(list(Doc.sents)) for Doc in df[col]]

def get_Lang(df, col, info):
    """Extracts the language information from nlp object"""
    return [Doc._.language.get(info) for Doc in df[col]]

def remove_ShortComments(df, col = 'comments'):
    """Removes comments whose length is < 2"""
    return df[df[col].apply(len) > 2]

def delete_AgentComments(df, col1 = 'comments', col2 = 'Doc'):
    """
    Deletes comments typed by a contact center agent
    
    The entries are:
    
    df: the complete data frame
    col1: the column containing the comments
    col2: the column containing the nlp object
    
    Returns those elements of the data frame that contain only customer comments based on the grammatical structure of the sentences;
    guest/customer word doesn not appear as nominal subject followed by a proper noun
    For more information see https://universaldependencies.org/en/dep/nsubj.html    
    """
    from spacy.matcher import Matcher
    import swifter
    series = df[df[col1].str.contains('guest|customer', regex = True, na = False)][col2]
    matcher = Matcher(nlp.vocab)
    pattern = [[{'LOWER': {'IN':['guest', 'customer']}, 'POS': 'PROPN', 'DEP': 'nsubj'}]]
    matcher.add('CustomerGuest', pattern)
    matches = series.swifter.progress_bar(False).apply(matcher)
    # return pd.Series([True if item != 0 else False for item in matches.apply(len)])
    return df[~df.index.isin(matches[matches.apply(len) != 0].index)]

def create_synthetic_labels(data:pd.DataFrame(), min_num_labels:int=100):
    '''
    Returns a new DataFrame with synthetic data. Parameters are input dataframe, minimum number of labels, 
    and list of levels that sythetic data should be applied to.

            Parameters:
                    data (pd.DataFrame()): Pandas DataFrame
                    min_num_labels (int): Decimal Integer, with default value of 100

            Returns:
                    result_df (pd.DataFrame): Resulting dataframe with synthetic labels
    '''
    issue_cat_list = [item for item in data['labels'].value_counts().items()]
    for issue_cat, issue_num in issue_cat_list:
        if issue_num < min_num_labels:
            synth_add_num = min_num_labels-issue_num
            real_cat_list = data[data['labels']==issue_cat].index.tolist()
            real_cat_num = len(real_cat_list)
            quotient = int(synth_add_num/real_cat_num)
            remainder = synth_add_num%real_cat_num
            remainder_count = 1
            for real_cat_item in real_cat_list:
                if quotient > 0:
                    for count in range(quotient):
                        temp_row = data.iloc[real_cat_item]
                        temp_row['data_type'] = 'synthetic'
                        temp_row['comments'] = aug1.augment(temp_row['comments'])
                        data = data.append(temp_row,ignore_index=True)
                if (remainder > 0) & (remainder_count<=remainder):
                        temp_row = data.iloc[real_cat_item]
                        temp_row['data_type'] = 'synthetic'
                        temp_row['comments'] = aug1.augment(temp_row['comments'])
                        data = data.append(temp_row,ignore_index=True)
                        remainder_count=remainder_count+1
  
    return data

In [44]:
files = []
rawFolder = 'Smoketests/'
preprocesedFolder = 'Preprocessed/'
for entry in os.listdir(rawFolder):
    if 'csv' in entry or 'parquet' in entry:
        files.append(entry)
dataset = files[3]

In [30]:
le = preprocessing.LabelEncoder()

nlp = spacy.load('en_core_web_lg')
Language.factory("language_detector", func = get_lang_detector)
nlp.add_pipe('language_detector', last = True)
# pd.set_option('display.max_columns', None

In [8]:
%%time
df = (pd.read_parquet(dataset)
        .query("CallCentreRep == 'Web Comment'")
        .reset_index(drop=True)
        .filter(regex='[Mm]sg[DL]|\w+\W', axis = 1)
        .assign(cats = lambda x: get_cats(x.iloc[:,4:]))
        .filter(items = ['MsgDate', 'MsgLevelDescript', 'MsgLevel', 'MsgDetails', 'cats'])
        .rename(columns = {'MsgDetails': 'comments', 'MsgLevelDescript': 'labels'})
        .fillna({"comments": 'No comment found'})# imputing missing values
      # .query("description == 'Web Comment' & issue != 'Lv 4 - Hang Up/Wrong Number/Disconnect/Prank'")# removing phone comments
      # .drop_duplicates(subset='case_id')# removing duplicate cases
        .reset_index(drop = True)
        .pipe(fix_ShortComments)# imputing comments with <=3 characters
        .assign(comments = lambda x: text_cleaning(x, 'comments')# cleaning text by removing: html, unicode, punctuation, etc
                , labels = lambda x: x['labels'].replace('Level\s\d\s\W', '', regex = True)# Removes 'Level - D' from each row
                , Doc = lambda x: generate_Doc(x, 'comments')# generating nlp objects
                , NumSents = lambda x: count_Sents(x, 'Doc')# counting sentences per comment (spaCy)
                , NumWords = lambda x: x['Doc'].apply(len)# counting the words per comment
                , Lang = lambda x: get_Lang(x, 'Doc', info = 'language')# extracting language from nlp object
                , Score = lambda x: get_Lang(x, 'Doc', info = 'score')# extracting language score from nlp object
                )
        .pipe(remove_ShortComments)# Removes comments whose length is < 2 (mostly empty/one-character comments
        .query("Lang != 'es' & Score > 0.70")
        .pipe(delete_AgentComments)
      )
df.shape

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


Pandas Apply:   0%|          | 0/9375 [00:00<?, ?it/s]

CPU times: user 5min 1s, sys: 2.36 s, total: 5min 3s
Wall time: 5min 1s


(9345, 10)

### NLP Aug

In [9]:
os.environ["MODEL_DIR"] = 'model'
model_dir = 'model/'
aug1 = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")

In [None]:
# df['issues'] = le.fit_transform(df['MsgLevelDescript'])

In [11]:
output_df = df[['comments', 'labels']].copy()
output_df['data_type'] = 'real'

In [13]:
%%time
output_ready_df = create_synthetic_labels(output_df, 300)

CPU times: user 4d 26min, sys: 2h 10min 40s, total: 4d 2h 36min 41s
Wall time: 12h 44min 36s


In [27]:
output_ready_df.comments = output_ready_df.comments.apply(''.join)

In [15]:
output_ready_df.head()

Unnamed: 0,comments,labels,data_type
0,"[sent sunday, july 29 to 31, 2022 731 pm to fa...",DQ Cake,synthetic
1,[general corporate request yes i know that i s...,Other Request/Suggestion,synthetic
2,[general item of sales promotional notice corp...,Dilly Bar,synthetic
3,"sent sunday, july 31, 2022 741 pm to fan relat...",DQ Mobile App,real
4,points and rewards please add my purchase to m...,DQ Mobile App Points,real


In [32]:
output_ready_df.to_parquet(preprocesedFolder+dataset,engine = 'pyarrow',compression = 'gzip')