In [1]:
import spacy
import pandas as pd
import string
import numpy

In [2]:
# Load the spaCy 'en_core_web_md' pipeline once; later cells use it for
# tokenization, stop words, lemmas and the doc.vector / doc.vector_norm values.
nlp = spacy.load('en_core_web_md')

In [6]:
# Forward slash keeps the path portable across operating systems and avoids
# the invalid '\8' escape sequence that the backslash spelling contained.
filename = 'dataset/8000_utterances.csv'

In [63]:
# Read the utterance CSV into a dataframe; later cells work on its 'utterance' column.
df = pd.read_csv(filename)

In [77]:
class Text_Pre_Processor:
    '''Clean raw utterances for TF-IDF and semantic-similarity comparison.

    The pipeline deletes punctuation, drops stop words and replaces every
    remaining token with its lemma.

    Args:
        nlp: initialized spaCy pipeline (callable on a string, returning a
            Doc). Every method uses this instance, never a module-level global.

    Output:
        ``process`` returns a Doc built from the lemmatized text of the input,
        without punctuation or stop words.
    '''

    def __init__(self, nlp):
        self.nlp = nlp

    def remove_punct(self, text):
        '''Return ``text`` with every character of ``string.punctuation`` deleted.'''
        return text.translate(str.maketrans('', '', string.punctuation))

    def stop_word_removal(self, text):
        '''Tokenize ``text`` and return a new Doc without its stop-word tokens.

        A token is dropped when its lower-cased text is in the pipeline's
        ``Defaults.stop_words``. The surviving token texts are joined with
        single spaces and parsed again so the caller receives a Doc.
        '''
        stop_words = self.nlp.Defaults.stop_words
        kept_tokens = [
            token.text
            for token in self.nlp(text)
            if token.text.lower() not in stop_words
        ]
        return self.nlp(' '.join(kept_tokens))

    def lemmatization(self, doc):
        '''Return a new Doc parsed from the space-joined lemmas of ``doc``.'''
        # str.join replaces repeated string concatenation and leaves no
        # trailing space in the text handed back to the pipeline.
        return self.nlp(' '.join(token.lemma_ for token in doc))

    def process(self, utterance):
        '''Run the full pipeline on one utterance string.

        Order: punctuation removal -> stop-word removal -> lemmatization.
        Returns the Doc produced by the last step.
        '''
        text = self.remove_punct(utterance)
        doc = self.stop_word_removal(text)
        return self.lemmatization(doc)

In [98]:
# Reload the raw CSV: the next cell overwrites the 'utterance' column in place,
# so reloading lets that cell start from untreated data again.
df = pd.read_csv(filename)

In [100]:
# Preprocess every utterance into lemmas, attach its spaCy vector data to the
# dataframe, then export the treated dataframe.

text_processor = Text_Pre_Processor(nlp)

# Results are collected in plain lists and each column is assigned once.
# Pre-filling 'semantic_value' with 0.0 and then writing strings into it cell
# by cell forces a float64 -> object upcast, which pandas has deprecated, and
# positional df.at[i, ...] writes only work with a default RangeIndex.
lemmatized_utterances = []
semantic_values = []
vector_norms = []

for text in df['utterance']:
    doc = text_processor.process(text)
    lemmatized_utterances.append(' '.join(token.text for token in doc))

    # The vector is stored as its string form rather than as an array;
    # downstream code has to parse it back into numbers (e.g. for sklearn).
    semantic_values.append(str(doc.vector))

    # float() keeps this column float64, as it was when pre-filled with 0.0.
    vector_norms.append(float(doc.vector_norm))

df['utterance'] = lemmatized_utterances
df['semantic_value'] = semantic_values
df['semantic_value_vec_norm'] = vector_norms

# Export the treated dataframe as CSV for use in the next steps.
df.to_csv('treated_data.csv', index=False)

In [101]:
# Check dtypes and non-null counts now that the two derived columns exist.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8175 entries, 0 to 8174
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   flags                    8175 non-null   object 
 1   utterance                8175 non-null   object 
 2   category                 8175 non-null   object 
 3   intent                   8175 non-null   object 
 4   semantic_value           8175 non-null   object 
 5   semantic_value_vec_norm  8175 non-null   float64
dtypes: float64(1), object(5)
memory usage: 383.3+ KB


In [102]:
# Preview the first rows: 'utterance' now holds the processed text, alongside
# the stringified vector and its norm.
df.head()

Unnamed: 0,flags,utterance,category,intent,semantic_value,semantic_value_vec_norm
0,BM,problem cancel order,ORDER,cancel_order,[ 0.9792027 1.7512335 -2.1456368 -0.207200...,34.474487
1,BIM,find information cancel order,ORDER,cancel_order,[ 1.818227 -0.20782498 -1.0179275 -0.259152...,36.865709
2,B,need help cancel order,ORDER,cancel_order,[ 3.3943768 2.520625 -4.8734274 -0.143539...,42.555865
3,BIP,help cancel order,ORDER,cancel_order,[ 1.948936 1.3805002 -4.733937 0.601046...,40.969231
4,B,problem cancel order,ORDER,cancel_order,[ 0.9792027 1.7512335 -2.1456368 -0.207200...,34.474487
