In [1]:
import spacy
import pandas as pd
import string
import numpy

In [2]:
nlp = spacy.load('en_core_web_md')

In [3]:
filename = 'dataset\8000_utterances.csv'

In [4]:
df = pd.read_csv(filename)

In [5]:
class Text_Pre_Processor:
    '''Prepare dataframe for use in Tf-id and semantic similarity comparison. 
    
    Args: 
    nlp= initialized spaCy 
    df = prepared pandas dataframe
    
    Output: 
    Doc object with lemmatized text version of input without punct or stop words'''
    def __init__(self, nlp):
        self.nlp = nlp
     
    #input string, remove punct, return new string
    def remove_punct(self, text):
        text = text.translate(str.maketrans('','',string.punctuation))
        return text
        
    #input text, remove text, return new doc.
    def stop_word_removal(self, text):
        doc = nlp(text)
        stop_word_index = []
        for index, token in enumerate(doc):
            if token.text.lower() in nlp.Defaults.stop_words:
                stop_word_index.append(index)
        filtered_tokens = [token.text for i,token in enumerate(doc) if i not in stop_word_index]
        doc = nlp(' '.join(filtered_tokens))
        return doc
        
    #input doc, return lemmatized doc
    def lemmatization(self, doc):
        lemmas = ''
        for token in doc:
            lemmas = lemmas + (token.lemma_) + ' '
        return nlp(lemmas)
        
    #Perform all processing on text input
    def process(self, utterance):
        text = self.remove_punct(utterance)
        doc = self.stop_word_removal(text)
        doc = self.lemmatization(doc)
        return doc

In [16]:
#preprocess text into lemmas, add semantic vector data into dataframe per entry, export into new dataframe

text_processor = Text_Pre_Processor(nlp)
#add new column to dataframe:
#df['semantic_value'] = 0.0
df.insert(len(df.columns), 'semantic_value', pd.Series([[1, 2, 3], 'a'], dtype=object))

df['semantic_value_vec_norm'] = 0

for i, text in enumerate(df.loc[:, 'utterance']):
#for i, text in enumerate(df.loc[0:5, 'utterance']):
    doc = text_processor.process(text)
    new_utterance = []
    for token in doc:
        new_utterance.append(token.text)
    df.at[i, 'utterance'] = ' '.join(new_utterance)
    
    #Cannot put array into df cell. Could convert to str and then back to list for sklearn?
    df.at[i, 'semantic_value'] = doc.vector
    
    df.at[i, 'semantic_value_vec_norm'] = doc.vector_norm
    
#export treated DF for use in next steps
df.to_csv('treated_data.csv', index=False)
df.to_json('treated_data.json', index=False)

ValueError: cannot insert semantic_value, already exists

In [18]:
df.to_json('treated_data.json')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8175 entries, 0 to 8174
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   flags                    8175 non-null   object 
 1   utterance                8175 non-null   object 
 2   category                 8175 non-null   object 
 3   intent                   8175 non-null   object 
 4   semantic_value           8175 non-null   object 
 5   semantic_value_vec_norm  8175 non-null   float64
dtypes: float64(1), object(5)
memory usage: 383.3+ KB


In [13]:
df.head()

Unnamed: 0,flags,utterance,category,intent,semantic_value,semantic_value_vec_norm
0,BM,problem cancel order,ORDER,cancel_order,"[0.9792027, 1.7512335, -2.1456368, -0.20720005...",34.474487
1,BIM,find information cancel order,ORDER,cancel_order,"[1.818227, -0.20782498, -1.0179275, -0.2591524...",36.865709
2,B,need help cancel order,ORDER,cancel_order,"[3.3943768, 2.520625, -4.8734274, -0.1435399, ...",42.555865
3,BIP,help cancel order,ORDER,cancel_order,"[1.948936, 1.3805002, -4.733937, 0.60104674, 2...",40.969231
4,B,problem cancel order,ORDER,cancel_order,"[0.9792027, 1.7512335, -2.1456368, -0.20720005...",34.474487


In [15]:
df.loc[0, "semantic_value"]

array([ 0.9792027 ,  1.7512335 , -2.1456368 , -0.20720005,  4.6970334 ,
        2.1917    ,  1.2470099 ,  0.81709   , -1.1547333 ,  1.7342666 ,
        5.3666434 ,  3.5145    , -4.395563  ,  2.4533    , -2.3689833 ,
        1.6324171 ,  4.0568333 , -0.75202674, -3.0815334 , -1.2708668 ,
       -2.0174    , -0.52185637, -2.0121167 ,  2.9151165 , -2.1957    ,
       -1.2282    , -1.7814001 , -0.01191   , -2.6512668 ,  2.0717766 ,
        2.1749332 , -0.13233997, -0.89767   ,  1.3347334 ,  0.86010337,
       -0.44156003,  2.35814   , -0.8203433 ,  2.3345733 ,  3.2949002 ,
       -0.23484336,  1.07331   , -0.13427   , -0.47432336, -1.6106133 ,
        0.48746333,  5.7774334 , -0.9473067 , -0.18473339,  1.1517333 ,
       -1.4760332 ,  0.40541002, -0.07463332, -4.1847997 , -4.623733  ,
        1.8456335 , -0.8551033 ,  2.0120666 ,  0.09019998,  2.8318336 ,
        3.9416    , -1.3601333 , -3.1752331 , -1.3463668 ,  2.6987    ,
        1.9365698 , -2.0134666 , -3.2986333 , -0.28944668,  3.34