In [14]:
# Load packages
import os
import pickle
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import plotly.express as px

from progressbar import ProgressBar

In [15]:
from offcorss_functions.data_exploring_functions import *
from offcorss_functions.classification_functions import *

In [16]:
# Load the product vocabulary; the 'Unnamed: 0' column (presumably a saved
# index from an earlier to_csv call) is dropped.
products_file  = 'offcorss_products.csv'
products_words = pd.read_csv(products_file).drop(columns = 'Unnamed: 0')

# Path to the pickled classifier used later for scoring
classifier_file ="./classifier.pickle"

# Load the lexicon that is passed to get_comment_features
known_words = pd.read_csv('./offcorss_functions/OFFCORSS_lexicon.csv')

# Use a context manager so the file handle is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; only load a classifier
# file that comes from a trusted source.
with open(classifier_file, 'rb') as f:
    logit_fit = pickle.load(f)

In [29]:
# File names and base path of the NPS response exports.
base_datapath = os.path.join('data', 'OFFCORSS_NPS')
file_list = ['NPS_Responses',
            'nps_responses_2018-04-14-1602181341',
            'nps_responses_2018-05-10-1602181334',
            'nps_responses_2018-06-14-1602181329',
            'nps_responses_2018-07-09-1602181324',
            'nps_responses_2018-08-09-1602181317',
            'nps_responses_2018-10-07-1602181309',
            'nps_responses_2018-11-09-1602181299',
            'nps_responses_2018-12-05-1602181268',
            'nps_responses_2019-02-05-1602181263',
            'nps_responses_2020-08-10-1602181258',
            'nps_responses_2020-09-21-1602181250',
            ]

# Read every file into its own frame and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all previous rows
# on every iteration (quadratic), and needed a len()==0 sentinel on a variable
# that was sometimes a list and sometimes a DataFrame.
loaded_frames = []
for cur_file in file_list:
    cur_path = os.path.join(base_datapath, cur_file + '.csv')
    cur_data = pd.read_csv(cur_path)
    # Unparseable timestamps become NaT (errors='coerce'); only the date part is kept.
    cur_data['Survey Date'] = pd.to_datetime(cur_data['Survey Date'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    cur_data['Survey Date'] = cur_data['Survey Date'].dt.date
    loaded_frames.append(cur_data)

data_comments            = pd.concat(loaded_frames, axis=0).reset_index(drop = True)
# Missing comments are treated as empty strings so the text column is always a str.
data_comments['Comment'] = data_comments['Comment'].fillna('')
data_comments['text']    = data_comments['Comment'].str.lower()

In [30]:
# Build, for every response, (a) its tokenized comment, (b) a 0/1 indicator per
# product saying whether the product's stem occurs in the comment, and (c) the
# five numeric features returned by get_comment_features.
data_responses = data_comments.reset_index(drop = True)

vt_comment = []
vt_tokens  = []

vt_mean    = []
vt_sum     = []
vt_size    = []
vt_min     = []
vt_max     = []
vt_sentim  = []   # not filled in this cell; kept so the name stays defined


products_stm = stem_tokens(products_words.products)
nm_lenData   = len(data_responses)
nm_products  = len(products_words)

# Preallocate the indicator matrix; np.append inside the loop would copy the
# whole array on every row (quadratic in the number of comments).
vt_products = np.zeros((nm_lenData, nm_products), dtype=int)

# Pull the text column out once instead of a .loc lookup per row.
comment_texts = data_responses['text'].tolist()

pbar = ProgressBar()

for idx in pbar(range(0,nm_lenData)):

    # Get comment
    cur_comm = comment_texts[idx]

    # Get tokenized comment
    tokens, words = get_comment_tokens(cur_comm)

    # Get products array.
    # assume_unique=True was dropped: neither the product stems nor the words
    # of a comment are guaranteed to be unique, and numpy documents that flag
    # as valid only when both inputs are unique. np.isin is the documented
    # replacement for the deprecated np.in1d.
    product_inComments = np.isin(products_stm, stem_tokens(words))
    product_inComments = product_inComments.astype(int)
    product_inComments = product_inComments.reshape(1, nm_products)

    # Get classifier features; positions 0..4 are unpacked below as
    # mean, sum, size, min and max.
    features_comment = get_comment_features(cur_comm,known_words)

    nm_mean = features_comment[0]
    nm_sum  = features_comment[1]
    nm_size = features_comment[2]
    nm_min  = features_comment[3]
    nm_max  = features_comment[4]

    # Append classifier features
    vt_comment.append(cur_comm)
    vt_mean.append(nm_mean)
    vt_sum.append(nm_sum)
    vt_size.append(nm_size)
    vt_min.append(nm_min)
    vt_max.append(nm_max)

    # Store comment tokens and the product indicator row
    vt_tokens.append(tokens)
    vt_products[idx, :] = product_inComments[0]



features_df = pd.DataFrame({'comment':vt_comment,'mean':vt_mean,'sum':vt_sum,'size':vt_size,
                            'min':vt_min,'max':vt_max})

comments_df = pd.DataFrame({'comment':vt_comment,'tokens':vt_tokens})
products_df = pd.DataFrame(vt_products, columns = products_words.products)


100% |########################################################################|


In [33]:
# Display the comment/tokens table built in the feature-extraction cell
comments_df

Unnamed: 0,comment,tokens
0,,
1,atender a las reclamaciones a tiempo para evit...,atender reclamaciones tiempo evitar recibir me...
2,"no leen con atención, por favor dictar capacit...","no leen atención , favor dictar capacitaciones..."
3,se debe poder hacer cambios en tiends fisicas....,debe poder hacer cambios tiends fisicas . tamp...
4,,
...,...,...
6291,,
6292,very good all,very good all
6293,,
6294,,


In [34]:
# Score every comment with the loaded classifier and bucket the score into
# 'bad' (<= 1/3), 'neutral' (between the limits) or 'good' (>= 2/3).
nm_maxThres = [0.49494949] #<-- Value from training notebook
vt_scoreLim = [1/3,2/3]
low_cut, high_cut = vt_scoreLim

# Work on a copy of the features with a constant intercept column appended.
comment_score = features_df.assign(Intercept=1.0)
comment_score["size"] = comment_score["size"].astype('float64')

predictor_cols = ['Intercept', "mean", "sum", "max"]
comment_score["score"] = logit_fit.predict(comment_score[predictor_cols])

scores     = comment_score["score"]
vt_good    = scores >= high_cut
vt_bad     = scores <= low_cut
vt_neutral = (scores > low_cut) & (scores < high_cut)

# The three masks are mutually exclusive; rows matching none of them
# (e.g. a NaN score) keep the 'na' label.
comment_score["class"] = np.select(
    [vt_good.values, vt_bad.values, vt_neutral.values],
    ['good', 'bad', 'neutral'],
    default='na',
)

comment_score

Unnamed: 0,comment,mean,sum,size,min,max,Intercept,score,class
0,,0.000000,0.000,0.0,0.00,0.000,1.0,0.448265,neutral
1,atender a las reclamaciones a tiempo para evit...,-0.005952,-0.250,42.0,-1.00,0.625,1.0,0.226545,bad
2,"no leen con atención, por favor dictar capacit...",-0.013889,-0.375,27.0,-0.50,0.375,1.0,0.254529,bad
3,se debe poder hacer cambios en tiends fisicas....,0.025794,1.625,63.0,-0.25,0.375,1.0,0.559106,neutral
4,,0.000000,0.000,0.0,0.00,0.000,1.0,0.448265,neutral
...,...,...,...,...,...,...,...,...,...
6291,,0.000000,0.000,0.0,0.00,0.000,1.0,0.448265,neutral
6292,very good all,-0.083333,-0.500,6.0,-0.50,0.000,1.0,0.132422,bad
6293,,0.000000,0.000,0.0,0.00,0.000,1.0,0.448265,neutral
6294,,0.000000,0.000,0.0,0.00,0.000,1.0,0.448265,neutral


In [35]:
# Attach tokens, score/class and the product indicator columns to the raw
# responses, matching rows on the index at every step.
# NOTE(review): the displayed frame includes the Name and Email columns;
# clear the output before sharing the notebook.
pieces_to_attach = [
    comments_df[['tokens']],
    comment_score[['score','class']],
    products_df,
]

data_comments_scored = data_comments
for piece in pieces_to_attach:
    data_comments_scored = pd.merge(data_comments_scored, piece, left_index = True, right_index=True)

data_comments_scored

Unnamed: 0,Survey Date,Name,User Id,Email,Rating,Classification,Comment,Response Date,text,tokens,...,tendido,tenis,termo,toalla,tobillera,top,tutu,vestido,visera,zapato
0,2020-08-10,Johanna Vargas T,4.008424e+11,vhannyt@gmail.com,7,passive,,2020-08-10 11:38,,,...,0,0,0,0,0,0,0,0,0,0
1,2020-08-10,Maria Carolina Parra Rincón,4.009328e+11,mariacarolinaparrar@gmail.com,1,detractor,Atender a las reclamaciones a tiempo para evit...,2020-08-10 11:59,atender a las reclamaciones a tiempo para evit...,atender reclamaciones tiempo evitar recibir me...,...,0,0,0,0,0,0,0,0,0,0
2,2020-08-10,Luz Marina González Pulido,4.006469e+11,14a793667beb4637bc67b25241ee1150@ct.vtex.com.br,0,detractor,"No leen con atención, por favor dictar capacit...",2020-08-10 12:11,"no leen con atención, por favor dictar capacit...","no leen atención , favor dictar capacitaciones...",...,0,0,0,0,0,0,0,0,0,0
3,2020-08-10,Devolución de producto,4.011302e+11,paolaga@gmail.com,3,detractor,Se debe poder hacer cambios en tiends fisicas....,2020-08-10 12:32,se debe poder hacer cambios en tiends fisicas....,debe poder hacer cambios tiends fisicas . tamp...,...,0,0,0,0,0,0,0,0,0,0
4,2020-08-10,Devolución de producto,4.013074e+11,leonlobozharick@gmail.com,10,promoter,,2020-08-10 12:37,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6291,2020-09-21,LINA,4.038139e+11,lina4313@hotmail.com,10,promoter,,2020-09-23 20:10,,,...,0,0,0,0,0,0,0,0,0,0
6292,2020-09-21,LAURA,4.038407e+11,emilylau916@gmail.com,10,promoter,Very good all,2020-09-24 11:10,very good all,very good all,...,0,0,0,0,0,0,0,0,0,0
6293,2020-09-21,LEIDY,4.038469e+11,leidymartinez-094@hotmail.es,10,promoter,,2020-09-24 14:24,,,...,0,0,0,0,0,0,0,0,0,0
6294,2020-09-21,JULIETH,4.038530e+11,julieth12daniela@gmail.com,10,promoter,,2020-09-24 18:01,,,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Save processed files
save_datapath = os.path.join('data', 'data_clean')
save_file = 'nps_responses_scored'
save_path = os.path.join(save_datapath, save_file + '.csv')

# to_csv does not create missing folders; make sure the target exists so the
# export does not fail after the long feature-extraction run.
os.makedirs(save_datapath, exist_ok=True)

# The row index is written as the first (unnamed) column, and 'utf-8-sig'
# prepends a byte-order mark to the file.
data_comments_scored.to_csv(save_path, encoding='utf-8-sig')