In [114]:
#pip install progressbar

In [22]:
# Load packages
import os
import pickle
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import plotly.express as px

from progressbar import ProgressBar

In [23]:
from offcorss_functions.data_exploring_functions import *
from offcorss_functions.classification_functions import *

In [24]:
# Load the product vocabulary (its `products` column is matched against the
# comments further down). The CSV carries a leftover 'Unnamed: 0' index column,
# which is dropped here.
products_file  = 'offcorss_products.csv'
products_words = pd.read_csv(products_file).drop(columns = 'Unnamed: 0')

# Path of the pickled classifier that is loaded into `logit_fit` below.
classifier_file ="./classifier.pickle"

# Load the lexicon that is handed to get_comment_features().
known_words = pd.read_csv('./offcorss_functions/OFFCORSS_lexicon.csv')

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load a classifier pickle that comes from a trusted source.
# The context manager guarantees the handle is closed even if unpickling raises.
with open(classifier_file, 'rb') as f:
    logit_fit = pickle.load(f)

In [44]:
# File names and base path.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
base_datapath = 'C:\\Users\\sapmn3\\Database\\OFFCORSS\\data\\OFFCORSS_instagram'
file_list = ['instagram_responses',
            ]

# Read every listed CSV and collect the frames, so they are concatenated once.
# (Previously `data_nps` was never filled, so only the last file was kept and
# the concat branch could never run correctly.)
data_nps = []
for cur_file in file_list:
    cur_path = os.path.join(base_datapath, cur_file + '.csv')
    cur_data = pd.read_csv(cur_path)
    # Timestamps that do not match the format become NaT (errors='coerce');
    # only the calendar date is kept.
    cur_data['time'] = pd.to_datetime(cur_data['time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    cur_data['time'] = cur_data['time'].dt.date
    data_nps.append(cur_data)

# ignore_index=True gives the combined frame a clean 0..n-1 index, so rows
# coming from different files never share an index label.
data_comments = pd.concat(data_nps, axis=0, ignore_index=True)

# Drop the leftover index column and lower-case the comment text.
data_comments = data_comments.drop(columns = 'Unnamed: 0')
data_comments['text'] = data_comments['text'].str.lower()

In [48]:
# Work on a positionally indexed copy (0..n-1) so that row i of every frame
# built in this cell refers to the same comment.
data_responses = data_comments.reset_index(drop = True)

vt_comment = []
vt_tokens  = []
product_rows = []   # one (1, n_products) 0/1 row per comment; stacked once after the loop

vt_mean    = []
vt_sum     = []
vt_size    = []
vt_min     = []
vt_max     = []
vt_sentim  = []     # not filled in this cell; kept because other cells may read it


# Stem the product names once; each comment's stemmed words are matched against them.
products_stm = stem_tokens(products_words.products)
nm_lenData   = len(data_responses)

pbar = ProgressBar()

for idx in pbar(range(0,nm_lenData)):

    # Get comment
    cur_comm = data_responses.at[idx, 'text']

    # Get tokenized comment
    tokens, words = get_comment_tokens(cur_comm)

    # Flag (1/0) every product whose stem occurs among the comment's stemmed words.
    # `assume_unique` is deliberately not set: neither the stemmed product names
    # nor the words of a comment are guaranteed to be unique, and NumPy only
    # guarantees correct membership results with that flag when both inputs are.
    product_inComments = np.isin(products_stm, stem_tokens(words))
    product_inComments = product_inComments.astype(int)
    product_inComments = product_inComments.reshape(1, len(products_words))

    # Get classifier features; positions 0..4 are stored as mean, sum, size, min, max.
    features_comment = get_comment_features(cur_comm,known_words)

    # Append classifier features
    vt_comment.append(cur_comm)
    vt_mean.append(features_comment[0])
    vt_sum.append(features_comment[1])
    vt_size.append(features_comment[2])
    vt_min.append(features_comment[3])
    vt_max.append(features_comment[4])

    # Append comment tokens and product flags
    vt_tokens.append(tokens)
    product_rows.append(product_inComments)


# Stack all product rows at once (appending to the array inside the loop
# re-copies the whole matrix on every iteration).
if product_rows:
    vt_products = np.concatenate(product_rows, axis=0)
else:
    vt_products = np.empty((0,len(products_words)), int)

features_df = pd.DataFrame({'comment':vt_comment,'mean':vt_mean,'sum':vt_sum,'size':vt_size,
                            'min':vt_min,'max':vt_max})

comments_df = pd.DataFrame({'comment':vt_comment,'tokens':vt_tokens})
products_df = pd.DataFrame(vt_products, columns = products_words.products)



100% |########################################################################|


In [105]:
# Value copied from the training notebook; it is not referenced again in this cell.
nm_maxThres = [0.49494949] #<-- Value from training notebook
# Score cut-offs: score <= 1/3 -> 'bad', score >= 2/3 -> 'good',
# strictly in between -> 'neutral'.
vt_scoreLim = [1/3,2/3]
low_cut, high_cut = vt_scoreLim

# Copy of the feature table with the constant column used as model input
# and `size` stored as float64.
comment_score = features_df.assign(Intercept=1.0).astype({"size": 'float64'})

# Score every comment with the loaded classifier.
model_inputs = comment_score[['Intercept',"mean", "sum", "max"]]
comment_score["score"] = logit_fit.predict(model_inputs)

# Boolean masks for the three classes.
scores      = comment_score["score"]
vt_good     = scores >= high_cut
vt_bad      = scores <= low_cut
vt_neutral  = (scores > low_cut) & (scores < high_cut)

# Rows matched by none of the masks keep the placeholder label 'na'.
comment_score["class"] = 'na'
for label, mask in (('good', vt_good), ('bad', vt_bad), ('neutral', vt_neutral)):
    comment_score.loc[mask.values, 'class'] = label

comment_score

Unnamed: 0,comment,mean,sum,size,min,max,Intercept,score,class
0,as√≠ es un d√≠a para disfrutar sana mente y con...,0.066667,1.000,15.0,-0.125,0.625,1.0,0.614808,neutral
1,buenos d√≠as que precio tienen?,0.097222,0.875,9.0,-0.125,1.000,1.0,0.619522,neutral
2,üéä,0.000000,0.000,1.0,0.000,0.000,1.0,0.448265,neutral
3,hermosa mi sheshe,0.333333,1.000,3.0,0.000,1.000,1.0,0.992207,good
4,cuanto cuesta el vestido de manga larga talla 14,-0.020833,-0.375,18.0,-0.500,0.250,1.0,0.262286,bad
...,...,...,...,...,...,...,...,...,...
33483,@tina.benjumea üåü todos los tapabocas tienen un...,0.031250,1.875,60.0,0.000,0.500,1.0,0.560458,neutral
33484,@anil_rizo üíõ todos los tapabocas tienen un val...,0.022917,1.375,60.0,0.000,0.500,1.0,0.486015,neutral
33485,üíõüíõ @paopao.martinezfigueroa todos los tapaboc...,0.022917,1.375,60.0,0.000,0.500,1.0,0.486015,neutral
33486,@contreraszarateyamileth ‚òÄÔ∏è‚òÄÔ∏è todos los tapabo...,0.022917,1.375,60.0,0.000,0.500,1.0,0.486015,neutral


In [112]:
# Attach tokens, score/class and product flags to every comment.
# `comments_df`, `comment_score` and `products_df` were all built row by row
# from `data_responses` and therefore carry its 0..n-1 index. Using
# `data_responses` as the left frame (instead of `data_comments`, whose index is
# whatever the loading cell produced) guarantees the index-based merges line up.
data_comments_scored = pd.merge(data_responses, comments_df[['tokens']], left_index = True, right_index=True)
data_comments_scored = pd.merge(data_comments_scored, comment_score[['score','class']], left_index = True, right_index=True)
data_comments_scored = pd.merge(data_comments_scored, products_df, left_index = True, right_index=True)

# Inner merges drop unmatched rows silently; make any loss of comments loud.
assert len(data_comments_scored) == len(data_responses), "rows were lost while merging scored data"

data_comments_scored

Unnamed: 0,rootPost_id,parentPost_id,reponsePost_id,brand_username,text,time,likes,username,tokens,score,...,tendido,tenis,termo,toalla,tobillera,top,tutu,vestido,visera,zapato
0,1947257691818809835,1.947258e+18,1.794235e+16,offcorss,as√≠ es un d√≠a para disfrutar sana mente y con...,2019-01-01,1,rosaliafaneite,as√≠ d√≠a disfrutar sana mente exito,0.614808,...,0,0,0,0,0,0,0,0,0,0
1,1947952244502703605,1.947952e+18,1.801764e+16,offcorss,buenos d√≠as que precio tienen?,2019-01-02,1,aleja.vasquez911,buenos d√≠as precio ?,0.619522,...,0,0,0,0,0,0,0,0,0,0
2,1948073194162672935,1.948073e+18,1.791861e+16,offcorss,üéä,2019-01-02,1,pactrii188,üéä,0.448265,...,0,0,0,0,0,0,0,0,0,0
3,1948284380053077836,1.948284e+18,1.798609e+16,offcorss,hermosa mi sheshe,2019-01-03,1,dalekeyboutique,hermosa sheshe,0.992207,...,0,0,0,0,0,0,0,0,0,0
4,1948284380053077836,1.948284e+18,1.790035e+16,offcorss,cuanto cuesta el vestido de manga larga talla 14,2019-01-03,3,delgadoa953,cuanto cuesta vestido manga larga talla 14,0.262286,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33483,2404696945357653802,1.791184e+16,1.793289e+16,politokids,@tina.benjumea üåü todos los tapabocas tienen un...,2020-09-23,0,politokids,"@ tina.benjumea üåü tapabocas valor $ 10.900 , p...",0.560458,...,0,0,0,0,0,0,0,0,0,0
33484,2404696945357653802,1.787722e+16,1.796018e+16,politokids,@anil_rizo üíõ todos los tapabocas tienen un val...,2020-09-23,0,politokids,"@ anil_rizo üíõ tapabocas valor $ 10.900 , puede...",0.486015,...,0,0,0,0,0,0,0,0,0,0
33485,2404696945357653802,1.790429e+16,1.786213e+16,politokids,üíõüíõ @paopao.martinezfigueroa todos los tapaboc...,2020-09-23,0,politokids,üíõüíõ @ paopao.martinezfigueroa tapabocas valor $...,0.486015,...,0,0,0,0,0,0,0,0,0,0
33486,2404696945357653802,1.785560e+16,1.788603e+16,politokids,@contreraszarateyamileth ‚òÄÔ∏è‚òÄÔ∏è todos los tapabo...,2020-09-23,0,politokids,@ contreraszarateyamileth ‚òÄÔ∏è‚òÄÔ∏è tapabocas valor...,0.486015,...,0,0,0,0,0,0,0,0,0,0


In [113]:
# Write the scored comments to disk as CSV.
# NOTE(review): absolute, machine-specific output folder.
save_datapath = 'C:\\Users\\sapmn3\\Database\\OFFCORSS\\data\\data_clean'
save_file = 'instagram_responses_scored'

# Full target path: <save_datapath>/<save_file>.csv
save_path = os.path.join(
    save_datapath,
    save_file + '.csv',
)

# The DataFrame index is written too (pandas default), encoded as 'utf-8-sig'.
data_comments_scored.to_csv(
    save_path,
    encoding='utf-8-sig',
)