## Apply Sentiment Score to each sentence and Export to .csv

In [1]:
#Load libraries
import numpy as np
import pandas as pd


In [2]:
#load data
'''
Data was obtained via Kaggle (https://www.kaggle.com/datasets/fiodarryzhykau/employee-review)

On Kaggle, the data had been separated into train & test and stored in 2 separate files. 
For this project the files will be merged.  Train, test, validation will be generated within the code
allowing the code to be more dynamically used in the future as additional data may become available. 

'''

kaggle_data_test = pd.read_csv('./data/employee_review_mturk_dataset_test_v6_kaggle.csv')

kaggle_data_train = pd.read_csv('./data/employee_review_mturk_dataset_v10_kaggle.csv')

data = [kaggle_data_test, kaggle_data_train.rename(columns={'adjusted':'updated'})]

df = pd.concat(
    data,
    axis=0,
    join="outer",
    ignore_index=False,
    keys=None,
    levels=None,
    names=None,
    
    )


df

Unnamed: 0,id,person_name,nine_box_category,feedback,updated,reviewed
0,20051,Lacey Howard,"Category 1: 'Risk' (Low performance, Low poten...",Lacey's performance has been sub standard in t...,True,True
1,20057,Amy Jones,"Category 1: 'Risk' (Low performance, Low poten...",Amy struggles at her work a lot. Shes always o...,True,True
2,20058,Amy Jones,"Category 1: 'Risk' (Low performance, Low poten...",Amy Jones is a nice person and she is dedicate...,True,True
3,20059,Amy Jones,"Category 1: 'Risk' (Low performance, Low poten...",Amy Jones needs to become a better player. She...,True,True
4,20060,Amy Jones,"Category 1: 'Risk' (Low performance, Low poten...",Amy is able to focus on the task at hand only ...,True,True
...,...,...,...,...,...,...
873,10205,Bailey Hunt,"Category 9: 'Star' (High performance, High pot...",No one performs like Bailey. I believe she wil...,False,False
874,10226,Thaddeus Burgess,"Category 9: 'Star' (High performance, High pot...",Thaddeus Burgess is a constant force within th...,True,True
875,20022,Max Miller,"Category 9: 'Star' (High performance, High pot...",Max Miller is a a great coworker. He is dili...,True,True
876,20023,Allan Logan,"Category 9: 'Star' (High performance, High pot...","Allan Logan, Excellent performer absolutely bl...",True,True


### Split Nine Box Category into Category, Performance & Potential

In [3]:
#Split the 'nine_box_category' into 3 columns ->  'performance' ,  'potential' and 'category'

#split performance & potential values and clean to only include the Low, Moderate, High text 
df[['performance', 'potential']] = df['nine_box_category'].str.split(',', 1, expand=True)
df['performance'] = df['performance'].str.split('(').str[1].str.split(' ').str[0]
df['potential'] = df['potential'].str.split(')').str[0].str.split(' ').str[1]

#split and clean 'category'
#text before colon
df['category'] = df['nine_box_category'].str.split(':').str[0]
#text after space
df['category'] = df['category'].str.split(' ').str[1]

df['peformance_potential'] =  df['category'] + ': ' + df['performance'] + ' : '+  df['potential']

#Drop unnecessary columns 
#df.drop(['updated','reviewed','nine_box_category'], axis=1, inplace=True)
df.drop(['nine_box_category'], axis=1, inplace=True)

df


Unnamed: 0,id,person_name,feedback,updated,reviewed,performance,potential,category,peformance_potential
0,20051,Lacey Howard,Lacey's performance has been sub standard in t...,True,True,Low,Low,1,1: Low : Low
1,20057,Amy Jones,Amy struggles at her work a lot. Shes always o...,True,True,Low,Low,1,1: Low : Low
2,20058,Amy Jones,Amy Jones is a nice person and she is dedicate...,True,True,Low,Low,1,1: Low : Low
3,20059,Amy Jones,Amy Jones needs to become a better player. She...,True,True,Low,Low,1,1: Low : Low
4,20060,Amy Jones,Amy is able to focus on the task at hand only ...,True,True,Low,Low,1,1: Low : Low
...,...,...,...,...,...,...,...,...,...
873,10205,Bailey Hunt,No one performs like Bailey. I believe she wil...,False,False,High,High,9,9: High : High
874,10226,Thaddeus Burgess,Thaddeus Burgess is a constant force within th...,True,True,High,High,9,9: High : High
875,20022,Max Miller,Max Miller is a a great coworker. He is dili...,True,True,High,High,9,9: High : High
876,20023,Allan Logan,"Allan Logan, Excellent performer absolutely bl...",True,True,High,High,9,9: High : High


In [4]:
#Examine data types and check for null values
print(df.dtypes)
df.isna().sum() 
#No na values found

id                       int64
person_name             object
feedback                object
updated                   bool
reviewed                  bool
performance             object
potential               object
category                object
peformance_potential    object
dtype: object


id                      0
person_name             0
feedback                0
updated                 0
reviewed                0
performance             0
potential               0
category                0
peformance_potential    0
dtype: int64

In [5]:
#Verify unique values - should be Low, Medium or High only
print('Performance values:')
print(df.performance.unique())

print('Potential values:')
print(df.potential.unique())

print('Category values:')
print(df.category.unique())

Performance values:
['Low' 'Moderate' 'High']
Potential values:
['Low' 'Moderate' 'High']
Category values:
['1' '2' '3' '4' '5' '6' '7' '8' '9']


### Split feedback into single sentences

In [6]:
from nltk.tokenize import sent_tokenize

#tokenize feedback into sentences
#feedback_sent contains a list of sentences
df['feedback_sent'] = df['feedback'].apply(sent_tokenize) 

#split feedback_sent into multiple rows by sentence
df_sent = df.explode('feedback_sent')
#df_sent = df


#remove unnecessary columns
df_sent.drop(['feedback'], axis=1, inplace=True)

df_sent


Unnamed: 0,id,person_name,updated,reviewed,performance,potential,category,peformance_potential,feedback_sent
0,20051,Lacey Howard,True,True,Low,Low,1,1: Low : Low,Lacey's performance has been sub standard in t...
0,20051,Lacey Howard,True,True,Low,Low,1,1: Low : Low,She tries to take up too many tasks at the sam...
0,20051,Lacey Howard,True,True,Low,Low,1,1: Low : Low,"Although she has delivered some results, they ..."
0,20051,Lacey Howard,True,True,Low,Low,1,1: Low : Low,Unless she improves a lot by taking advantage ...
1,20057,Amy Jones,True,True,Low,Low,1,1: Low : Low,Amy struggles at her work a lot.
...,...,...,...,...,...,...,...,...,...
876,20023,Allan Logan,True,True,High,High,9,9: High : High,"Never before have I seen so much talent, I onl..."
877,20024,Allan Logan,True,True,High,High,9,9: High : High,Allan Logan touches anything and it turns to g...
877,20024,Allan Logan,True,True,High,High,9,9: High : High,Nothing but the most capable hand in the team.
877,20024,Allan Logan,True,True,High,High,9,9: High : High,He has shown he can achieve what many would dr...


### Apply Flair Sentiment Scoring & Word Count

In [7]:
from flair.models import TextClassifier
from flair.data import Sentence
import textstat

classifier = TextClassifier.load('en-sentiment')

value = list()
score = list()
word_count = list()


#Flair provides a value of POSTIVE or NEGATIVE this will be added to the data frame as 'polarity'
#Flair also returns a confidence score which will be stored as 'confidence'
#Word Count is calculated and stored in the dataframe. 


for feedback_sent in df_sent['feedback_sent']:
    sent = Sentence(feedback_sent)
    classifier.predict(sent)
    score.append(sent.labels[0].score)
    
    if sent.labels[0].value == 'NEGATIVE':
        sent.labels[0].score == 0 - sent.labels[0].score
    else:
        sent.labels[0].score == sent.labels[0].score
        
        
    value.append(sent.labels[0].value)
    word_count.append(len(feedback_sent.split()))
    
    #print('Sentence polarity: ', sent.labels)
    

df_sent['polarity'] = value
df_sent['confidence'] = score
df_sent['word_count'] = word_count


    
   


2022-06-06 08:07:17,493 loading file C:\Users\E073462\.flair\models\sentiment-en-mix-distillbert_4.pt


### Write new dataframe to .csv

In [8]:
df_sent.to_csv('./data/feedback.csv' ,index=False)

In [9]:
df_sent

Unnamed: 0,id,person_name,updated,reviewed,performance,potential,category,peformance_potential,feedback_sent,polarity,confidence,word_count
0,20051,Lacey Howard,True,True,Low,Low,1,1: Low : Low,Lacey's performance has been sub standard in t...,NEGATIVE,0.758257,11
0,20051,Lacey Howard,True,True,Low,Low,1,1: Low : Low,She tries to take up too many tasks at the sam...,NEGATIVE,0.997663,24
0,20051,Lacey Howard,True,True,Low,Low,1,1: Low : Low,"Although she has delivered some results, they ...",NEGATIVE,0.999925,23
0,20051,Lacey Howard,True,True,Low,Low,1,1: Low : Low,Unless she improves a lot by taking advantage ...,NEGATIVE,0.960807,28
1,20057,Amy Jones,True,True,Low,Low,1,1: Low : Low,Amy struggles at her work a lot.,NEGATIVE,0.995157,7
...,...,...,...,...,...,...,...,...,...,...,...,...
876,20023,Allan Logan,True,True,High,High,9,9: High : High,"Never before have I seen so much talent, I onl...",POSITIVE,0.998588,15
877,20024,Allan Logan,True,True,High,High,9,9: High : High,Allan Logan touches anything and it turns to g...,POSITIVE,0.822073,9
877,20024,Allan Logan,True,True,High,High,9,9: High : High,Nothing but the most capable hand in the team.,POSITIVE,0.999360,9
877,20024,Allan Logan,True,True,High,High,9,9: High : High,He has shown he can achieve what many would dr...,POSITIVE,0.999163,11
