# Linking Postings to Recognized Emotions [Script C]

*This script links emotions recognized in company reviews to job postings made by the same company within the same time range. We use time ranges of six months because companies tend to hold semi-annual reviews. At the end of this script, we take the top three emotions for each company and time range and drop all information related to the company and the time range. The result is the dataset that will be used to train our multi-label classification model in the next script.*

## Package Installations, Imports & Setup



In [None]:
import pandas as pd
import numpy as np
import re

import io
from google.colab import files

## Data Import & Processing

In [None]:
# Load the review sentiments produced by Script B and discard fully duplicated rows
reviews = pd.read_csv("review_sentiments.csv").drop_duplicates()

In [None]:
# Reduce a list of sentiment records to a plain list of sentiment names
def flatten_sentiments(sent_list):
  """Return the 'sentiment' value of every entry in sent_list, in order.

  Each entry is read with .get('sentiment'), so an entry without that key
  contributes None instead of raising.
  """
  return [entry.get('sentiment') for entry in sent_list]

In [None]:
# Add column holding the flattened sentiment names of each review
# NOTE(review): eval() executes whatever expression is stored in the CSV cell, so this
# is only acceptable while review_sentiments.csv is a trusted artifact of Script B.
parsed_sentiments = reviews['sentiments'].apply(lambda raw: eval(raw))
reviews['sent_list'] = parsed_sentiments.apply(flatten_sentiments)

In [None]:
# Turn dataframe into multilabel dataset, with one 0/1 indicator column per emotion
SENTIMENT_LABELS = [
  'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
  'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
  'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
  'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral',
]

for label in SENTIMENT_LABELS:
  # The label is bound as a default argument so the lambda never depends on the
  # current value of the loop variable.
  reviews[label] = reviews['sent_list'].apply(lambda found, label=label: label in found).astype(int)

# Keep only the identifying columns plus the indicators; copy so later edits leave reviews untouched
KEEP_COLUMNS = ['company', 'date'] + SENTIMENT_LABELS
review_sentiments = reviews[KEEP_COLUMNS].copy()

# Reformat date column to be of date type. The whole column is parsed in one
# vectorized call instead of one pd.to_datetime call per row; the format string
# (including its trailing space) is unchanged.
review_sentiments['date'] = pd.to_datetime(review_sentiments['date'], format='%b %d, %Y ')
review_sentiments.sort_values(by='date', inplace=True)
review_sentiments.reset_index(drop=True, inplace=True)

In [None]:
# Define six-month date ranges spanning every review date
first_review_date = review_sentiments['date'].min()
last_review_date = review_sentiments['date'].max()
# NOTE(review): newer pandas releases appear to deprecate the 'M' alias in favour of 'ME' - confirm
# against the pandas version in use before changing the frequency string.
date_rng = pd.date_range(start=first_review_date, end=last_review_date, freq='6M')
# Close the range with the latest review date so reviews after the last generated
# boundary still fall into a period. Inserting at len(date_rng) rather than at a
# fixed position keeps the boundaries sorted whatever time span the data covers.
date_rng = date_rng.insert(len(date_rng), last_review_date)

In [None]:
# Method takes in date and returns the 1-based time period it is in
def find_period(curr_date, boundaries=None):
  """Return the 1-based position of the first boundary that is >= curr_date.

  Args:
    curr_date: value to place; it must be comparable with the boundaries.
    boundaries: period end points, expected in ascending order. When omitted,
      the module-level date_rng is used, so one-argument calls keep working.

  Returns:
    The period number (1 for the first boundary), or None when curr_date
    lies after every boundary.
  """
  if boundaries is None:
    boundaries = date_rng
  # Walk every boundary instead of a fixed count so the lookup works for any
  # number of periods.
  for period, boundary in enumerate(boundaries, start=1):
    if curr_date <= boundary:
      return period
  return None

In [None]:
# Label every review with the period its date falls into, then discard the raw date
review_sentiments['period'] = review_sentiments['date'].apply(find_period)
review_sentiments = review_sentiments.drop(columns=['date'])

In [None]:
# Add up the emotion indicators for every (company, period) pair
group_keys = ['company', 'period']
company_sentiments = review_sentiments.groupby(group_keys, as_index=False).sum()

## Matching Postings and Emotions

In [None]:
# Method returns dataframe with top 3 emotions per row

def get_top_3_exclude_0(df):
  """Return the three highest-scoring emotion columns for every row of df.

  Args:
    df: frame with a 'listing' column; every other column is treated as a
      numeric emotion score. NaN scores are never ranked, so callers that
      want zero counts excluded must replace them with NaN beforehand.

  Returns:
    A new frame indexed 0..len(df)-1 with the columns 'listing',
    '1st Sent', '2nd Sent' and '3rd Sent'. Each ranked cell holds the name
    of an emotion column, or NaN when the row has fewer non-NaN scores than
    that rank. Equal scores are ranked in column order.
  """
  rank_columns = ['1st Sent', '2nd Sent', '3rd Sent']
  scores = df.drop(columns=['listing'])
  labels = scores.columns.to_numpy(dtype=object)
  values = scores.to_numpy(dtype=float, na_value=np.nan)
  n_rows = len(df)
  row_positions = np.arange(n_rows)

  # Sort each row by descending score. NaN sorts last, and the stable sort keeps
  # tied scores in column order, which is the order repeated idxmax calls would give.
  ranking = np.argsort(-values, axis=1, kind='stable')

  result = {'listing': df['listing'].to_numpy()}
  for rank, column in enumerate(rank_columns):
    if rank < ranking.shape[1]:
      picked = ranking[:, rank]
      # A NaN score at this rank means the row has run out of usable emotions
      missing = np.isnan(values[row_positions, picked])
      result[column] = np.where(missing, np.nan, labels[picked])
    else:
      # Fewer emotion columns than ranks: nothing to report for this rank
      result[column] = np.full(n_rows, np.nan, dtype=object)

  # Work purely by position so the input frame does not need a 0..n-1 index
  return pd.DataFrame(result, index=row_positions, columns=['listing'] + rank_columns)

In [None]:
# Load the job postings that will be matched with the emotions
postings = pd.read_csv(filepath_or_buffer="postings.csv")

In [None]:
# Attach each company's per-period emotion counts to its postings
posting_sentiments = postings.merge(
    company_sentiments,
    how='left',
    left_on=["companyName", "period"],
    right_on=["company", "period"],
)
# Rows whose company is missing after the left merge are dropped
posting_sentiments.dropna(subset=["company"], inplace=True)
# Replace zero counts (string '0' or numeric 0) with NaN
zero_markers = ['0', 0]
posting_sentiments[SENTIMENT_LABELS] = posting_sentiments[SENTIMENT_LABELS].replace(zero_markers, np.nan)

In [None]:
# Reduce to the posting text plus the emotion labels, renumber the rows,
# and expose the posting text under the name "listing"
KEEP_COLUMNS = ['listing_jobDesc'] + SENTIMENT_LABELS
posting_sentiments = (
    posting_sentiments[KEEP_COLUMNS]
    .reset_index(drop=True)
    .rename(columns={"listing_jobDesc": "listing"})
)

In [None]:
# Rank the emotions of every posting and keep the three strongest
top_sentiments = get_top_3_exclude_0(df=posting_sentiments)

In [None]:
# Gather the three ranked emotion columns of each row into one list-valued "tags" column
top_sentiments['tags'] = top_sentiments.apply(
    lambda ranked: [ranked[column] for column in ('1st Sent', '2nd Sent', '3rd Sent')],
    axis=1,
)

In [None]:
def isNaN(string):
    """Return the result of comparing the argument as unequal to itself.

    For ordinary values such as strings and numbers this is False; a float
    NaN is unequal to itself, so it yields True.
    """
    differs_from_itself = string != string
    return differs_from_itself

In [None]:
# Method to remove all nan values
def remove_if_present(my_list):
  """Strip every NaN entry from my_list in place and return that same list.

  The surviving items are collected first and then written back through a
  slice assignment. Removing items while looping over the list would skip
  the element that follows each removal, leaving consecutive NaNs behind.

  Args:
    my_list: list to clean; it is modified in place.

  Returns:
    my_list itself, now without NaN entries and with the original order
    of the remaining items.
  """
  # A value counts as NaN exactly when it compares unequal to itself
  my_list[:] = [item for item in my_list if not (item != item)]
  return my_list

In [None]:
# Strip the nan placeholders from every tag list
top_sentiments['tags'] = top_sentiments['tags'].apply(remove_if_present)

In [None]:
# Write the dataframe to CSV for the next script and pass the file to files.download
export_path = 'top_sentiments.csv'
top_sentiments.to_csv(export_path)
files.download(export_path)