In [1]:
import pandas as pd

# Adjective table; the preview below shows its word, frequency and sentiment columns.
adj_en = pd.read_csv("datasets/adjs_en.csv")

# Main dataset. Missing cells are replaced with empty strings so that the text
# columns can be stripped and split later without NaN checks.
df = pd.read_csv("datasets/dataset_oficial_adj_nouns.csv").fillna('')

adj_en.head()

Unnamed: 0,word,frequency,sentiment
0,online,905,0.0
1,many,332,0.0
2,good,319,5.0
3,n,207,0.0
4,clear,200,0.0


In [2]:
## Correct manual input mistakes
# A row whose free-text answer is empty (or whitespace only) has its manual
# sentiment label forced to 3, whatever was typed in by hand.
# Boolean masks replace the positional row loop: there is no per-row Python
# overhead and no reliance on the index being the default 0..n-1 range.
for text_col in ("missing_info", "improvements"):
    is_blank = df[text_col].str.strip().str.len() == 0
    df.loc[is_blank, text_col + "_sentiment"] = 3

In [3]:
# Lookup from each adjective to its sentiment value, built from the two
# columns of the adjective table.
adj_en_sentiment = {
    word: score
    for word, score in zip(adj_en["word"], adj_en["sentiment"])
}

In [4]:
def calculate_sentiment(adjectives, lexicon=None):
    """Sum the sentiment values of the known adjectives in a string.

    Parameters
    ----------
    adjectives : str
        Whitespace-separated adjectives. Tokens are matched exactly
        (case-sensitive); tokens absent from the lexicon contribute nothing.
    lexicon : mapping, optional
        Maps a word to its sentiment value. When omitted, the notebook-level
        ``adj_en_sentiment`` dictionary is used, so existing one-argument
        calls behave as before.

    Returns
    -------
    int or float
        Sum of the values of the matched tokens; ``0`` when the string is
        empty or no token is found in the lexicon.
    """
    if lexicon is None:
        # Resolved at call time so the function follows the current global.
        lexicon = adj_en_sentiment
    return sum(lexicon.get(adj, 0) for adj in adjectives.split())

In [5]:
# Score the adjectives of every answer. A blank adjective string gets the
# sentinel 1000 instead of a score; classify_sentiment maps that value to 3.
df.loc[:, "missing_info_en_adj_score"] = df["missing_info_en_adj"].apply(
    lambda text: 1000 if not text.strip() else calculate_sentiment(text)
)
df.loc[:, "improvements_en_adj_score"] = df["improvements_en_adj"].apply(
    lambda text: 1000 if not text.strip() else calculate_sentiment(text)
)

In [6]:
def classify_sentiment(n_del, p_del, sent_score):
    """Map a sentiment score to a class code using two delimiters.

    Returns 3 when ``sent_score`` equals the sentinel 1000, 0 (negative) when
    it is below ``n_del``, 2 (positive) when it is above ``p_del``, and
    1 (neutral) otherwise, which includes scores equal to either delimiter.
    """
    # The sentinel is checked first so it is never compared to the delimiters.
    if sent_score == 1000:
        return 3
    if sent_score < n_del:
        return 0
    return 2 if sent_score > p_del else 1

In [7]:
from itertools import combinations

# Candidate delimiter values: every integer from -10 to 9, skipping 0.
# NOTE(review): the range stops at 9, so the candidates are not symmetric
# around zero; confirm that this is intended.
digits = [value for value in range(-10, 10) if value != 0]

# All possible combinations for delimiters: each pair (low, high) with low < high.
delimiters = list(combinations(digits, 2))
delimiters

[(-10, -9),
 (-10, -8),
 (-10, -7),
 (-10, -6),
 (-10, -5),
 (-10, -4),
 (-10, -3),
 (-10, -2),
 (-10, -1),
 (-10, 1),
 (-10, 2),
 (-10, 3),
 (-10, 4),
 (-10, 5),
 (-10, 6),
 (-10, 7),
 (-10, 8),
 (-10, 9),
 (-9, -8),
 (-9, -7),
 (-9, -6),
 (-9, -5),
 (-9, -4),
 (-9, -3),
 (-9, -2),
 (-9, -1),
 (-9, 1),
 (-9, 2),
 (-9, 3),
 (-9, 4),
 (-9, 5),
 (-9, 6),
 (-9, 7),
 (-9, 8),
 (-9, 9),
 (-8, -7),
 (-8, -6),
 (-8, -5),
 (-8, -4),
 (-8, -3),
 (-8, -2),
 (-8, -1),
 (-8, 1),
 (-8, 2),
 (-8, 3),
 (-8, 4),
 (-8, 5),
 (-8, 6),
 (-8, 7),
 (-8, 8),
 (-8, 9),
 (-7, -6),
 (-7, -5),
 (-7, -4),
 (-7, -3),
 (-7, -2),
 (-7, -1),
 (-7, 1),
 (-7, 2),
 (-7, 3),
 (-7, 4),
 (-7, 5),
 (-7, 6),
 (-7, 7),
 (-7, 8),
 (-7, 9),
 (-6, -5),
 (-6, -4),
 (-6, -3),
 (-6, -2),
 (-6, -1),
 (-6, 1),
 (-6, 2),
 (-6, 3),
 (-6, 4),
 (-6, 5),
 (-6, 6),
 (-6, 7),
 (-6, 8),
 (-6, 9),
 (-5, -4),
 (-5, -3),
 (-5, -2),
 (-5, -1),
 (-5, 1),
 (-5, 2),
 (-5, 3),
 (-5, 4),
 (-5, 5),
 (-5, 6),
 (-5, 7),
 (-5, 8),
 (-5, 9),
 (-4, -3),
 (

In [8]:
# Select the first delimiter pair from the grid; it is unpacked later as the
# (n_del, p_del) arguments of classify_sentiment.
delimiters_comb = delimiters[0]
delimiters_comb

(-10, -9)

In [9]:
# List the columns now present, including the two *_en_adj_score columns added above.
df.columns

Index(['poll_id', 'token', 'institution', 'is_info_useful', 'is_info_missing',
       'missing_info', 'improvements', 'date', 'missing_info_sentiment',
       'improvements_sentiment', 'missing_info_en', 'improvements_en',
       'missing_info_adj', 'improvements_adj', 'missing_info_noun_adj',
       'improvements_noun_adj', 'missing_info_en_adj', 'improvements_en_adj',
       'missing_info_en_noun_adj', 'improvements_en_noun_adj',
       'missing_info_en_adj_score', 'improvements_en_adj_score'],
      dtype='object')

In [10]:
# Turn each adjective score into a class code using the selected delimiter pair.
df.loc[:, "missing_info_en_adj_sentiment"] = df["missing_info_en_adj_score"].apply(
    lambda score: classify_sentiment(*delimiters_comb, score)
)
df.loc[:, "improvements_info_en_adj_sentiment"] = df["improvements_en_adj_score"].apply(
    lambda score: classify_sentiment(*delimiters_comb, score)
)

# Manual labels side by side with the labels derived from the adjective scores.
df[
    [
        'missing_info_sentiment',
        'improvements_sentiment',
        'missing_info_en_adj_sentiment',
        'improvements_info_en_adj_sentiment',
    ]
].head(15)

Unnamed: 0,missing_info_sentiment,improvements_sentiment,missing_info_en_adj_sentiment,improvements_info_en_adj_sentiment
0,3.0,1,3,3
1,1.0,1,2,2
2,0.0,1,2,3
3,3.0,2,3,2
4,0.0,1,3,2
5,3.0,1,3,3
6,1.0,0,2,2
7,1.0,1,3,2
8,3.0,0,3,2
9,1.0,1,3,3


In [12]:
# Boolean masks: True on rows where the manual label equals the derived label.
missing_info_acc = df["missing_info_sentiment"] == df["missing_info_en_adj_sentiment"]
# This mask must compare the improvements columns; repeating the missing_info
# comparison here would make both masks identical.
improvements_acc = df["improvements_sentiment"] == df["improvements_info_en_adj_sentiment"]
# Number of rows where the missing_info labels agree.
df.loc[missing_info_acc].shape[0]

775

In [14]:
# Count the rows where the manual and derived missing_info labels agree.
df["missing_info_sentiment"].eq(df["missing_info_en_adj_sentiment"]).sum()

775