In [1]:
import pandas as pd

adj_en = pd.read_csv("datasets/adjs_en.csv")
df = pd.read_csv("datasets/dataset_oficial_adj_nouns.csv")
df.fillna('', inplace=True)
adj_en.head()

Unnamed: 0,word,frequency,sentiment
0,online,905,0.0
1,many,332,0.0
2,good,319,5.0
3,n,207,0.0
4,clear,200,0.0


In [2]:
## Correct manual input mistakes
for i in range(df.shape[0]):
    if len((df.loc[i, "missing_info"]).strip()) == 0:
        df.loc[i, "missing_info_sentiment"] = 3
    if len((df.loc[i, "improvements"]).strip()) == 0:
        df.loc[i, "improvements_sentiment"] = 3
df["missing_info_sentiment"].value_counts()

0.0    1425
1.0     914
3.0     755
2.0      10
Name: missing_info_sentiment, dtype: int64

In [3]:
adj_en_sentiment = dict(zip(adj_en["word"], adj_en["sentiment"]))

In [4]:
def calculate_sentiment(adjectives):
    sentiment_score = 0
    for adj in adjectives.split():
        if adj in adj_en_sentiment.keys():
            sentiment_score += adj_en_sentiment[adj]
    return sentiment_score

In [5]:
df.loc[:, "missing_info_en_adj_score"] = df["missing_info_en_adj"].apply(lambda x: calculate_sentiment(x))
df.loc[:, "improvements_en_adj_score"] = df["improvements_en_adj"].apply(lambda x: calculate_sentiment(x))

In [6]:
def classify_sentiment(n_del, p_del, sent_score):
    """Function that classifies a sentiment score into one of three categories: 0 (negative), 1 (neutral), 2 (positive)"""
    if sent_score == 1000:
        return 3
    elif sent_score < n_del:
        return 0
    elif sent_score > p_del:
        return 2
    else:
        return 1

In [7]:
from itertools import combinations

# All possible combinations for delimiters
digits = list(range(-10, 10, 1))
digits.remove(0)

delimiters = list(combinations(digits, 2))

In [8]:
delimiters_comb = delimiters[90]
delimiters_comb

(-5, 7)

In [9]:
df.columns

Index(['poll_id', 'token', 'institution', 'is_info_useful', 'is_info_missing',
       'missing_info', 'improvements', 'date', 'missing_info_sentiment',
       'improvements_sentiment', 'missing_info_en', 'improvements_en',
       'missing_info_adj', 'improvements_adj', 'missing_info_noun_adj',
       'improvements_noun_adj', 'missing_info_en_adj', 'improvements_en_adj',
       'missing_info_en_noun_adj', 'improvements_en_noun_adj',
       'missing_info_en_adj_score', 'improvements_en_adj_score'],
      dtype='object')

In [10]:
df.loc[:, "missing_info_en_adj_sentiment"] = df["missing_info_en_adj_score"].apply(lambda x: classify_sentiment(*delimiters_comb, x))
df.loc[:, "improvements_en_adj_sentiment"] = df["improvements_en_adj_score"].apply(lambda x: classify_sentiment(*delimiters_comb, x))
df[['missing_info_sentiment', 'improvements_sentiment', 'missing_info_en_adj_sentiment', 'improvements_en_adj_sentiment']].head(15)

Unnamed: 0,missing_info_sentiment,improvements_sentiment,missing_info_en_adj_sentiment,improvements_en_adj_sentiment
0,3.0,1,1,1
1,1.0,1,0,1
2,0.0,1,1,1
3,3.0,2,1,1
4,0.0,1,1,1
5,3.0,1,1,1
6,1.0,0,1,1
7,1.0,1,1,1
8,3.0,0,1,1
9,1.0,1,1,1


In [11]:
def get_accuracy(df):
    missing_info_acc = (df["missing_info_sentiment"] == df["missing_info_en_adj_sentiment"]) | (df["missing_info_sentiment"] == 3)
    improvements_acc = (df["improvements_sentiment"] == df["improvements_en_adj_sentiment"]) | (df["missing_info_sentiment"] == 3)
    return (missing_info_acc.sum() + improvements_acc.sum()) / (df.shape[0]*2)

In [12]:
get_accuracy(df)

0.5902061855670103

In [13]:
for delimiters_comb in delimiters:
    df.loc[:, "missing_info_en_adj_sentiment"] = df["missing_info_en_adj_score"].apply(lambda x: classify_sentiment(*delimiters_comb, x))
    df.loc[:, "improvements_en_adj_sentiment"] = df["improvements_en_adj_score"].apply(lambda x: classify_sentiment(*delimiters_comb, x))
    print(delimiters_comb, "->", get_accuracy(df))

(-10, -9) -> 0.2551546391752577
(-10, -8) -> 0.25612113402061853
(-10, -7) -> 0.25757087628865977
(-10, -6) -> 0.25757087628865977
(-10, -5) -> 0.2601481958762887
(-10, -4) -> 0.2603092783505155
(-10, -3) -> 0.26788015463917525
(-10, -2) -> 0.2696520618556701
(-10, -1) -> 0.2699742268041237
(-10, 1) -> 0.5718427835051546
(-10, 2) -> 0.5723260309278351
(-10, 3) -> 0.5737757731958762
(-10, 4) -> 0.5737757731958762
(-10, 5) -> 0.5752255154639175
(-10, 6) -> 0.5752255154639175
(-10, 7) -> 0.5752255154639175
(-10, 8) -> 0.5752255154639175
(-10, 9) -> 0.5752255154639175
(-9, -8) -> 0.2603092783505155
(-9, -7) -> 0.2617590206185567
(-9, -6) -> 0.2617590206185567
(-9, -5) -> 0.26433634020618557
(-9, -4) -> 0.2644974226804124
(-9, -3) -> 0.27206829896907214
(-9, -2) -> 0.27384020618556704
(-9, -1) -> 0.27416237113402064
(-9, 1) -> 0.5760309278350515
(-9, 2) -> 0.576514175257732
(-9, 3) -> 0.5779639175257731
(-9, 4) -> 0.5779639175257731
(-9, 5) -> 0.5794136597938144
(-9, 6) -> 0.579413659793814

In [14]:
df['missing_info_sentiment'].value_counts()

0.0    1425
1.0     914
3.0     755
2.0      10
Name: missing_info_sentiment, dtype: int64

In [15]:
df['improvements_sentiment'].value_counts()

0    1673
1    1361
3      37
2      33
Name: improvements_sentiment, dtype: int64

In [16]:
df['missing_info_en_adj_sentiment'].value_counts()

0    3103
1       1
Name: missing_info_en_adj_sentiment, dtype: int64