# Data exploration - statistics

In [10]:

# Input locations, relative to the notebook's working directory.

# Single file reserved for the test split.
test_file = "../data/eng-houn.json"

# Files whose sentences are pooled into the training split.
train_files = [
    "../data/eng-danc.json", "../data/eng-fina.json", "../data/eng-nava.json",
    "../data/eng-redh.json", "../data/eng-scan.json", "../data/eng-spec.json",
]

In [12]:
def extract_words_and_sentiments(data, neg_threshold=-20, pos_threshold=20):
    """
    Transform a parsed JSON document into a list of [words, sentiment_labels] pairs.

    One pair is produced per entry of ``data["sentences"]``. ``words`` holds the
    ``"word"`` value of every token, in order; ``sentiment_labels`` is a
    parallel list with one of the strings "NEGATIVE", "NEUTRAL" or "POSITIVE"
    per token.

    Labels come from the sentence's ``"concepts"``: each concept carrying both a
    ``"sentiment"`` score and a ``"wids"`` list stamps its label onto every
    token whose ``"wid"`` appears in that list. Tokens not referenced by any
    concept stay "NEUTRAL". When several concepts reference the same token, the
    concept that comes last in the list wins.

    Args:
        data: Mapping with an optional "sentences" list. Each sentence is a
            mapping with optional "words" (items need "wid" and "word" keys)
            and optional "concepts".
        neg_threshold: Scores less than or equal to this value are "NEGATIVE".
        pos_threshold: Scores greater than or equal to this value are "POSITIVE".

    Returns:
        A list of ``[word_list, sentiment_list]`` two-element lists, one per
        sentence; an empty list when there are no sentences.

    Raises:
        KeyError: If a word entry lacks the "word" or "wid" key.
    """
    transformed_data = []

    for sentence in data.get("sentences", []):
        words = sentence.get("words", [])
        concepts = sentence.get("concepts", [])

        word_list = [w["word"] for w in words]
        # Every token starts out as "NEUTRAL"; concepts below may overwrite it.
        sentiment_list = ["NEUTRAL"] * len(words)

        # Map each word id to its position in the sentence.
        wid_to_index = {w["wid"]: idx for idx, w in enumerate(words)}

        for concept in concepts:
            sentiment = concept.get("sentiment")
            wids = concept.get("wids")
            # A missing key and an explicit null are treated alike: without a
            # score or without word ids there is nothing to assign, and
            # comparing None against a number would raise TypeError.
            if sentiment is None or wids is None:
                continue

            if sentiment <= neg_threshold:
                simple_sentiment = "NEGATIVE"
            elif sentiment >= pos_threshold:
                simple_sentiment = "POSITIVE"
            else:
                simple_sentiment = "NEUTRAL"

            for wid in wids:
                idx = wid_to_index.get(wid)
                # Ids that do not belong to this sentence are ignored.
                if idx is not None:
                    sentiment_list[idx] = simple_sentiment

        transformed_data.append([word_list, sentiment_list])

    return transformed_data


In [14]:
# Imported here because no visible earlier cell brings `json` into scope;
# without it this cell raises NameError on a fresh kernel.
import json

# Parse every training file and pool the per-sentence [tokens, labels] pairs
# into one flat list.
combined_train_list = []

for file in train_files:
    # JSON text is UTF-8; pin the encoding so decoding does not depend on the
    # platform's default locale encoding.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    # The document is fully parsed at this point, so the transformation runs
    # after the file handle has been released.
    transformed = extract_words_and_sentiments(data)
    combined_train_list.extend(transformed)

In [15]:
# Load the test file. The encoding is pinned to UTF-8 so the JSON is decoded
# the same way on every platform instead of using the locale default.
with open(test_file, "r", encoding="utf-8") as f:
    test_data = json.load(f)

test_list = extract_words_and_sentiments(test_data)

# Bare last expression: shows the first training pair as a quick sanity check.
combined_train_list[0]

[['The', 'Adventure', 'of', 'the', 'Dancing', 'Men'],
 ['NEUTRAL', 'NEUTRAL', 'NEUTRAL', 'NEUTRAL', 'NEUTRAL', 'NEUTRAL']]

In [6]:
import pandas as pd

In [18]:
# One row per sentence; both splits get the same two-column layout.
df_train, df_test = (
    pd.DataFrame(rows, columns=['tokens', 'sentiment_label'])
    for rows in (combined_train_list, test_list)
)
print(df_train.head())

                                              tokens  \
0            [The, Adventure, of, the, Dancing, Men]   
1  [Holmes, had, been, seated, for, some, hours, ...   
2  [His, head, was, sunk, upon, his, breast, ,, a...   
3  [", So, ,, Watson, ,, ", said, he, ,, suddenly...   
4           [I, gave, a, start, of, astonishment, .]   

                                     sentiment_label  
0  [NEUTRAL, NEUTRAL, NEUTRAL, NEUTRAL, NEUTRAL, ...  
1  [NEUTRAL, NEUTRAL, NEUTRAL, NEUTRAL, NEUTRAL, ...  
2  [NEUTRAL, NEUTRAL, NEUTRAL, NEUTRAL, NEUTRAL, ...  
3  [NEUTRAL, NEUTRAL, NEUTRAL, NEUTRAL, NEUTRAL, ...  
4  [NEUTRAL, NEUTRAL, NEUTRAL, NEGATIVE, NEUTRAL,...  


In [25]:
# Count tokens directly from the 'tokens' column (one list of tokens per row).
# No visible cell creates a 'token_count' column, so reading it would raise
# KeyError on a fresh Restart & Run All.
# NOTE(review): assumes the former 'token_count' was len(tokens) per row; confirm.
total_tokens = df_train['tokens'].map(len).sum()
print(f"Total number of tokens in all training sentences: {total_tokens}")

total_test_tokens = df_test['tokens'].map(len).sum()
print(f"Total number of tokens in all test sentences: {total_test_tokens}")

Total number of tokens in all training sentences: 68241
Total number of tokens in all test sentences: 70360
