# Sentiments Index

## Preparation of Excel File

In [64]:
# combining date from texts_cleaned.xlsx with sentiments.xlsx
import pandas as pd
import json

# load the sentiment labels for the sheet being processed into `sentiments`;
# `sheet_name` is reused further down when the other workbooks are read/written
sheet_name = '2020 - 2024'
excel_file = 'sentiments.xlsx'
sentiments = pd.read_excel(io=excel_file, sheet_name=sheet_name)

# extracting custom id and user content from jsonl
def extract_custom_id_and_user_content(jsonl_file_path):
    """Collect the custom id and first user message from each JSONL record.

    Every non-blank line of the file is parsed as one JSON object. From each
    object the ``custom_id`` field is read, together with the ``content`` of
    the first entry under ``body -> messages`` whose ``role`` is ``'user'``.

    Args:
        jsonl_file_path: Path of the JSON Lines file to read.

    Returns:
        A list of ``{'custom_id': ..., 'user_content': ...}`` dicts in file
        order. Records whose custom id or user content is missing or empty
        are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    results = []

    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which would garble non-ASCII text on systems with a different locale.
    # NOTE(review): assumes the batch file was written as UTF-8 -- confirm.
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line is not a record; json.loads('') would raise.
                continue

            data = json.loads(line)
            custom_id = data.get('custom_id')
            messages = data.get('body', {}).get('messages', [])

            # Only the first user message counts, even if its content is empty.
            user_content = next(
                (
                    message.get('content')
                    for message in messages
                    if message.get('role') == 'user'
                ),
                None,
            )

            if custom_id and user_content:
                results.append({
                    'custom_id': custom_id,
                    'user_content': user_content,
                })

    return results

# pull (custom_id, user text) pairs out of the batch request file
jsonl_file_path = './chatgpt/jsonl/2020_2024(1).jsonl'
extracted_data = extract_custom_id_and_user_content(jsonl_file_path)

jsonl = pd.DataFrame(extracted_data)

# merging sentiments with texts: custom_id links each request text to its sentiment row
merged_df = pd.merge(jsonl, sentiments, on='custom_id')

# reading texts_cleaned.xlsx as texts_cleaned dataframe
excel_file = 'texts_cleaned.xlsx'
texts_cleaned = pd.read_excel(excel_file, sheet_name=sheet_name)

# merging texts_cleaned with merged_df
# NOTE(review): rows are matched on the raw text itself, so a text that occurs
# more than once on either side would multiply rows -- confirm texts are unique.
final_df = pd.merge(texts_cleaned, merged_df, right_on='user_content', left_on='Text')

# drop columns user_content and custom_id (only needed as join keys)
final_df = final_df.drop(columns=['user_content', 'custom_id'])

# Replace just this sheet and keep the other sheets of the workbook.
# Append mode requires the workbook to exist already, so on the very first run
# (no file yet) fall back to creating it with this sheet only.
try:
    with pd.ExcelWriter('sentiments_cleaned.xlsx', engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
        final_df.to_excel(writer, sheet_name=sheet_name, index=False)
except FileNotFoundError:
    final_df.to_excel('sentiments_cleaned.xlsx', sheet_name=sheet_name, index=False)



## Data Analysis

### Boolean

In [61]:
import pandas as pd

def compare_columns(df):
    """Score two sentiment label columns and measure how far apart they are.

    The labels in ``categorical_x`` and ``categorical_y`` are translated to
    numbers (Positive -> 1, Neutral -> 0, Negative -> -1) and stored in
    ``boolean_x`` / ``boolean_y``; ``boolean_difference`` holds the absolute
    gap between the two scores.

    Args:
        df: DataFrame with ``categorical_x`` and ``categorical_y`` columns.
            It is modified in place.

    Returns:
        The same DataFrame object, with the three columns added.
    """
    sentiment_scores = {'Positive': 1, 'Neutral': 0, 'Negative': -1}

    # score each label column into its numeric counterpart
    column_pairs = (
        ('boolean_x', 'categorical_x'),
        ('boolean_y', 'categorical_y'),
    )
    for score_column, label_column in column_pairs:
        df[score_column] = df[label_column].map(sentiment_scores)

    # 0 = same label, 1 = one step apart, 2 = opposite labels
    df['boolean_difference'] = df['boolean_x'].sub(df['boolean_y']).abs()

    return df

excel_file = 'sentiments_cleaned.xlsx'
# This value used to be assigned to `sheet_names`, a name nothing reads, so the
# read and write below silently used whatever `sheet_name` an earlier cell had
# left behind instead of the sheet chosen here.
sheet_name = '2010 - 2014'

# score the two sentiment label columns of the chosen sheet
df = pd.read_excel(excel_file, sheet_name=sheet_name)
df = compare_columns(df)

# Replace just this sheet and keep the other sheets of the workbook.
# Append mode requires the workbook to exist already, so on the very first run
# (no file yet) fall back to creating it with this sheet only.
try:
    with pd.ExcelWriter('market_sentiments.xlsx', engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)
except FileNotFoundError:
    df.to_excel('market_sentiments.xlsx', sheet_name=sheet_name, index=False)

#### Graph