In [None]:
import csv

import wordcloud
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import altair as alt
import plotly.express as px
# alt.data_transformers.enable('vegafusion')


### Data loading

In [None]:
import io

# Pull the whole CSV into memory as text so it can be patched before parsing.
with open('F:/temp/cleaned_reduced_dataset.csv', 'r', encoding='utf-8') as file:
    content = file.read()

# Collapse every run of three double quotes into a pipe character; the pipe
# is then used as the CSV quote character when parsing below.
modified_content = content.replace('"""', '|')

# Wrap the patched text in a file-like buffer that pandas can consume.
data = io.StringIO(modified_content)

# Parse the buffer into a DataFrame, treating '|' as the quote character.
df = pd.read_csv(data, quotechar='|', quoting=csv.QUOTE_MINIMAL)

# Parse DATE as day-first datetimes, order the rows chronologically and drop
# the ' index' column (its name starts with a space).
df['DATE'] = pd.to_datetime(df['DATE'], dayfirst=True)
df = df.sort_values(by='DATE').drop(columns=' index')

In [None]:
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Parameters
    ----------
    text : str
        Text to measure. Any non-string value (for example ``None`` or the
        float NaN that pandas uses for a missing cell) is treated as empty.

    Returns
    -------
    int
        Token count as produced by ``str.split()`` with no arguments;
        0 for empty, whitespace-only or non-string input.
    """
    # A missing value is not a str and has no .split(); count it as zero
    # words instead of raising AttributeError in the middle of an .apply().
    if not isinstance(text, str):
        return 0
    return len(text.split())

# Store the word count of every tweet in a new WORD_COUNT column.
df['WORD_COUNT'] = df['TWEET'].map(count_words)

In [None]:
# Columns that are centred on their same-day mean below.
demean_cols = [
    '1_DAY_RETURN', '2_DAY_RETURN', '3_DAY_RETURN', '7_DAY_RETURN',
    'VOLATILITY_10D', 'VOLATILITY_30D', 'TEXTBLOB_POLARITY',
]

# Per-date mean of each column, taken after dropping rows that are exact
# duplicates across DATE, STOCK and the columns above. The resulting columns
# carry an '_mn' suffix and DATE is the index.
df_mean = (
    df[['DATE', 'STOCK'] + demean_cols]
    .drop_duplicates()
    .groupby('DATE')[demean_cols]
    .mean()
    .add_suffix('_mn')
)

# Attach the daily means to every row, then overwrite each original column
# with its deviation from that day's mean.
df1 = df.merge(df_mean, on=['DATE'], how='right')
for col in demean_cols:
    df1[col] = df1[col].sub(df1[f'{col}_mn'])

# Future return by stock and sentiment direction

In [None]:
# Forward-return columns summarised per stock.
return_cols = ['1_DAY_RETURN', '2_DAY_RETURN', '3_DAY_RETURN', '7_DAY_RETURN']
by_stock = df1.groupby('STOCK')

# Per-stock mean of the return columns and of the sentiment score.
avg_sentiment_by_ticker = by_stock[return_cols + ['TEXTBLOB_POLARITY']].mean()


def _corr_with_sentiment(group):
    """Correlate each return column of ``group`` with its TEXTBLOB_POLARITY."""
    return group[return_cols].corrwith(group['TEXTBLOB_POLARITY'])


# Per-stock correlation between each return column and sentiment, with the
# columns renamed to '<return>_Corr_Sentiment'.
avg_sentiment_corr_by_ticker = by_stock.apply(_corr_with_sentiment)
avg_sentiment_corr_by_ticker = avg_sentiment_corr_by_ticker.rename(
    columns=lambda name: f"{name}_Corr_Sentiment"
)

# Put means and correlations side by side and keep only the stocks for which
# every value is present.
avg_sentiment_by_ticker = pd.concat(
    [avg_sentiment_by_ticker, avg_sentiment_corr_by_ticker], axis=1
).dropna()

# IQR-based outlier filter.
def remove_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` where Q1/Q3 are
    the 0.25/0.75 quantiles of ``df[column]``. Both fences are inclusive.
    Rows whose value is NaN fail the comparison and are dropped. ``df``
    itself is not modified.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    low, high = q1 - 1.5 * spread, q3 + 1.5 * spread
    # Series.between is inclusive on both ends by default.
    return df[values.between(low, high)]

# Filter the per-stock table one column at a time: sentiment first, then each
# return horizon. Every pass works on the rows that survived the previous one.
avg_sentiment_by_ticker = avg_sentiment_by_ticker.reset_index()
for _outlier_col in ('TEXTBLOB_POLARITY', '1_DAY_RETURN', '2_DAY_RETURN',
                     '3_DAY_RETURN', '7_DAY_RETURN'):
    avg_sentiment_by_ticker = remove_outliers(avg_sentiment_by_ticker, _outlier_col)

# Restore STOCK as the index.
avg_sentiment_by_ticker = avg_sentiment_by_ticker.set_index('STOCK')

In [None]:
# Cut the sentiment column of df1 into four quantile buckets, labelled
# -2, -1, 1, 2 from the lowest bucket to the highest.
quartile_codes = [-2, -1, 1, 2]
df1['sentiment_q'] = pd.qcut(df1['TEXTBLOB_POLARITY'], 4, labels=quartile_codes)

In [None]:
horizons = ['1_DAY_RETURN', '2_DAY_RETURN', '3_DAY_RETURN', '7_DAY_RETURN']
by_quartile = df1.groupby('sentiment_q')

# Median return and sentiment per sentiment bucket.
# NOTE: these are medians, although the chart titles further down say "Average".
asq = by_quartile[horizons + ['TEXTBLOB_POLARITY']].median()

# Correlation of each return horizon with sentiment inside every bucket,
# with columns renamed to '<return>_Corr_Sentiment'.
asq_corr = by_quartile.apply(
    lambda grp: grp[horizons].corrwith(grp['TEXTBLOB_POLARITY'])
).rename(columns=lambda name: f"{name}_Corr_Sentiment")
asq = pd.concat([asq, asq_corr], axis=1)

# Long format of the return columns: one row per (sentiment_q, Period).
asqm = asq[horizons].reset_index().melt(
    id_vars='sentiment_q', var_name='Period', value_name='Return'
)

# Human-readable names for the bucket codes, added to both the wide table
# (keyed by its index) and the long table (keyed by its sentiment_q column).
sentiment_labels = {
    -2: 'very negative',
    -1: 'slightly negative',
    1: 'slightly positive',
    2: 'very positive',
}
asq['sentiment_label'] = asq.index.map(sentiment_labels)
asqm['sentiment_label'] = asqm['sentiment_q'].map(sentiment_labels)



In [None]:

def create_chart(y_column, title):
    """Build a sentiment-vs-return scatter plot with a red regression line.

    Reads the module-level ``avg_sentiment_by_ticker`` frame.

    Parameters
    ----------
    y_column : str
        Column plotted on the y axis; ``f'{y_column}_Corr_Sentiment'`` must
        also exist and drives the marker size.
    title : str
        Used both as the y-axis title and as the chart title.

    Returns
    -------
    The layered Altair chart (points + regression line).
    """
    corr_column = f'{y_column}_Corr_Sentiment'
    sentiment = avg_sentiment_by_ticker['TEXTBLOB_POLARITY']
    corr = avg_sentiment_by_ticker[corr_column]

    # x axis spans the observed sentiment range, padded by 0.01 on each side.
    x_scale = alt.Scale(domain=[sentiment.min() - 0.01, sentiment.max() + 0.01])
    # Marker size spans the observed correlation range.
    size_scale = alt.Scale(range=[5, 300], domain=[corr.min(), corr.max()])

    points = alt.Chart(avg_sentiment_by_ticker.reset_index()).mark_circle().encode(
        x=alt.X('TEXTBLOB_POLARITY:Q', title='Tweet Sentiment', scale=x_scale),
        y=alt.Y(f'{y_column}:Q', title=title, axis=alt.Axis(format='.2%')),
        size=alt.Size(f'{corr_column}:Q', title='sentiment.corr', scale=size_scale),
        tooltip=['STOCK:N', 'TEXTBLOB_POLARITY:Q', f'{y_column}:Q', f'{corr_column}:Q'],
    )

    # Regression of y_column on sentiment, drawn over the same data.
    regression_line = points.transform_regression(
        'TEXTBLOB_POLARITY', y_column
    ).mark_line(color='red')

    layered = points + regression_line
    return layered.properties(title=title)
# One scatter chart per forward-return horizon (1, 2, 3 and 7 days).
chart_1_day_n, chart_2_day_n, chart_3_day_n, chart_7_day_n = (
    create_chart(f'{days}_DAY_RETURN', f'{days}-Day Future Return')
    for days in (1, 2, 3, 7)
)

In [None]:

# NOTE: melted_df, color_scale and chart1 are all reassigned by the next cell,
# so this version of the chart is replaced before it is used in chart3.
period_cols = ['1_DAY_RETURN', '2_DAY_RETURN', '3_DAY_RETURN', '7_DAY_RETURN']

# Long format: one row per (sentiment_q, Period) holding the Return value.
melted_df = asq.reset_index().melt(
    id_vars=['sentiment_q'],
    value_vars=period_cols,
    var_name='Period',
    value_name='Return',
)

# Colour scale stretched over the observed range of returns.
returns = melted_df['Return']
color_scale = alt.Scale(
    domain=[returns.min(), returns.max()],
    range=['purple', 'yellow', 'green'],
)

# Bar chart of return per period, one facet column per sentiment bucket.
chart1 = (
    alt.Chart(melted_df)
    .mark_bar()
    .encode(
        x=alt.X('Period:N', title='Future Return', sort=period_cols),
        y=alt.Y('Return:Q', title='Average Return'),
        color=alt.Color('Return:Q', scale=color_scale),
        column=alt.Column('sentiment_q:N', title='Sentiment Quantile'),
        tooltip=['Period', 'Return'],
    )
    .properties(
        width=150,
        height=300,
        title='Return Trajectory by Sentiment Quantile and Period',
    )
)

In [None]:
# Display order for the period bars and for the sentiment facets.
period_order = ['1_DAY_RETURN', '2_DAY_RETURN', '3_DAY_RETURN', '7_DAY_RETURN']
label_order = ['very negative', 'slightly negative', 'slightly positive', 'very positive']

# Long format keyed by the descriptive sentiment label:
# one row per (sentiment_label, Period) holding the Return value.
melted_df = asq.reset_index().melt(
    id_vars=['sentiment_label'],
    value_vars=period_order,
    var_name='Period',
    value_name='Return',
)

# Colour scale stretched over the observed range of returns.
returns = melted_df['Return']
color_scale = alt.Scale(
    domain=[returns.min(), returns.max()],
    range=['purple', 'yellow', 'green'],
)

# Bar chart of return per period, one facet column per sentiment level.
chart1 = (
    alt.Chart(melted_df)
    .mark_bar()
    .encode(
        x=alt.X('Period:N', title='Period', sort=period_order),
        y=alt.Y('Return:Q', title='Average Future Return'),
        color=alt.Color('Return:Q', scale=color_scale),
        column=alt.Column('sentiment_label:N', title='Sentiment Level', sort=label_order),
        tooltip=['Period', 'Return'],
    )
    .properties(title='Return Trajectory by Sentiment Level and Period')
)


In [None]:
# Same data as the per-level chart but pivoted: sentiment level on the x axis,
# one facet column per return period, returns formatted as percentages.
sentiment_axis = alt.X(
    'sentiment_label:N',
    title='Sentiment Level',
    sort=['very negative', 'slightly negative', 'slightly positive', 'very positive'],
)
return_axis = alt.Y('Return:Q', title='Average Future Return', axis=alt.Axis(format='.2%'))
return_color = alt.Color('Return:Q', title='Return', scale=alt.Scale(scheme='viridis'))

chart2 = (
    alt.Chart(asqm)
    .mark_bar()
    .encode(
        x=sentiment_axis,
        y=return_axis,
        color=return_color,
        column=alt.Column('Period:N', title='Period'),
        tooltip=['sentiment_label:N', 'Period:N', alt.Tooltip('Return:Q', format='.2%')],
    )
    .properties(title='Return Trajectory by Sentiment Quantile and Period')
)


In [None]:
        
# Top row: per-level bar chart next to the 1-day and 2-day scatter charts.
top_row = alt.hconcat(
    chart1.properties(width=100),
    chart_1_day_n.properties(width=400),
    chart_2_day_n.properties(width=400),
)
# Bottom row: per-period bar chart next to the 3-day and 7-day scatter charts.
bottom_row = alt.hconcat(
    chart2.properties(width=100),
    chart_3_day_n.properties(width=400),
    chart_7_day_n.properties(width=400),
)

dashboard_title = {
    "text": ["Sentiment Analysis"],
    "fontSize": 18,
    "anchor": "middle",
    "color": "black",
}

# Stack the two rows and apply dashboard-wide title and axis font settings.
chart3 = (
    alt.vconcat(top_row, bottom_row)
    .properties(title=dashboard_title)
    .configure_title(fontSize=12, anchor='middle')
    .configure_axis(labelFontSize=12, titleFontSize=10)
)

chart3


## Single Stock 

In [None]:
# Show the (rows, columns) shape of df1, the merged frame built above.
df1.shape

In [None]:
# Collapse df1 to one row per (DATE, STOCK) by averaging these columns.
daily_metric_cols = [
    'LAST_PRICE', '1_DAY_RETURN', '2_DAY_RETURN', '3_DAY_RETURN', '7_DAY_RETURN',
    'PX_VOLUME', 'VOLATILITY_10D', 'VOLATILITY_30D',
    'LSTM_POLARITY', 'TEXTBLOB_POLARITY', 'WORD_COUNT',
]
df2 = (
    df1.groupby(['DATE', 'STOCK'])[daily_metric_cols]
    .mean()
    .reset_index()
)

In [None]:
# Drop-down selector with one option per distinct ticker in df2.
# NOTE(review): selection_single / add_selection are the older Altair
# spellings; confirm they are still supported by the installed Altair version.
stock_selection = alt.binding_select(options=df2['STOCK'].unique().tolist(), name='Company')
stock_select = alt.selection_single(fields=['STOCK'], bind=stock_selection, name='Select', empty='none')

# Line chart of LAST_PRICE over time, filtered to the selected stock.
base_price = alt.Chart(df2).mark_line().encode(
    x=alt.X('DATE:T', title='Date'),
    y=alt.Y('LAST_PRICE:Q', title='Closing Price ($)'),
    color=alt.Color('STOCK:N', legend=None)
).transform_filter(
    stock_select
).properties(
    title='Daily Closing Price'
)

# Line chart of mean TEXTBLOB_POLARITY over time, filtered to the selected stock.
base_sentiment = alt.Chart(df2).mark_line().encode(
    x=alt.X('DATE:T', title='Date'),
    y=alt.Y('mean(TEXTBLOB_POLARITY):Q', title='Average Sentiment'),
    color=alt.Color('STOCK:N', legend=None)
).transform_filter(
    stock_select
).properties(
    title='Average Sentiment Over Time'
)

# Place the two charts side by side; each keeps its own y scale.
combined_chart = alt.hconcat(
    base_price,
    base_sentiment
).resolve_scale(
    y='independent'
)

# Host chart that carries the selector, stacked above the two line charts.
# width/height are not valid properties of a concatenated chart (setting them
# makes the spec fail schema validation when rendered), so no width is set on
# the vconcat here; size the individual sub-charts instead if needed.
final_chart = alt.vconcat(
    alt.Chart(df2).mark_point().encode().add_selection(stock_select).properties(title='Select a Company'),
    combined_chart
).configure_title(
    fontSize=20,
    anchor='middle'
).configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_legend(
    titleFontSize=12,
    labelFontSize=10,
    symbolSize=80
)

final_chart

In [None]:
# Remove the intermediate chart names from the notebook namespace.
del combined_chart
del base_price
del base_sentiment