In [1]:
from pyspark.sql import SparkSession


In [2]:
spark = SparkSession.builder.getOrCreate()


In [1]:
import os
import pandas as pd
import plotly.graph_objects as go
from datetime import datetime, timedelta


In [59]:
parent_dir = os.path.dirname(os.getcwd())
metv_path = os.path.join(parent_dir, "data", "raw", "metv_montly.json")

df_metv = pd.read_json(metv_path)
df_metv

Unnamed: 0,Datetime,Open,High,Low,Close,Adj Close,Volume
0,2023-05-10 18:30:00+00:00,9.0855,9.1450,9.0855,9.1450,9.1450,0
1,2023-05-10 19:30:00+00:00,9.1400,9.1450,9.1100,9.1100,9.1100,17447
2,2023-05-11 13:30:00+00:00,9.1400,9.1600,9.1135,9.1385,9.1385,194728
3,2023-05-11 14:30:00+00:00,9.1200,9.1400,9.0900,9.1201,9.1201,58655
4,2023-05-11 15:30:00+00:00,9.1400,9.1650,9.1400,9.1599,9.1599,15837
...,...,...,...,...,...,...,...
137,2023-06-08 15:30:00+00:00,9.9600,9.9850,9.9521,9.9850,9.9850,11841
138,2023-06-08 16:30:00+00:00,9.9850,9.9850,9.9458,9.9550,9.9550,21325
139,2023-06-08 17:30:00+00:00,9.9500,9.9550,9.9200,9.9478,9.9478,20112
140,2023-06-08 18:30:00+00:00,9.9550,9.9791,9.9550,9.9650,9.9650,17740


In [60]:
df_metv['Date'] = df_metv['Datetime'].dt.strftime('%Y-%m-%d')
df_metv

Unnamed: 0,Datetime,Open,High,Low,Close,Adj Close,Volume,Date
0,2023-05-10 18:30:00+00:00,9.0855,9.1450,9.0855,9.1450,9.1450,0,2023-05-10
1,2023-05-10 19:30:00+00:00,9.1400,9.1450,9.1100,9.1100,9.1100,17447,2023-05-10
2,2023-05-11 13:30:00+00:00,9.1400,9.1600,9.1135,9.1385,9.1385,194728,2023-05-11
3,2023-05-11 14:30:00+00:00,9.1200,9.1400,9.0900,9.1201,9.1201,58655,2023-05-11
4,2023-05-11 15:30:00+00:00,9.1400,9.1650,9.1400,9.1599,9.1599,15837,2023-05-11
...,...,...,...,...,...,...,...,...
137,2023-06-08 15:30:00+00:00,9.9600,9.9850,9.9521,9.9850,9.9850,11841,2023-06-08
138,2023-06-08 16:30:00+00:00,9.9850,9.9850,9.9458,9.9550,9.9550,21325,2023-06-08
139,2023-06-08 17:30:00+00:00,9.9500,9.9550,9.9200,9.9478,9.9478,20112,2023-06-08
140,2023-06-08 18:30:00+00:00,9.9550,9.9791,9.9550,9.9650,9.9650,17740,2023-06-08


In [61]:
df_metv.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,142.0,142.0,142.0,142.0,142.0,142.0
mean,9.588475,9.615819,9.566618,9.592561,9.592561,49251.739437
std,0.359287,0.362469,0.356437,0.357674,0.357674,53916.626016
min,8.995,9.015,8.985,8.9971,8.9971,0.0
25%,9.2137,9.258525,9.20955,9.236525,9.236525,18060.5
50%,9.505,9.53,9.485,9.5175,9.5175,37475.5
75%,9.9375,9.96875,9.9,9.936175,9.936175,61399.0
max,10.12,10.17,10.1,10.123,10.123,467944.0


In [62]:
print(df_metv.isna().sum())

Datetime     0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
Date         0
dtype: int64


In [102]:
start_date = df_metv['Date'].min()
end_date = df_metv['Date'].max()

# Tarih aralığına göre filtreleme
filtered_data_stock = df_metv[(df_metv['Date'] >= start_date) & (df_metv['Date'] <= end_date)]

# Hareketli ortalama hesaplama
moving_avg_stock = filtered_data_stock.groupby('Date')['Adj Close'].mean()

# Yeni bir veri çerçevesi oluşturma
df_moving_avg_stock = pd.DataFrame({'date': moving_avg_stock.index, 'moving_avg_price': moving_avg_stock.values})

In [103]:
fig1 = go.Figure(data=go.Scatter(x=df_moving_avg_stock['date'], y=df_moving_avg_stock['moving_avg_price'], mode='lines'))

fig1.update_layout(
    xaxis_title='Date',
    yaxis_title='Adj Close',
    title='Adj Close vs. Date',
    xaxis=dict(
        tickmode='linear',
        dtick='D1',  # Set the tick frequency to one day (D1)
    )
)

fig1.show()

In [82]:
earliest_date = df_metv['Date'].iloc[0]
latest_date = df_metv['Date'].iloc[-1]

subset = df_metv.loc[(df_metv['Date'] >= earliest_date) & (df_metv['Date'] <= latest_date)]
max_price = subset['Adj Close'].max()
min_price = subset['Adj Close'].min()

max_price_day = subset.loc[subset['Adj Close'] == max_price, 'Date'].values[0]
min_price_day = subset.loc[subset['Adj Close'] == min_price, 'Date'].values[0]

date_range = f"{earliest_date} to {latest_date}"

print("Highest Price:", format(max_price, '.2f'))
print("Day of Highest Price:", max_price_day)

print("Lowest Price:", format(min_price, '.2f'))
print("Day of Lowest Price:", min_price_day)

print("Date Range:", date_range)

Highest Price: 10.12
Day of Highest Price: 2023-06-06
Lowest Price: 9.00
Day of Lowest Price: 2023-05-12
Date Range: 2023-05-10 to 2023-06-08


Processing news data

In [22]:
news_path = os.path.join(parent_dir, "data", "raw", "news.json")

In [23]:
df_news = pd.read_json(news_path)
df_news

Unnamed: 0,title,description,content,publish_date
0,‘Painted into a corner’: can generative AI sav...,Mark Zuckerberg says in earnings call that com...,Meta is not pivoting away from its signature p...,2023-05-11T05:00:17Z
1,"Yes, the Metaverse Is Still Happening",There’s still much hype around the metaverse. ...,Don’t get left behind while competitors contin...,2023-05-12T15:00:00Z
2,Starburst opens Juicyverse experience in metav...,Starburst has launched its Juicyverse experien...,Starburst has launched its Juicyverse experien...,2023-05-11T14:30:00Z
3,Metaverse Could Contribute Up To 2.4% of US GD...,A study commissioned by Meta has found that th...,The concept of the metaverse includes augmente...,2023-05-10T01:25:00Z
4,"Activision Boss Hyped About AI, Suggests It Co...",There’s always a new tech trend being billed a...,Theres always a new tech trend being billed as...,2023-05-11T20:11:00Z
...,...,...,...,...
1495,'SpinOk' spyware found in 193 more Android apps,"About a week ago, we talked about the so-calle...","About a week ago, we talked about the so-calle...",2023-06-08T09:29:31Z
1496,Apple vision pro just brilliantly destroyed me...,I’d really hate to be Mark Zuckerberg right no...,Skip to comments.\r\nApple vision pro just bri...,2023-06-07T01:44:10Z
1497,Q2 2023 Metaverse Development Trends and Appli...,"DUBLIN, June 6, 2023 /PRNewswire/ -- The ""Deve...","DUBLIN, June 6, 2023 /PRNewswire/ -- The ""Deve...",2023-06-07T00:15:00Z
1498,There are 5 reasons why Nvidia stock is a top ...,"""Net-net, we gain more confidence Nvidia can s...","Even after Nvidia'sAI-fueled, year-to-date ral...",2023-06-07T16:28:31Z


In [24]:
df_news['publish_date'] = pd.to_datetime(df_news['publish_date'])
df_news['date'] = df_news['publish_date'].dt.strftime('%Y-%m-%d')

df_news

Unnamed: 0,title,description,content,publish_date,date
0,‘Painted into a corner’: can generative AI sav...,Mark Zuckerberg says in earnings call that com...,Meta is not pivoting away from its signature p...,2023-05-11 05:00:17+00:00,2023-05-11
1,"Yes, the Metaverse Is Still Happening",There’s still much hype around the metaverse. ...,Don’t get left behind while competitors contin...,2023-05-12 15:00:00+00:00,2023-05-12
2,Starburst opens Juicyverse experience in metav...,Starburst has launched its Juicyverse experien...,Starburst has launched its Juicyverse experien...,2023-05-11 14:30:00+00:00,2023-05-11
3,Metaverse Could Contribute Up To 2.4% of US GD...,A study commissioned by Meta has found that th...,The concept of the metaverse includes augmente...,2023-05-10 01:25:00+00:00,2023-05-10
4,"Activision Boss Hyped About AI, Suggests It Co...",There’s always a new tech trend being billed a...,Theres always a new tech trend being billed as...,2023-05-11 20:11:00+00:00,2023-05-11
...,...,...,...,...,...
1495,'SpinOk' spyware found in 193 more Android apps,"About a week ago, we talked about the so-calle...","About a week ago, we talked about the so-calle...",2023-06-08 09:29:31+00:00,2023-06-08
1496,Apple vision pro just brilliantly destroyed me...,I’d really hate to be Mark Zuckerberg right no...,Skip to comments.\r\nApple vision pro just bri...,2023-06-07 01:44:10+00:00,2023-06-07
1497,Q2 2023 Metaverse Development Trends and Appli...,"DUBLIN, June 6, 2023 /PRNewswire/ -- The ""Deve...","DUBLIN, June 6, 2023 /PRNewswire/ -- The ""Deve...",2023-06-07 00:15:00+00:00,2023-06-07
1498,There are 5 reasons why Nvidia stock is a top ...,"""Net-net, we gain more confidence Nvidia can s...","Even after Nvidia'sAI-fueled, year-to-date ral...",2023-06-07 16:28:31+00:00,2023-06-07


In [25]:
columns_to_check = ["title", "description", "content"]
columns_with_nan = df_news[columns_to_check].isnull().any()

# Replace NaN values with an empty string
df_news[columns_to_check] = df_news[columns_to_check].fillna("")



In [26]:
df_news["summary"] = df_news["title"] + " " + df_news["description"] + " " + df_news["content"]
df_news

Unnamed: 0,title,description,content,publish_date,date,summary
0,‘Painted into a corner’: can generative AI sav...,Mark Zuckerberg says in earnings call that com...,Meta is not pivoting away from its signature p...,2023-05-11 05:00:17+00:00,2023-05-11,‘Painted into a corner’: can generative AI sav...
1,"Yes, the Metaverse Is Still Happening",There’s still much hype around the metaverse. ...,Don’t get left behind while competitors contin...,2023-05-12 15:00:00+00:00,2023-05-12,"Yes, the Metaverse Is Still Happening There’s ..."
2,Starburst opens Juicyverse experience in metav...,Starburst has launched its Juicyverse experien...,Starburst has launched its Juicyverse experien...,2023-05-11 14:30:00+00:00,2023-05-11,Starburst opens Juicyverse experience in metav...
3,Metaverse Could Contribute Up To 2.4% of US GD...,A study commissioned by Meta has found that th...,The concept of the metaverse includes augmente...,2023-05-10 01:25:00+00:00,2023-05-10,Metaverse Could Contribute Up To 2.4% of US GD...
4,"Activision Boss Hyped About AI, Suggests It Co...",There’s always a new tech trend being billed a...,Theres always a new tech trend being billed as...,2023-05-11 20:11:00+00:00,2023-05-11,"Activision Boss Hyped About AI, Suggests It Co..."
...,...,...,...,...,...,...
1495,'SpinOk' spyware found in 193 more Android apps,"About a week ago, we talked about the so-calle...","About a week ago, we talked about the so-calle...",2023-06-08 09:29:31+00:00,2023-06-08,'SpinOk' spyware found in 193 more Android app...
1496,Apple vision pro just brilliantly destroyed me...,I’d really hate to be Mark Zuckerberg right no...,Skip to comments.\r\nApple vision pro just bri...,2023-06-07 01:44:10+00:00,2023-06-07,Apple vision pro just brilliantly destroyed me...
1497,Q2 2023 Metaverse Development Trends and Appli...,"DUBLIN, June 6, 2023 /PRNewswire/ -- The ""Deve...","DUBLIN, June 6, 2023 /PRNewswire/ -- The ""Deve...",2023-06-07 00:15:00+00:00,2023-06-07,Q2 2023 Metaverse Development Trends and Appli...
1498,There are 5 reasons why Nvidia stock is a top ...,"""Net-net, we gain more confidence Nvidia can s...","Even after Nvidia'sAI-fueled, year-to-date ral...",2023-06-07 16:28:31+00:00,2023-06-07,There are 5 reasons why Nvidia stock is a top ...


In [27]:
# Drop the rows with NaN values in the "summary" column
df_news = df_news.dropna(subset=['summary'])

In [28]:
df_news_copy = df_news.copy()

Applying the classifier model

In [29]:
import flair

# Load the sentiment classifier
classifier = flair.models.TextClassifier.load('en-sentiment')

# Define a function for sentiment analysis
def sentiment_analysis(text):
    sentence = flair.data.Sentence(text)
    classifier.predict(sentence)
    label = sentence.labels[0].value
    score = sentence.labels[0].score
    return label, score


In [30]:
# Convert DataFrame to dictionary
data_dict = df_news['summary'].to_dict()

In [31]:
# Apply sentiment analysis to the dictionary
sentiment_results = {key: sentiment_analysis(value) for key, value in data_dict.items()}

In [32]:
# Convert sentiment results dictionary to DataFrame
sentiment_df = pd.DataFrame.from_dict(sentiment_results, orient='index', columns=['label', 'score'])

# Merge sentiment results DataFrame with the original DataFrame
df_news = pd.concat([df_news, sentiment_df], axis=1)

In [33]:
df_news

Unnamed: 0,title,description,content,publish_date,date,summary,label,score
0,‘Painted into a corner’: can generative AI sav...,Mark Zuckerberg says in earnings call that com...,Meta is not pivoting away from its signature p...,2023-05-11 05:00:17+00:00,2023-05-11,‘Painted into a corner’: can generative AI sav...,NEGATIVE,0.999829
1,"Yes, the Metaverse Is Still Happening",There’s still much hype around the metaverse. ...,Don’t get left behind while competitors contin...,2023-05-12 15:00:00+00:00,2023-05-12,"Yes, the Metaverse Is Still Happening There’s ...",NEGATIVE,0.841264
2,Starburst opens Juicyverse experience in metav...,Starburst has launched its Juicyverse experien...,Starburst has launched its Juicyverse experien...,2023-05-11 14:30:00+00:00,2023-05-11,Starburst opens Juicyverse experience in metav...,POSITIVE,0.997464
3,Metaverse Could Contribute Up To 2.4% of US GD...,A study commissioned by Meta has found that th...,The concept of the metaverse includes augmente...,2023-05-10 01:25:00+00:00,2023-05-10,Metaverse Could Contribute Up To 2.4% of US GD...,POSITIVE,0.706658
4,"Activision Boss Hyped About AI, Suggests It Co...",There’s always a new tech trend being billed a...,Theres always a new tech trend being billed as...,2023-05-11 20:11:00+00:00,2023-05-11,"Activision Boss Hyped About AI, Suggests It Co...",NEGATIVE,0.987233
...,...,...,...,...,...,...,...,...
1495,'SpinOk' spyware found in 193 more Android apps,"About a week ago, we talked about the so-calle...","About a week ago, we talked about the so-calle...",2023-06-08 09:29:31+00:00,2023-06-08,'SpinOk' spyware found in 193 more Android app...,NEGATIVE,0.999973
1496,Apple vision pro just brilliantly destroyed me...,I’d really hate to be Mark Zuckerberg right no...,Skip to comments.\r\nApple vision pro just bri...,2023-06-07 01:44:10+00:00,2023-06-07,Apple vision pro just brilliantly destroyed me...,NEGATIVE,0.999934
1497,Q2 2023 Metaverse Development Trends and Appli...,"DUBLIN, June 6, 2023 /PRNewswire/ -- The ""Deve...","DUBLIN, June 6, 2023 /PRNewswire/ -- The ""Deve...",2023-06-07 00:15:00+00:00,2023-06-07,Q2 2023 Metaverse Development Trends and Appli...,POSITIVE,0.977257
1498,There are 5 reasons why Nvidia stock is a top ...,"""Net-net, we gain more confidence Nvidia can s...","Even after Nvidia'sAI-fueled, year-to-date ral...",2023-06-07 16:28:31+00:00,2023-06-07,There are 5 reasons why Nvidia stock is a top ...,NEGATIVE,0.998654


In [34]:
# # Apply sentiment analysis function to the 'summary' column
# df_news[['label', 'score']] = df_news['summary'].apply(lambda x: pd.Series(sentiment_analysis(x)))

# # Now df_news contains two new columns: 'label' and 'score' with sentiment analysis results

In [35]:
df_news['score'] = df_news.apply(lambda row: row['score'] * -1 if row['label'] == 'NEGATIVE' else row['score'], axis=1)

In [36]:
df_news

Unnamed: 0,title,description,content,publish_date,date,summary,label,score
0,‘Painted into a corner’: can generative AI sav...,Mark Zuckerberg says in earnings call that com...,Meta is not pivoting away from its signature p...,2023-05-11 05:00:17+00:00,2023-05-11,‘Painted into a corner’: can generative AI sav...,NEGATIVE,-0.999829
1,"Yes, the Metaverse Is Still Happening",There’s still much hype around the metaverse. ...,Don’t get left behind while competitors contin...,2023-05-12 15:00:00+00:00,2023-05-12,"Yes, the Metaverse Is Still Happening There’s ...",NEGATIVE,-0.841264
2,Starburst opens Juicyverse experience in metav...,Starburst has launched its Juicyverse experien...,Starburst has launched its Juicyverse experien...,2023-05-11 14:30:00+00:00,2023-05-11,Starburst opens Juicyverse experience in metav...,POSITIVE,0.997464
3,Metaverse Could Contribute Up To 2.4% of US GD...,A study commissioned by Meta has found that th...,The concept of the metaverse includes augmente...,2023-05-10 01:25:00+00:00,2023-05-10,Metaverse Could Contribute Up To 2.4% of US GD...,POSITIVE,0.706658
4,"Activision Boss Hyped About AI, Suggests It Co...",There’s always a new tech trend being billed a...,Theres always a new tech trend being billed as...,2023-05-11 20:11:00+00:00,2023-05-11,"Activision Boss Hyped About AI, Suggests It Co...",NEGATIVE,-0.987233
...,...,...,...,...,...,...,...,...
1495,'SpinOk' spyware found in 193 more Android apps,"About a week ago, we talked about the so-calle...","About a week ago, we talked about the so-calle...",2023-06-08 09:29:31+00:00,2023-06-08,'SpinOk' spyware found in 193 more Android app...,NEGATIVE,-0.999973
1496,Apple vision pro just brilliantly destroyed me...,I’d really hate to be Mark Zuckerberg right no...,Skip to comments.\r\nApple vision pro just bri...,2023-06-07 01:44:10+00:00,2023-06-07,Apple vision pro just brilliantly destroyed me...,NEGATIVE,-0.999934
1497,Q2 2023 Metaverse Development Trends and Appli...,"DUBLIN, June 6, 2023 /PRNewswire/ -- The ""Deve...","DUBLIN, June 6, 2023 /PRNewswire/ -- The ""Deve...",2023-06-07 00:15:00+00:00,2023-06-07,Q2 2023 Metaverse Development Trends and Appli...,POSITIVE,0.977257
1498,There are 5 reasons why Nvidia stock is a top ...,"""Net-net, we gain more confidence Nvidia can s...","Even after Nvidia'sAI-fueled, year-to-date ral...",2023-06-07 16:28:31+00:00,2023-06-07,There are 5 reasons why Nvidia stock is a top ...,NEGATIVE,-0.998654


In [93]:
# Son 1 ayın tarih aralığını belirleme
end_date = datetime.now()
start_date = end_date - timedelta(days=30)


# Tarih aralığına göre filtreleme
filtered_data_news = df_news[(df_news['date'] >= start_date) & (df_news['date'] <= end_date)]

# Hareketli ortalama hesaplama
moving_avg_news = filtered_data_news.groupby('date')['score'].mean()

# Yeni bir veri çerçevesi oluşturma
df_moving_avg_news = pd.DataFrame({'date': moving_avg_news.index, 'moving_avg_score': moving_avg_news.values})
df_moving_avg_news

Unnamed: 0,date,moving_avg_score
0,2023-05-11,0.024302
1,2023-05-12,0.172416
2,2023-05-13,-0.117903
3,2023-05-14,0.103021
4,2023-05-15,-0.047642
5,2023-05-16,-0.002618
6,2023-05-17,0.059553
7,2023-05-18,0.270738
8,2023-05-19,0.086082
9,2023-05-20,0.014329


In [95]:
fig2 = go.Figure(data=go.Scatter(x=df_moving_avg_news['date'], y=df_moving_avg_news['moving_avg_score'], mode='lines'))

fig2.update_layout(
    xaxis_title='Date',
    yaxis_title='Score',
    title='Score vs. Date',
    xaxis=dict(
        tickmode='linear',
        dtick='D1',  # Set the tick frequency to one day (D1)
    )
)

fig2.show()

lets plot them together

In [104]:
# Create the first scatter trace
trace1 = go.Scatter(
    x=df_moving_avg_stock['date'],
    y=df_moving_avg_stock['moving_avg_price'],
    mode='lines',
    name='Adj Close'
)

# Create the second scatter trace with a secondary y-axis
trace2 = go.Scatter(
    x=df_moving_avg_news['date'],
    y=df_moving_avg_news['moving_avg_score'],
    mode='lines',
    name='Score',
    yaxis='y2'
)

# Define the layout
layout = go.Layout(
    title='Adj Close and Score Comparison',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Adj Close'),
    yaxis2=dict(
        title='Score',
        overlaying='y',
        side='right'
    )
)

# Create the figure
fig = go.Figure(data=[trace1, trace2], layout=layout)

# Display the figure
fig.show()


In [97]:
# Calculate the Pearson correlation coefficient between sentiment scores and 'Adj Close'
pearson_corr = df_metv['Adj Close'].corr(df_news['score'], method='pearson')

# Calculate the Spearman correlation coefficient between sentiment scores and 'Adj Close'
spearman_corr = df_metv['Adj Close'].corr(df_news['score'], method='spearman')

print("Pearson correlation coefficient:", pearson_corr)
print("Spearman correlation coefficient:", spearman_corr)


Pearson correlation coefficient: 0.03328894357259381
Spearman correlation coefficient: 0.007616101541676827


weak negative correlation.

In [105]:
df_moving_avg_stock.head()

Unnamed: 0,date,moving_avg_price
0,2023-05-10,9.1275
1,2023-05-11,9.143714
2,2023-05-12,9.033257
3,2023-05-15,9.165
4,2023-05-16,9.175957


In [106]:
df_moving_avg_news.head()

Unnamed: 0,date,moving_avg_score
0,2023-05-11,0.024302
1,2023-05-12,0.172416
2,2023-05-13,-0.117903
3,2023-05-14,0.103021
4,2023-05-15,-0.047642


In [107]:
news_path

'C:\\Users\\HP\\Desktop\\big_data_project\\data\\raw\\news.json'

In [109]:
parent_dir 

'C:\\Users\\HP\\Desktop\\big_data_project'

In [110]:
processed_data_path = os.path.join(parent_dir, "data", "processed")

# Define the output file paths
news_output_path = os.path.join(processed_data_path, "processed_news_data.csv")
stock_output_path = os.path.join(processed_data_path, "processed_stock_data.csv")

# Save the dataframes as CSV files
df_moving_avg_news.to_csv(news_output_path, index=False)
df_moving_avg_stock.to_csv(stock_output_path, index=False)
