<h1 style= "color:#9370DB;"> Stock Analysis </h1>

In [1]:
# 📚 Libraries 
import kagglehub
import pandas as pd
import numpy as np
import os

# 📊 Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as g

# 🤖 Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error 

<h2 style="color: #9370DB;"> 01 | Data Extraction </h2>

In [2]:
# Pull the latest version of the S&P 500 dataset from Kaggle.
# `path` is the local folder with the downloaded files; the following cells
# list its contents and build the CSV file paths from it.
KAGGLE_DATASET = "andrewmvd/sp-500-stocks"
path = kagglehub.dataset_download(KAGGLE_DATASET)

In [3]:
# Show which files were delivered with the dataset
dataset_files = os.listdir(path)
print(dataset_files)

['sp500_stocks.csv', 'sp500_companies.csv', 'sp500_index.csv']


In [4]:
# Resolve the full path of each CSV inside the downloaded dataset folder:
# stocks (daily prices), companies, and the index series.
csv_file_path, csv_file_path2, csv_file_path3 = (
    os.path.join(path, file_name)
    for file_name in ('sp500_stocks.csv', 'sp500_companies.csv', 'sp500_index.csv')
)

# Load each file into its own DataFrame:
#   data -> sp500_stocks.csv, df -> sp500_companies.csv, sp -> sp500_index.csv
data, df, sp = (
    pd.read_csv(csv_path)
    for csv_path in (csv_file_path, csv_file_path2, csv_file_path3)
)

<h3 style="color: #4169E1;">1.1 | Exploring the Data </h3>

In [5]:
# Preview the per-symbol price table loaded from sp500_stocks.csv
# (columns: Date, Symbol, Adj Close, Close, High, Low, Open, Volume).
data

Unnamed: 0,Date,Symbol,Adj Close,Close,High,Low,Open,Volume
0,2010-01-04,MMM,43.783867,69.414719,69.774246,69.122070,69.473244,3640265.0
1,2010-01-05,MMM,43.509628,68.979935,69.590302,68.311035,69.230766,3405012.0
2,2010-01-06,MMM,44.126682,69.958191,70.735786,69.824417,70.133781,6301126.0
3,2010-01-07,MMM,44.158325,70.008362,70.033447,68.662209,69.665550,5346240.0
4,2010-01-08,MMM,44.469463,70.501671,70.501671,69.648827,69.974915,4073337.0
...,...,...,...,...,...,...,...,...
1890269,2024-12-02,ZTS,176.809998,176.809998,176.910004,173.729996,175.779999,2391500.0
1890270,2024-12-03,ZTS,176.940002,176.940002,181.399994,176.559998,176.710007,2679000.0
1890271,2024-12-04,ZTS,175.320007,175.320007,178.500000,174.539993,174.600006,2687000.0
1890272,2024-12-05,ZTS,174.770004,174.770004,176.529999,173.720001,175.270004,2442000.0


In [6]:
# Show only the rows of the price table whose Symbol is AAPL
data.loc[data['Symbol'].eq('AAPL')]

Unnamed: 0,Date,Symbol,Adj Close,Close,High,Low,Open,Volume
146562,2010-01-04,AAPL,6.447411,7.643214,7.660714,7.585000,7.622500,493729600.0
146563,2010-01-05,AAPL,6.458559,7.656429,7.699643,7.616071,7.664286,601904800.0
146564,2010-01-06,AAPL,6.355827,7.534643,7.686786,7.526786,7.656429,552160000.0
146565,2010-01-07,AAPL,6.344077,7.520714,7.571429,7.466071,7.562500,477131200.0
146566,2010-01-08,AAPL,6.386256,7.570714,7.571429,7.466429,7.510714,447610800.0
...,...,...,...,...,...,...,...,...
150315,2024-12-02,AAPL,239.589996,239.589996,240.789993,237.160004,237.270004,48137100.0
150316,2024-12-03,AAPL,242.649994,242.649994,242.759995,238.899994,239.809998,38861000.0
150317,2024-12-04,AAPL,243.009995,243.009995,244.110001,241.250000,242.869995,44383900.0
150318,2024-12-05,AAPL,243.039993,243.039993,244.539993,242.130005,243.990005,40033900.0


In [None]:
# Preview the companies table loaded from sp500_companies.csv
df

In [None]:
# Per-symbol slices for MSFT and AAPL.
# NOTE(review): msft_df is filtered from `df` (sp500_companies.csv) whereas
# apple_df is filtered from `data` (sp500_stocks.csv) — confirm that the two
# slices are really meant to come from different tables.
msft_df = df.loc[df['Symbol'].eq('MSFT')]
apple_df = data.loc[data['Symbol'].eq('AAPL')]

In [None]:
# Count missing values in each column of the companies table
df.isna().sum()

In [None]:
# Number of rows in the companies table for each Sector value
df['Sector'].value_counts()

In [None]:
# Number of rows in the companies table for each Industry value
df['Industry'].value_counts()

In [None]:
# Count missing values in each column of the companies table.
# NOTE(review): this repeats the identical check run a few cells earlier;
# consider removing one of the two.
df.isna().sum()

In [None]:
# Count missing values in each column of the price table
data.isna().sum()

In [None]:
# Preview the index table loaded from sp500_index.csv
sp

<h3 style="color: #4169E1;">1.2 | Copies</h3>

In [8]:
# Keep a separate copy of the price table (DataFrame.copy() is a deep copy by
# default), presumably so the loaded data stays available after cleaning — TODO confirm.
data2 = data.copy()

<h2 style="color: #9370DB;"> 02 | Data Cleaning </h2>

<h3 style="color: #4169E1;"> 2.1 | Dealing with Data types</h3>

In [None]:
# Inspect the column dtypes of the price table
data.dtypes

In [None]:
# Inspect the column dtypes of the companies table
df.dtypes

In [None]:
# Inspect the column dtypes of the index table
sp.dtypes

<h3 style="color: #4169E1;"> 2.2 | Dealing with NaN values</h3>

In [None]:
# Count missing values in each column of the price table.
# NOTE(review): the same check already appears in section 1.1.
data.isna().sum()

In [None]:
# Count missing values in each column of the companies table.
# NOTE(review): the same check already appears in section 1.1.
df.isna().sum()

In [None]:
# Count missing values in each column of the index table
sp.isna().sum()

<h3 style="color: #4169E1;"> 2.3 | Dealing with Duplicates</h3>

<h3 style="color: #4169E1;"> 2.5 | Dealing with outliers</h3>

<h3 style="color: #4169E1;"> 2.6 | Moving target to the right </h3>

<h2 style="color: #9370DB;"> 03 | EDA (Exploratory Data Analysis) </h2>

<h3 style="color: #4169E1;"> Optional | Selecting Numerical </h3>

<h3 style="color: #4169E1;">3.1 | Descriptive Statistics </h3>

<h3 style="color: #4169E1;"> 3.2 | Checking Distributions</h3>

<h3 style="color: #4169E1;"> 3.3 | Checking our target distribution</h3>

<h3 style="color: #4169E1;">3.4 | Checking Outliers </h3>

<h3 style="color: #4169E1;">3.5 | Looking for Correlations </h3>

<h2 style="color: #9370DB;"> 04 | Data Processing </h2>

<h3 style="color: #4169E1;"> 4.1 | X-Y Split</h3>

<h3 style="color: #4169E1;"> 4.2 | Selecting the Model</h3>

<h4 style="color: #00BFFF;"> 4.2.1 | Selecting Model: Linear Regression </h4>

<h4 style="color: #00BFFF;"> 4.2.2 | Selecting Model: Ridge Regression </h4>

<h4 style="color: #00BFFF;"> 4.2.3 | Selecting Model: Lasso Regression </h4>

<h4 style="color: #00BFFF;"> 4.2.4 | Selecting Model: Decision Tree Regression </h4>

<h4 style="color: #00BFFF;"> 4.2.5 | Selecting Model: KNN Regression </h4>

<h4 style="color: #00BFFF;"> 4.2.6 | Selecting Model: XGBoost Regression </h4>

<h3 style="color: #4169E1;"> 4.3 | Final Comparison</h3>

<h2 style="color: #9370DB;"> 05 | Improving Model </h2>

<h3 style="color: #4169E1;"> 5.1 | Normalization with MinMaxScaler</h3>

<h3 style="color: #4169E1;"> 5.2 | Standardization with StandardScaler</h3>

<h3 style="color: #4169E1;"> 5.3 | Normalization with Log Transform</h3>

<h3 style="color: #4169E1;"> 5.4 | Feature Engineering </h3>

<h2 style="color: #9370DB;"> 06 | Reporting </h2>