# In [None]:
import pandas as pd # https://pandas.pydata.org/docs/getting_started/install.html
import numpy as np # https://numpy.org/devdocs//user/absolute_beginners.html
from scipy import stats # https://docs.scipy.org/doc/scipy/tutorial/stats.html

# Location of the single CSV file to analyse
file_path = '' #Your file path

# Read the CSV into a DataFrame and preview its leading rows
# https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.head.html
df = pd.read_csv(file_path)
print(df.head())

# Collect every row flagged as a repeat of an earlier row, then show them
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.duplicated.html
duplicates = df.loc[df.duplicated()]
print(duplicates)

# Basic data exploration
# A bare expression such as `df.describe()` is only rendered when it is the
# final statement of a notebook cell; anywhere else its result is silently
# discarded, so the summary is printed explicitly.
df.info()  # Prints data types and non-null counts itself # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.info.html
print(df.describe())  # Summary statistics # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html

# Preprocess the data
# Parse the 'Date' column into pandas datetime values
# https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
df['Date'] = pd.to_datetime(df['Date'])
print(df.head())

# Count the missing entries per column and report them under a heading
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isnull.html
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

# First difference of 'Close': each value minus the value one row earlier.
# The first row has no predecessor, so its NaN (like any other NaN in the
# result) is replaced with 0.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.diff.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html
df['Close_diff'] = df['Close'].sub(df['Close'].shift(1)).fillna(0)

# Show the original series next to its differenced version
print(df.head()[['Close', 'Close_diff']])

# Columns to apply log return transformation to
cols = ['Open', 'Close', "High", "Low", 'Marketcap', 'Volume']

# Names of the derived columns, in the same order as `cols`
log_return_cols = ["log_return_" + col for col in cols]

# Calculate log return for each column: ln(current) - ln(previous)
# https://numpy.org/doc/stable/reference/generated/numpy.log.html
for col, log_return_col in zip(cols, log_return_cols):
    log_values = np.log(df[col])  # take the logarithm once, then difference it
    df[log_return_col] = log_values.diff()

# Fill the missing log returns (the first row has no previous value) with 0.
# Only the log-return columns are touched: a frame-wide fillna(0) would also
# overwrite genuinely missing data in every other column with zeros.
# NOTE: np.log gives -inf for zero inputs; fillna does not replace infinities.
df[log_return_cols] = df[log_return_cols].fillna(0)

# Display the DataFrame with log return columns
print(df)


# Row-to-row change of the 'Close' log returns; NaNs in the result
# (such as the first row, which has no predecessor) become 0
close_log_returns = df['log_return_Close']
df['Close_log_diff'] = close_log_returns.sub(close_log_returns.shift(1)).fillna(0)

# Preview the frame with the new column
print(df.head())


# Window lengths (in days) used for the moving averages
moving_avg_periods = [1, 7, 14, 30]

# Rolling mean of 'Close' for every window length; min_periods=1 lets the
# earliest rows average over however many observations are available so far
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html
close_prices = df['Close']
for period in moving_avg_periods:
    df[f'MA_{period}d'] = close_prices.rolling(window=period, min_periods=1).mean()

# Ratio of the 1-day average to each of the longer averages
for long_window in (7, 14, 30):
    df[f'MA_1_MA_{long_window}_Ratio'] = df['MA_1d'].div(df[f'MA_{long_window}d'])

# Show the frame including the new moving-average columns
print(df)

# Volatility features: differences between pairs of log-return columns
volatility_cols = ["High-Low", "Open-Close"]
df["High-Low"] = df["log_return_High"].sub(df["log_return_Low"])
df["Open-Close"] = df["log_return_Open"].sub(df["log_return_Close"])

# Any missing value in these two features is replaced by its column's median
column_medians = df[volatility_cols].median()
df[volatility_cols] = df[volatility_cols].fillna(column_medians)

# Show the frame including the volatility features
print(df)


# Time series decomposition (trend/seasonality/residual)
from statsmodels.tsa.seasonal import seasonal_decompose # https://www.statsmodels.org/stable/generated/statsmodels.tsa.seasonal.seasonal_decompose.html

# Multiplicative decomposition of the 'Close' price with a period of 12 observations
decomposition = seasonal_decompose(df['Close'], model='multiplicative', period=12)

# Continue on a fresh copy of the frame before attaching the components
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.copy.html
df = df.copy()

# Attach the three components as columns; gaps in the trend are interpolated.
# NOTE(review): with default settings interpolate() may leave NaNs that come
# before the first valid trend value — confirm this is acceptable downstream.
df['trend'] = decomposition.trend.interpolate()
df['seasonal'] = decomposition.seasonal
df['residual'] = decomposition.resid

# Preview the frame with the decomposed components
print(df.head())


# Keep only the rows dated from 2020-10-05 through 2021-07-06 (both ends included)
in_period = df['Date'].between('2020-10-05', '2021-07-06')
df_period = df[in_period]
print(df_period)

# Visualization
import matplotlib.pyplot as plt #https://matplotlib.org/2.0.2/users/pyplot_tutorial.html
import seaborn as sns # https://seaborn.pydata.org/tutorial/introduction.html

# Boxplot of the 'Close' prices inside the filtered period, drawn on an
# explicitly created 8x6 figure
box_fig, box_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df_period['Close'], ax=box_ax)
box_ax.set_title('Bitcoin Boxplot for Outlier Detection')
plt.show()

# Draw the four price series over the full date range in one figure.
# Each entry is (column, legend label, line colour).
price_series = [
    ('Open', 'Opening Price', 'blue'),
    ('Close', 'Closing Price', 'green'),
    ('High', 'Max Price', 'purple'),
    ('Low', 'Min Price', 'orange'),
]

plt.figure(figsize=(14, 7))
for price_col, legend_label, line_color in price_series:
    plt.plot(df['Date'], df[price_col], label=legend_label, color=line_color, alpha=0.6)

# Titles, axis labels, legend and grid
plt.title('Time Series Analysis of Bitcoin Prices')
plt.xlabel('Date')
plt.ylabel('Price (USD)')
plt.legend()
plt.grid(True)
plt.show()

# ADF (Augmented Dickey-Fuller) Test for stationarity
from statsmodels.tsa.stattools import adfuller # https://www.statsmodels.org/stable/generated/statsmodels.tsa.stattools.adfuller.html

# Run the test on the period's 'Close' log returns, ignoring missing values
result = adfuller(df_period['log_return_Close'].dropna())

# Pull out the pieces of the result that are reported below
adf_statistic, p_value, critical_values = result[0], result[1], result[4]

print(f"ADF Statistic: {adf_statistic}")
print(f"p-value: {p_value}")
print("Critical Values:")
for level, threshold in critical_values.items():
    print(f'\t{level}: {threshold}')

# Interpret the p-value against the 0.05 threshold
verdict = (
    "The series is likely stationary (p-value < 0.05)."
    if p_value < 0.05
    else "The series is likely non-stationary (p-value >= 0.05)."
)
print(verdict)


# Ljung-Box test for autocorrelation in multiple features
from statsmodels.stats.diagnostic import acorr_ljungbox # https://www.statsmodels.org/stable/generated/statsmodels.stats.diagnostic.acorr_ljungbox.html

# Series (columns of the filtered period) to test
columns = ['Close', 'Close_diff', 'log_return_Close', 'Close_log_diff']

# Test each series at lag 10, skipping missing values, and print the result table
for col in columns:
    series = df_period[col].dropna()
    lb_test = acorr_ljungbox(series, lags=[10], return_df=True)
    print(f"Ljung-Box Test for {col}:")
    print(lb_test, "\n")


# Overlay the original, differenced and log-return 'Close' series for the
# filtered period. Each entry is (column, line colour); the column name
# doubles as the legend label.
compared_series = [
    ('Close', 'blue'),
    ('Close_diff', 'green'),
    ('log_return_Close', 'red'),
]

plt.figure(figsize=(12, 6))
for series_name, line_color in compared_series:
    plt.plot(df_period['Date'], df_period[series_name], label=series_name, color=line_color)

plt.title('Close, Close_diff, and log_return_Close')
plt.xlabel('Date')
plt.ylabel('Values')
plt.legend()
plt.xticks(rotation=45)  # tilt the tick labels by 45 degrees
plt.tight_layout()
plt.show()



# Four stacked panels (4 rows, 1 column) showing the decomposition components
# for the filtered period; sharex=True makes all panels share one x-axis (Date)
fig, axes = plt.subplots(4, 1, figsize=(10, 8), sharex=True)

# One (column, panel title, colour) entry per panel, top to bottom
panels = [
    ('Close', 'Observed', 'blue'),
    ('trend', 'Trend', 'orange'),
    ('seasonal', 'Seasonal', 'green'),
    ('residual', 'Residual', 'red'),
]

# Plot each component against Date on its own axes
for panel_ax, (component, panel_title, panel_color) in zip(axes, panels):
    df_period.plot(x='Date', y=component, ax=panel_ax, title=panel_title, color=panel_color)

plt.tight_layout()
plt.show()


# Plot ACF and PACF to analyze autocorrelation and partial autocorrelation
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# https://www.statsmodels.org/stable/generated/statsmodels.graphics.tsaplots.plot_pacf.html
# https://www.statsmodels.org/dev/generated/statsmodels.graphics.tsaplots.plot_acf.html

# Two side-by-side panels: ACF on the left, PACF on the right
acf_fig, (acf_ax, pacf_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Autocorrelation of the period's 'Close' prices, up to 50 lags
plot_acf(df_period['Close'], lags=50, ax=acf_ax)
acf_ax.set_title("Bitcoin ACF Plot")

# Partial autocorrelation of the same series, up to 50 lags
plot_pacf(df_period['Close'], lags=50, ax=pacf_ax)
pacf_ax.set_title("Bitcoin PACF Plot")

plt.tight_layout()
plt.show()


