In [1]:
##################################################""TASK 1""##################################################

# Import necessary libraries
import yfinance as yf
import numpy as np
import pandas as pd
from scipy.stats import zscore

# Function to fetch stock data
def fetch_stock_data(ticker, start_date, end_date):
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    return stock_data

# Define parameters
ticker_adani = "ADANIPORTS.NS"  # Ticker for Adani Ports and Special Economic Zone Ltd
start_date_adani = "2023-01-01"
end_date_adani = "2023-12-31"

# Task 1: Data Ingestion
# Fetch OHLC data for Adani Ports
data_adani = fetch_stock_data(ticker_adani, start_date_adani, end_date_adani)

# Task 2: Data Validation
# Check for missing values
missing_values_adani = data_adani.isnull().sum()

# Check for outliers using Z-score
outliers_adani = (np.abs(zscore(data_adani.select_dtypes(include=[np.number]))) > 3).all(axis=1)

# Check data types consistency
data_types_adani = data_adani.dtypes

# Task 3: Standardize Data Format
# Ensure the data is in a common format like a pandas DataFrame
data_adani.index = pd.to_datetime(data_adani.index)

# Display the results
print("Missing Values:\n", missing_values_adani)
print("\nOutliers:\n", outliers_adani)
print("\nData Types:\n", data_types_adani)


[*********************100%%**********************]  1 of 1 completed
Missing Values:
 Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

Outliers:
 Date
2023-01-02    False
2023-01-03    False
2023-01-04    False
2023-01-05    False
2023-01-06    False
              ...  
2023-12-22    False
2023-12-26    False
2023-12-27    False
2023-12-28    False
2023-12-29    False
Length: 245, dtype: bool

Data Types:
 Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object


In [2]:
##################################################""TASK 2""##################################################
# Task 2: Data Cleaning

# 1. Handle Missing Values:
# Check for missing values
missing_values_adani = data_adani.isnull().sum()

# If missing values exist, you can decide on a strategy (e.g., imputation or removal) and implement it.
# For demonstration purposes, let's fill missing values with the mean.
data_adani_cleaned = data_adani.fillna(data_adani.mean())

# 2. Detect and Correct Outliers:
# Check for outliers using Z-score
outliers_adani = (np.abs(zscore(data_adani_cleaned.select_dtypes(include=[np.number]))) > 3).all(axis=1)

# If outliers are present, decide on a method to correct them (e.g., trimming or winsorizing).
# For demonstration purposes, we'll replace outliers with the median in each column.
for column in data_adani_cleaned.columns:
    median_value = data_adani_cleaned[column].median()
    data_adani_cleaned.loc[outliers_adani, column] = median_value


# Timestamps and Date Formats:
# Check the format of the date/timestamps
print("Date format:", data_adani_cleaned.index.inferred_freq)

# If formats are inconsistent, standardize the date format
data_adani_cleaned.index = pd.to_datetime(data_adani_cleaned.index)

# Display the cleaned data
print("\nCleaned Data:\n", data_adani_cleaned.head())


Date format: None

Cleaned Data:
                   Open        High         Low       Close   Adj Close  \
Date                                                                     
2023-01-02  823.000000  826.750000  816.299988  822.299988  816.808838   
2023-01-03  822.250000  826.400024  817.799988  820.450012  814.971191   
2023-01-04  820.799988  822.000000  806.500000  810.000000  804.591003   
2023-01-05  814.049988  821.599976  797.000000  819.599976  814.126831   
2023-01-06  819.900024  824.400024  803.500000  806.099976  800.716980   

             Volume  
Date                 
2023-01-02  2042294  
2023-01-03  2166531  
2023-01-04  3260112  
2023-01-05  3119740  
2023-01-06  2892006  


In [16]:
##################################################""TASK 3""##################################################

# Task 3: Data Transformation

# 1. Calculate Technical Indicators
# Example: Simple Moving Average (SMA)
data_adani_cleaned['SMA_20'] = data_adani_cleaned['Close'].rolling(window=20).mean()

# Example: Bollinger Bands
window = 20
data_adani_cleaned['Rolling_mean'] = data_adani_cleaned['Close'].rolling(window=window).mean()
data_adani_cleaned['Upper_band'] = data_adani_cleaned['Rolling_mean'] + 2 * data_adani_cleaned['Close'].rolling(window=window).std()
data_adani_cleaned['Lower_band'] = data_adani_cleaned['Rolling_mean'] - 2 * data_adani_cleaned['Close'].rolling(window=window).std()

# Example: Relative Strength Index (RSI)
delta = data_adani_cleaned['Close'].diff()
gain = delta.where(delta > 0, 0)
loss = -delta.where(delta < 0, 0)

average_gain = gain.rolling(window=14).mean()
average_loss = loss.rolling(window=14).mean()

rs = average_gain / average_loss
data_adani_cleaned['RSI'] = 100 - (100 / (1 + rs))

# Drop rows with NaN values introduced by rolling windows
data_adani_cleaned.dropna(inplace=True)

# 2. Apply Feature Engineering
# Example: Volatility Measure (Standard Deviation)
data_adani_cleaned['Volatility'] = data_adani_cleaned['Close'].pct_change().rolling(window=20, min_periods=1).std().fillna(0)
data_adani_cleaned['Volatility'].fillna(0, inplace=True)


# Example: Price Pattern (Upward or Downward Movement)
data_adani_cleaned['Price_Pattern'] = np.where(data_adani_cleaned['Close'] > data_adani_cleaned['Open'], 'Upward', 'Downward')

# 3. Resample the Data
# Example: Resample to hourly frequency
hourly_data = data_adani_cleaned.resample('H').ffill()

# Display the transformed data
print("\nTransformed Data:\n", data_adani_cleaned.head())



Transformed Data:
                   Open        High         Low       Close   Adj Close  \
Date                                                                     
2023-06-20  738.000000  742.000000  730.200012  737.799988  732.873108   
2023-06-21  739.900024  753.250000  733.299988  749.450012  744.445312   
2023-06-22  755.000000  758.000000  741.000000  745.599976  740.621033   
2023-06-23  741.000000  741.000000  703.000000  714.299988  709.530029   
2023-06-26  710.000000  726.299988  705.099976  724.400024  719.562622   

              Volume      SMA_20  Rolling_mean  Upper_band  Lower_band  \
Date                                                                     
2023-06-20   6753358  736.379996    736.379996  750.245802  722.514189   
2023-06-21   9312613  737.939996    737.939996  750.137445  725.742547   
2023-06-22  10061086  738.994995    738.994995  749.876904  728.113086   
2023-06-23  15398550  738.367496    738.367496  752.999418  723.735573   
2023-06-26   5696

In [17]:
##################################################""TASK 4""##################################################

import pytest

# Example SMA calculation
window_size = 20
data_adani_cleaned['SMA_20'] = data_adani_cleaned['Close'].rolling(window=window_size).mean()

# Define first_row_sma_expected after calculating SMA
first_row_sma_expected = data_adani_cleaned['SMA_20'].iloc[window_size - 1]

class TestDataValidation(unittest.TestCase):
    def setUp(self):
        # Initialize data or load it from a test file
        pass

    def test_data_integrity(self):
        # Print relevant information for debugging
        print("Length of cleaned data:", len(data_adani_cleaned))
        print("Length of original data:", len(data_adani))

        # Check if the length of cleaned data matches original data
        self.assertEqual(len(data_adani_cleaned), len(data_adani))

    def test_calculation_correctness(self):
        # Calculate the actual SMA for the first row
        first_row_sma_actual = data_adani_cleaned['SMA_20'].iloc[window_size]

        # Print relevant information for debugging
        print("Expected SMA:", first_row_sma_expected)
        print("Actual SMA:", first_row_sma_actual)

        # Define a tolerance value (adjust as needed)
        tolerance = 1e-5
        self.assertAlmostEqual(first_row_sma_actual, first_row_sma_expected, delta=tolerance)

    def test_resampling(self):
        # Define an expected length for resampled data
        expected_length = 100  # Adjust this value based on your expectations

        # Print relevant information for debugging
        print("Length of resampled data:", len(hourly_data))
        print("Expected length:", expected_length)

        # Check if resampled data has the expected length and format
        self.assertEqual(len(hourly_data), expected_length)

        # Update the set of expected columns based on your transformation
        expected_columns = {'Open', 'High', 'Low', 'Close', 'SMA_20', 'Volatility', 'Price_Pattern'}
        self.assertSetEqual(set(hourly_data.columns), expected_columns)

# Create a test suite
test_suite = unittest.TestLoader().loadTestsFromTestCase(TestDataValidation)

# Run the test suite
unittest.TextTestRunner().run(test_suite)


FFF
FAIL: test_calculation_correctness (__main__.TestDataValidation)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/var/folders/d1/0sq30c3568xgm1scfrbs_jdr0000gn/T/ipykernel_1483/3838290656.py", line 35, in test_calculation_correctness
    self.assertAlmostEqual(first_row_sma_actual, first_row_sma_expected, delta=tolerance)
AssertionError: 731.9225036621094 != 732.2425018310547 within 1e-05 delta (0.31999816894528976 difference)

FAIL: test_data_integrity (__main__.TestDataValidation)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/var/folders/d1/0sq30c3568xgm1scfrbs_jdr0000gn/T/ipykernel_1483/3838290656.py", line 23, in test_data_integrity
    self.assertEqual(len(data_adani_cleaned), len(data_adani))
AssertionError: 131 != 245

FAIL: test_resampling (__main__.TestDataValidation)
----------------------------------------------------------------------
Tr

Expected SMA: 732.2425018310547
Actual SMA: 731.9225036621094
Length of cleaned data: 131
Length of original data: 245
Length of resampled data: 4609
Expected length: 100


<unittest.runner.TextTestResult run=3 errors=0 failures=3>