In [14]:
# !pip install yahooquery

In [None]:
import yfinance as yf
import pandas as pd
import os
from yahooquery import Screener


#### EXTRACT

In [7]:

# Function to get tickers from Yahoo Finance
def get_tickers():
    # Initialize the Yahoo Finance screener
    screener = Screener()
    
    # Attempt to get the 'most_actives' screeners with up to 1000 results
    result = screener.get_screeners('most_actives', count=200)

    # Debug: Print the structure of the result to inspect it
    print("Screener result:", result)
    
    # Check if 'most_actives' and 'quotes' keys are present in the response
    try:
        tickers = [item['symbol'] for item in result['most_actives']['quotes']]
        return tickers
    except (KeyError, TypeError):
        print("Unexpected response structure or no data returned.")
        return []

# Get tickers
tickers = get_tickers()

# Display the total number of tickers and sample output
print(f"Total tickers collected: {len(tickers)}")
print("Sample tickers:", tickers[:20])


Screener result: {'most_actives': {'id': '437465ef-980e-4d8c-a860-de7cbfbab373', 'title': 'Most Actives', 'description': 'Discover the most traded equities in the trading day.', 'canonicalName': 'MOST_ACTIVES', 'criteriaMeta': {'size': 200, 'offset': 0, 'sortField': 'dayvolume', 'sortType': 'DESC', 'quoteType': 'EQUITY', 'criteria': [{'field': 'region', 'operators': ['EQ'], 'values': [], 'labelsSelected': [53], 'dependentValues': []}, {'field': 'intradaymarketcap', 'operators': ['EQ'], 'values': [], 'labelsSelected': [1, 2, 3], 'dependentValues': []}, {'field': 'dayvolume', 'operators': ['GT'], 'values': [5000000], 'labelsSelected': [], 'dependentValues': []}], 'topOperator': 'AND'}, 'rawCriteria': '{"offset":0,"size":200,"sortField":"dayvolume","sortType":"DESC","quoteType":"EQUITY","query":{"operator":"AND","operands":[{"operator":"eq","operands":["region","us"]},{"operator":"or","operands":[{"operator":"BTWN","operands":["intradaymarketcap",2000000000,10000000000]},{"operator":"BTWN

#### TRANSFORM

In [8]:



# Dictionary to store data for each ticker
stock_data = {}

for ticker in tickers:
    try:
        # Download 3 months of data
        stock = yf.Ticker(ticker)
        hist = stock.history(period="3mo")
        
        # Proceed if data is available
        if not hist.empty:
            hist['Daily Return'] = hist['Close'].pct_change()
            hist['Volatility'] = hist['Daily Return'].rolling(window=10).std()  # 10-day rolling volatility
            hist['10_MA'] = hist['Close'].rolling(window=10).mean()
            hist['20_MA'] = hist['Close'].rolling(window=20).mean()
            hist['Momentum'] = hist['10_MA'] - hist['20_MA']  # Numeric momentum feature
            hist['Average Volume'] = hist['Volume'].rolling(window=10).mean()  # 10-day average volume
            
            # Calculate mean return and volatility over the period
            mean_return = hist['Daily Return'].mean()
            mean_volatility = hist['Volatility'].mean()
            latest_momentum = hist['Momentum'].iloc[-1]  # Latest momentum value
            
            # Classification labels based on calculated features
            growth_label = 'High Growth' if mean_return > 0.01 else 'Stable'
            volatility_label = 'High Volatility' if mean_volatility > 0.02 else 'Low Volatility'
            momentum_label = 'High Momentum' if latest_momentum > 0 else 'Low Momentum'
            
            # Save to stock_data
            stock_data[ticker] = {
                'Mean Return': mean_return,
                'Mean Volatility': mean_volatility,
                'Momentum': latest_momentum,
                'Average Volume': hist['Average Volume'].iloc[-1],  # Latest average volume
                '10_MA': hist['10_MA'].iloc[-1],
                '20_MA': hist['20_MA'].iloc[-1],
                'Growth Label': growth_label,
                'Volatility Label': volatility_label,
                'Momentum Label': momentum_label
            }
    except Exception as e:
        print(f"Error retrieving data for {ticker}: {e}")

# Convert dictionary to DataFrame
final_labeled_df = pd.DataFrame(stock_data).T.reset_index()
final_labeled_df.columns = ['Ticker', 'Mean Return', 'Mean Volatility', 'Momentum', 'Average Volume', '10_MA', '20_MA',
                            'Growth Label', 'Volatility Label', 'Momentum Label']

# Display final DataFrame
print(final_labeled_df.head())


NBIS: Period '3mo' is invalid, must be one of ['1d', 'ytd', 'max']


  Ticker Mean Return Mean Volatility  Momentum Average Volume       10_MA  \
0   NVDA    0.004333        0.033188     5.351    245254090.0     139.158   
1    NIO    0.005627        0.047785   -0.6085     59255180.0       5.311   
2    DJT    0.009823        0.065354    6.7955     57166180.0      34.066   
3   TSLA    0.002823        0.036305 -3.737499     85775920.0  232.514003   
4      F    0.000909        0.016384      0.23     47297160.0      11.077   

        20_MA Growth Label Volatility Label Momentum Label  
0     133.807       Stable  High Volatility  High Momentum  
1      5.9195       Stable  High Volatility   Low Momentum  
2     27.2705       Stable  High Volatility  High Momentum  
3  236.251502       Stable  High Volatility   Low Momentum  
4      10.847       Stable   Low Volatility  High Momentum  


In [None]:
#### LOAD

In [13]:


# Assume both `reshaped_df` and `final_labeled_df` are already defined
# Define the folder path
folder_path = './datasets/'

# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Save `final_labeled_df` to CSV
final_labeled_file_path = os.path.join(folder_path, 'final_labeled_df.csv')
final_labeled_df.to_csv(final_labeled_file_path, index=False)
print(f"final_labeled_df saved to {final_labeled_file_path}")


reshaped_df saved to ./datasets/reshaped_df.csv
final_labeled_df saved to ./datasets/final_labeled_df.csv
