In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# data folders referenced by later cells (under /content/drive/MyDrive) are readable.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Root of the project on the mounted Drive; every dataset folder lives beneath it.
base_path = '/content/drive/MyDrive/OMSA Practicum'

# Dataset sub-folders, derived from the root so only base_path needs editing.
sp500_path, multicap_news_path = (
    os.path.join(base_path, 'SP500'),
    os.path.join(base_path, 'MultiCap_News'),
)


In [None]:
import pandas as pd

def read_files_from_folder(folder_path, encoding='utf-8'):
    """Load every ``.csv`` and ``.txt`` file found directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).
        encoding: Text encoding used to decode the files. Defaults to UTF-8 so
            the result does not depend on the machine's locale.

    Returns:
        dict mapping each file name to its content: a ``pandas.DataFrame`` for
        ``.csv`` files and the raw text (``str``) for ``.txt`` files. Entries
        are inserted in sorted file-name order so repeated runs are
        reproducible. Files with other extensions and sub-directories are
        skipped.

    Raises:
        OSError: If ``folder_path`` cannot be listed (e.g. it does not exist).
    """
    files_data = {}
    # os.listdir returns entries in arbitrary order; sort for deterministic output.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # A directory that happens to be named like "x.txt" must not be opened as a file.
        if not os.path.isfile(file_path):
            continue
        if filename.endswith('.csv'):
            files_data[filename] = pd.read_csv(file_path, encoding=encoding)
        elif filename.endswith('.txt'):
            with open(file_path, 'r', encoding=encoding) as file:
                files_data[filename] = file.read()
    return files_data

# Load every supported (.csv / .txt) file from the two dataset folders.
sp500_data = read_files_from_folder(sp500_path)
multicap_news_data = read_files_from_folder(multicap_news_path)

# Show which files were picked up for each dataset. An empty result is called
# out explicitly so a wrong path or unexpected file extension is not mistaken
# for a successful load.
for label, loaded in (("SP500", sp500_data), ("MultiCap News", multicap_news_data)):
    print(f"{label} Data Keys:", loaded.keys())
    if not loaded:
        print(f"WARNING: no .csv/.txt files were loaded for {label}; "
              "check the folder path and the file extensions it contains.")


SP500 Data Keys: dict_keys(['price.csv', 'price_daily.csv', 'price_SP500.csv', 'company_info_sp500.txt', 'sp500_item1_sec_filings_0.txt', 'sp500_item1a_sec_filings_0.txt', 'sp500_item7_sec_filings_0.txt', 'volume.csv'])
MultiCap News Data Keys: dict_keys([])


In [None]:
import pandas as pd
import io

# Registry of every DataFrame produced by the processing helpers, keyed by a
# file name or by "<file name>_chunk_<n>" for chunked files.
processed_dataframes = {}


def handle_existing_dataframe(df, file_name):
    """Register a DataFrame that was already parsed when the files were read.

    Args:
        df: The preloaded DataFrame.
        file_name: Key under which ``df`` is stored in ``processed_dataframes``.

    Returns:
        None. The frame is stored in the module-level registry and a one-line
        confirmation with its shape is printed.
    """
    processed_dataframes.update({file_name: df})
    frame_shape = df.shape
    print(f"Stored preloaded DataFrame '{file_name}' with shape {frame_shape}")

def process_csv_in_chunks(file_content, file_name, chunksize=10000):
    """Parse raw CSV text in fixed-size row chunks and register each chunk.

    Args:
        file_content: The CSV data as a single string.
        file_name: Base name; chunk ``n`` (1-based) is stored in
            ``processed_dataframes`` under ``"<file_name>_chunk_<n>"``.
        chunksize: Maximum number of rows per chunk.

    Returns:
        None. Any exception raised while parsing or storing is caught and
        reported with a printed message; chunks stored before the failure
        remain in the registry.
    """
    try:
        reader = pd.read_csv(io.StringIO(file_content), chunksize=chunksize)
        # Number chunks from 1 so the stored names match the printed progress.
        for chunk_no, chunk in enumerate(reader, start=1):
            df_name = f"{file_name}_chunk_{chunk_no}"
            processed_dataframes[df_name] = chunk
            print(f"Stored chunk {chunk_no} of {file_name} as '{df_name}' with shape {chunk.shape}")
    except Exception as e:
        print(f"Error processing CSV in chunks for {file_name}: {e}")

def process_txt_to_dataframe(content, file_name, delimiter=None, chunksize=5000):
    """Turn the text of a TXT file into one or more registered DataFrames.

    Args:
        content: Full text of the file.
        file_name: Base key used in ``processed_dataframes``.
        delimiter: Field separator. When truthy the text is parsed as a
            delimited table in chunks; otherwise the text is treated as
            unstructured and stored line by line.
        chunksize: Maximum rows per chunk for the delimited case.

    Returns:
        None. Delimited input is stored as ``"<file_name>_chunk_<n>"`` entries
        (1-based); unstructured input is stored under ``file_name`` as a
        single-column frame named ``line``. Any exception is caught and
        reported with a printed message.
    """
    try:
        if not delimiter:
            # Unstructured text: keep one row per line for later inspection.
            lines = content.splitlines()
            processed_dataframes[file_name] = pd.DataFrame({'line': lines})
            print(f"Stored TXT file '{file_name}' as DataFrame with {len(lines)} lines")
            return

        # Structured text: stream it through the CSV parser chunk by chunk.
        reader = pd.read_csv(io.StringIO(content), delimiter=delimiter, chunksize=chunksize)
        for chunk_no, chunk in enumerate(reader, start=1):
            df_name = f"{file_name}_chunk_{chunk_no}"
            processed_dataframes[df_name] = chunk
            print(f"Stored chunk {chunk_no} of TXT file {file_name} as '{df_name}' with shape {chunk.shape}")
    except Exception as e:
        print(f"Error processing TXT file {file_name}: {e}")

# Route every loaded SP500 entry to the handler that matches its content.
for key, content in sp500_data.items():
    print(f"\nExploring: {key}\n" + "-" * 50)

    # Content that is already a DataFrame is registered as-is.
    if isinstance(content, pd.DataFrame):
        handle_existing_dataframe(content, key)
        continue

    # CSV content that is still raw text is parsed chunk by chunk.
    if key.endswith('.csv'):
        print(f"Chunked processing for {key}:")
        process_csv_in_chunks(content, key, chunksize=5000)
        continue

    if key.endswith('.txt'):
        print(f"Processing TXT file {key}:")
        # NOTE(review): the same '|' delimiter is applied to every .txt file.
        # The recorded run shows company_info_sp500.txt parsing to 0 rows x 3034
        # columns with it, so that file probably needs different handling —
        # confirm its actual format.
        delimiter = '|'
        process_txt_to_dataframe(content, key, delimiter=delimiter, chunksize=5000)
        continue

    print(f"Unsupported file format: {key}")

# Summarise everything that ended up in the registry.
print("\nStored DataFrames:")
for name, data in processed_dataframes.items():
    if not isinstance(data, pd.DataFrame):
        continue
    print(f"DataFrame '{name}' with shape {data.shape}")



Exploring: price.csv
--------------------------------------------------
Stored preloaded DataFrame 'price.csv' with shape (1214, 926)

Exploring: price_daily.csv
--------------------------------------------------
Stored preloaded DataFrame 'price_daily.csv' with shape (5857, 927)

Exploring: price_SP500.csv
--------------------------------------------------
Stored preloaded DataFrame 'price_SP500.csv' with shape (1214, 3)

Exploring: company_info_sp500.txt
--------------------------------------------------
Processing TXT file company_info_sp500.txt:
Stored chunk 1 of TXT file company_info_sp500.txt as 'company_info_sp500.txt_chunk_1' with shape (0, 3034)

Exploring: sp500_item1_sec_filings_0.txt
--------------------------------------------------
Processing TXT file sp500_item1_sec_filings_0.txt:
Stored chunk 1 of TXT file sp500_item1_sec_filings_0.txt as 'sp500_item1_sec_filings_0.txt_chunk_1' with shape (5000, 6)
Stored chunk 2 of TXT file sp500_item1_sec_filings_0.txt as 'sp500_item

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def perform_eda(df, name):
    """Print a text-only exploratory summary of a single DataFrame.

    The report covers, in order: shape, column names, dtypes, per-column
    missing-value counts, number of duplicate rows, descriptive statistics
    (only when the frame has at most 50 columns), deep memory usage in MB,
    and the first and last five rows.

    Args:
        df: DataFrame to summarise.
        name: Label shown in the report header.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"\nEDA for '{name}'\n" + "-" * 50)

    # Structure
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}\n")
    print(f"Data Types:\n{df.dtypes}\n")

    # Data quality
    null_counts = df.isna().sum()
    print(f"Missing values:\n{null_counts}\n")

    duplicate_count = df.duplicated().sum()
    print(f"Number of duplicate rows: {duplicate_count}\n")

    # describe() output is unreadable for very wide frames, so skip it there.
    if len(df.columns) <= 50:
        summary = df.describe(include='all')
        print(f"Descriptive Statistics:\n{summary}\n")

    # deep=True also counts the memory held by object (string) values.
    total_bytes = df.memory_usage(deep=True).sum()
    memory_usage = total_bytes / (1024 ** 2)
    print(f"Memory Usage: {memory_usage:.2f} MB\n")

    # NOTE: a correlation heatmap for numeric columns was sketched here
    # previously but is disabled; this function produces no plots.

    # Sample rows from both ends of the frame.
    for position, take_rows in (("First", df.head), ("Last", df.tail)):
        print(f"{position} 5 rows:\n{take_rows()}\n")

# List every DataFrame currently held in the registry.
print("\nStored DataFrames:")
for name, data in processed_dataframes.items():
    if not isinstance(data, pd.DataFrame):
        continue
    print(f"DataFrame '{name}' with shape {data.shape}")

# Key of the DataFrame to analyse; change this to inspect a different one.
selected_dataframe_name = 'company_info_sp500.txt_chunk_1'

# Run the EDA only when the key exists and the frame actually has rows.
if selected_dataframe_name not in processed_dataframes:
    print(f"DataFrame '{selected_dataframe_name}' not found in processed data.")
else:
    selected_dataframe = processed_dataframes[selected_dataframe_name]
    if selected_dataframe.empty:
        print(f"The selected DataFrame '{selected_dataframe_name}' is empty.")
    else:
        perform_eda(selected_dataframe, selected_dataframe_name)



Stored DataFrames:
DataFrame 'price.csv' with shape (1214, 926)
DataFrame 'price_daily.csv' with shape (5857, 927)
DataFrame 'price_SP500.csv' with shape (1214, 3)
DataFrame 'company_info_sp500.txt_chunk_1' with shape (0, 3034)
DataFrame 'sp500_item1_sec_filings_0.txt_chunk_1' with shape (5000, 6)
DataFrame 'sp500_item1_sec_filings_0.txt_chunk_2' with shape (5000, 6)
DataFrame 'sp500_item1_sec_filings_0.txt_chunk_3' with shape (4996, 6)
DataFrame 'sp500_item1a_sec_filings_0.txt_chunk_1' with shape (5000, 6)
DataFrame 'sp500_item1a_sec_filings_0.txt_chunk_2' with shape (5000, 6)
DataFrame 'sp500_item1a_sec_filings_0.txt_chunk_3' with shape (4996, 6)
DataFrame 'sp500_item7_sec_filings_0.txt_chunk_1' with shape (5000, 6)
DataFrame 'sp500_item7_sec_filings_0.txt_chunk_2' with shape (5000, 6)
DataFrame 'sp500_item7_sec_filings_0.txt_chunk_3' with shape (4996, 6)
DataFrame 'volume.csv' with shape (1214, 3934)
The selected DataFrame 'company_info_sp500.txt_chunk_1' is empty.


## **SP500 Datasets Overview**


---

### **Overview of `price.csv`**

#### **Structure**:
- **Rows**: 1,214  
- **Columns**: 926  
- **Key Columns**:
  - `Date`: Contains date information.
  - Company-specific columns (by CIK or ticker): Represent various financial metrics, primarily stock prices or related values.

#### **Key Characteristics**:
- **Data Types**:
  - `Date`: `object`
  - Other columns: `float64`
- **Missing Data**:
  - Many columns have a significant number of missing values, e.g., column `1534701` has 640 and the column literally labelled `null` has 314.
  - No missing values in the `Date` column.
- **Duplicates**:
  - No duplicate rows were detected.
- **Memory Usage**:
  - 8.65 MB.

#### **First and Last Rows**:
- Example from **first row** (2000-01-07):
  - `1534701`: `NaN`
  - `1341439`: `25.84375`
  - `null`: `50.07373`
  - `792985`: `17.8125`
- Example from **last row** (2023-04-06):
  - `1534701`: `102.84`
  - `1341439`: `95.92`
  - `null`: `NaN`
  - `792985`: `NaN`

#### **Insights**:
- The dataset spans a long time period (2000–2023); 1,214 rows over that span means roughly one observation per week.
- The data contains stock prices (or related metrics) for multiple companies, indexed by their identifiers.
- Missing values might indicate periods of inactivity, non-trading days, or missing data points for specific companies.

#### **Use**:
- **Trend Analysis**:
  - Track historical stock price trends for individual companies.
- **Market-Wide Insights**:
  - Compare stock performance across companies over time.
- **Event Impact Studies**:
  - Assess the impact of significant financial or global events on stock prices.

#### **Challenges**:
1. **High Dimensionality**:
   - 926 columns make direct analysis and visualization difficult.
2. **Missing Values**:
   - Need to handle missing values effectively for accurate analysis.
3. **Time Period Coverage**:
   - Ensure that analysis considers consistent date ranges for comparisons.

---

### **Overview of `sp500_item1_sec_filings_0.txt`**

#### **1. Structure**
- **Shape**:
  - **Chunk 1**: (5000, 6)
  - **Chunk 2**: (5000, 6)
  - **Chunk 3**: (4996, 6)
  - **Total Rows**: 14,996
  - **Columns**: ['company', 'date', 'link', 'type', 'cik', 'item1']

#### **2. Data Characteristics**
- **Data Types**:
  - `company`: Object (String)
  - `date`: Object (Datetime-like String)
  - `link`: Object (String, URLs to SEC filings)
  - `type`: Object (String, e.g., "10-K")
  - `cik`: Integer (Central Index Key for companies)
  - `item1`: Object (String, Item 1 description text)

#### **3. Missing Values**
- **`item1` Field**: Some missing values across all chunks:
  - **Chunk 1**: 99 missing rows
  - **Chunk 2**: 95 missing rows
  - **Chunk 3**: 80 missing rows
  - **Total Missing Rows**: 274 (approx. 1.83% of total rows)

#### **4. Unique Values**
- **`company`**:
  - Unique companies per chunk: 445 (Chunk 1), 444 (Chunk 2), 430 (Chunk 3). These are per-chunk counts, not a total across the whole file.
- **`date`**:
  - Unique dates per chunk: 4774 (Chunk 1), 4754 (Chunk 2), 4734 (Chunk 3).
- **`link`**:
  - Total unique links: Nearly unique across all chunks.
- **`type`**:
  - Consistently "10-K" across all chunks.

#### **5. Memory Usage**
- **Chunk 1**: 245.74 MB
- **Chunk 2**: 245.16 MB
- **Chunk 3**: 227.96 MB
- **Total Memory Usage**: ~718.86 MB

#### **6. Descriptive Statistics**
- **Most Frequent Company**:
  - Chunk 1: BERKSHIRE HATHAWAY INC (23 occurrences)
  - Chunk 2: NUCOR CORP (23 occurrences)
  - Chunk 3: O'REILLY AUTOMOTIVE INC (23 occurrences)
- **Most Frequent Value in `item1`**:
  - "ITEM 1. BUSINESS" is the most common text in every chunk.

#### **7. Key Observations**
- **Consistency**: Data is clean with no duplicate rows and consistent column names across chunks.
- **Use**:
  - Analyzing trends in Item 1 descriptions.
  - Linking company filings to SEC events.
  - Exploring historical changes in business descriptions for individual companies.

---

### **Overview of `volume.csv`**

- **Structure**:
  - `Date`: Represents the trading dates.
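  - Company-specific columns (by CIK or ticker): Each column holds the trading volume, in shares, for a specific company or security. With 1,214 rows between 2000 and 2023 the observations are roughly weekly, so confirm whether each value is a weekly total or a single day's volume.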
  - **Total Columns**: 3,934 including `Date`.
  - **Total Rows**: 1,214

- **Key Characteristics**:
  - **Missing Values**: Vary significantly across columns; some have complete data while others have substantial gaps.
  - **No Duplicates**: The dataset does not contain duplicate rows.
  - **Memory Usage**: 36.51 MB, indicating a substantial dataset.
  - **Numerical Data**: Includes the trading volumes, which are numerical (float64).
  - **Date Range**: Extends from 2000-01-07 to 2023-04-06.

- **Use**:
  - **Market Activity Analysis**: Evaluate liquidity and activity levels across different securities over time.
  - **Sector/Company-Specific Trends**: Identify patterns in trading volume for specific companies or industries.
  - **Volatility Assessment**: Use trading volume as a proxy for market sentiment or volatility spikes.
  - **Event Analysis**: Correlate significant market events with changes in trading volume for specific companies or the entire market.

- **Potential Challenges**:
  - **High Dimensionality**: With 3,934 columns, visualization and analysis might require dimensionality reduction techniques or filtering for significant columns.
  - **Missing Data**: Requires imputation or handling strategies, especially for columns with substantial gaps.
  - **Performance**: Computational resource needs could escalate due to the dataset's size.

---

### **Overview of `price_daily.csv`**

#### **1. Structure**
- **Shape**: (5857, 927)
  - Rows represent daily observations.
  - Columns include `Date`, company-specific metrics identified by unique IDs (CIKs or tickers), and S&P500 indices (`SP500CapWeighted` and `SP500EqualWeighted`).

#### **2. Data Characteristics**
- **Data Types**:
  - `Date`: Object (String format, should be parsed as datetime for analysis).
  - Numerical Data: Float64, representing daily stock prices or metrics for various companies and indices.
  - 927 columns in total.

#### **3. Missing Values**
- **Summary**:
  - Missing data is prevalent across many company-specific columns, reflecting data gaps or inactive periods.
  - Notable columns with no missing data:
    - `Date`
    - `SP500CapWeighted`
    - `SP500EqualWeighted`
    - Some major company-specific IDs.
  - Other columns have substantial missing values, such as:
    - `1534701`: 3,088 missing rows.
    - `792985`: 2,320 missing rows.
    - Total missing data varies widely by column.

#### **4. Key Columns**
- **Indices**:
  - `SP500CapWeighted`: S&P 500 Capitalization-Weighted (market-cap-weighted) Index.
  - `SP500EqualWeighted`: S&P 500 Equal-Weighted Index.
- **Date**: Identifies daily observations.
- **Company-Specific Columns**:
  - Columns represent stock prices or related metrics tied to company identifiers (CIKs).

#### **5. Memory Usage**
- Approximately 41.75 MB.

#### **6. Descriptive Statistics**
- **Numerical Summary**:
  - Columns have varying ranges, depending on the company and type of metric.
  - Many metrics have a mean and standard deviation consistent with financial data.
  - Example:
    - `SP500CapWeighted`:
      - Mean: ~1899.35
      - Std Dev: ~987.55
      - Min: ~676.53
      - Max: ~4796.56
    - `SP500EqualWeighted`:
      - Mean: ~4427.23
      - Std Dev: ~3058.53
      - Min: ~1056.15
      - Max: ~12522.94

#### **7. First and Last Observations**
- **First Date**: January 3, 2000.
- **Last Date**: April 13, 2023.
- Data spans over two decades of daily observations, providing a rich dataset for longitudinal analysis.

#### **8. Key Observations**
- **Consistency**: No duplicate rows.
- **Use**:
  - **Market Analysis**:
    - Track daily market performance using S&P 500 indices.
  - **Company Performance**:
    - Analyze stock trends for individual companies.
  - **Volatility and Risk Assessment**:
    - Evaluate daily fluctuations in stock prices.
  - **Economic Event Correlation**:
    - Link market changes to macroeconomic or company-specific events.