In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Load and prepare the datasets
def load_data(financial_data_path, stock_data_path):
    # Load the financial metrics data
    financial_df = pd.read_csv(financial_data_path)
    
    # Load the historical stock price data
    stock_df = pd.read_csv(stock_data_path)
    
    return financial_df, stock_df

def preprocess_data(financial_df, stock_df):
    # Reshape financial data from wide to long format for easier processing
    financial_long = pd.melt(
        financial_df, 
        id_vars=['Company_name', 'Parameters'], 
        var_name='Year', 
        value_name='Value'
    )
    financial_long['Year'] = financial_long['Year'].astype(int)
    
    # Create a pivot table to have parameters as columns
    financial_pivot = financial_long.pivot_table(
        index=['Company_name', 'Year'], 
        columns='Parameters', 
        values='Value'
    ).reset_index()
    
    # Ensure all column names are strings
    financial_pivot.columns = [str(col) for col in financial_pivot.columns]
    
    # Rename stock repurchase column to avoid confusion
    stock_df = stock_df.rename(columns={'repurchase of common stock': 'historical_stock_repurchase'})
    
    return financial_pivot, stock_df

def train_models(financial_pivot, stock_df):
    # Dictionary to store models for each company
    company_models = {}
    company_scalers_X = {}
    company_scalers_y = {}
    
    # List of companies
    companies = financial_pivot['Company_name'].unique()
    
    for company in companies:
        # Filter data for this company
        company_financial = financial_pivot[financial_pivot['Company_name'] == company]
        company_stock = stock_df[stock_df['Company_name'] == company]
        
        # Merge the datasets on company name and year for historical period (2019-2023)
        merged_df = pd.merge(
            company_financial, 
            company_stock,
            on=['Company_name', 'Year'], 
            how='inner'
        )
        
        if len(merged_df) >= 3:  # Need at least 3 data points for meaningful regression
            # Extract features (X) and target (y)
            X = merged_df.drop(['Company_name', 'Year', 'historical_stock_repurchase'], axis=1)
            y = merged_df['historical_stock_repurchase']
            
            # Standardize the data
            scaler_X = StandardScaler()
            scaler_y = StandardScaler()
            
            X_scaled = scaler_X.fit_transform(X)
            y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()
            
            # Train a linear regression model
            model = LinearRegression()
            model.fit(X_scaled, y_scaled)
            
            # Store the model and scalers
            company_models[company] = model
            company_scalers_X[company] = scaler_X
            company_scalers_y[company] = scaler_y
    
    return company_models, company_scalers_X, company_scalers_y

def predict_stock_prices(financial_pivot, company_models, company_scalers_X, company_scalers_y, start_year, end_year):
    # Dictionary to store predictions
    predictions = {}
    
    # List of companies
    companies = financial_pivot['Company_name'].unique()
    
    for company in companies:
        if company not in company_models:
            continue
            
        # Filter data for this company and the specified years
        company_data = financial_pivot[
            (financial_pivot['Company_name'] == company) & 
            (financial_pivot['Year'] >= start_year) & 
            (financial_pivot['Year'] <= end_year)
        ]
        
        if len(company_data) > 0:
            # Extract features
            X_pred = company_data.drop(['Company_name', 'Year'], axis=1)
            
            # Standardize using the same scaler used for training
            X_pred_scaled = company_scalers_X[company].transform(X_pred)
            
            # Make predictions
            y_pred_scaled = company_models[company].predict(X_pred_scaled)
            
            # Inverse transform to get the actual stock price predictions
            y_pred = company_scalers_y[company].inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
            
            # Store predictions along with years
            years = company_data['Year'].values
            predictions[company] = {year: pred for year, pred in zip(years, y_pred)}
    
    return predictions

def format_predictions(predictions, start_year, end_year):
    # Create a list to store formatted results
    results = []
    
    # Years range
    years = list(range(start_year, end_year + 1))
    
    # Add data for each company
    for company, company_predictions in predictions.items():
        row = {'Company_name': company}
        for year in years:
            if year in company_predictions:
                row[str(year)] = round(company_predictions[year], 2)
            else:
                row[str(year)] = None
        results.append(row)
    
    # Convert to DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

def plot_predictions(predictions, start_year, end_year):
    # Create a figure for plotting
    plt.figure(figsize=(12, 8))
    
    # Years range
    years = list(range(start_year, end_year + 1))
    
    # Plot predictions for each company
    for company, company_predictions in predictions.items():
        company_years = [year for year in years if year in company_predictions]
        company_values = [company_predictions[year] for year in company_years]
        plt.plot(company_years, company_values, marker='o', label=company)
    
    plt.title('Stock Price Predictions')
    plt.xlabel('Year')
    plt.ylabel('Stock Price')
    plt.legend()
    plt.grid(True)
    
    return plt

def predict_stock_for_timeframe(financial_data_path, stock_data_path, start_year, end_year):
    """
    Main function to predict stock prices for the specified timeframe
    
    Parameters:
    financial_data_path (str): Path to the financial metrics CSV file
    stock_data_path (str): Path to the historical stock price CSV file
    start_year (int): Start year for predictions (should be >= 2024)
    end_year (int): End year for predictions
    
    Returns:
    DataFrame: Predicted stock prices for each company and year
    """
    # Validate input years
    if start_year < 2024:
        raise ValueError("Start year must be 2024 or later as 2023 is already in historical data")
    if end_year < start_year:
        raise ValueError("End year must be greater than or equal to start year")
    if end_year > 2033:
        raise ValueError("End year cannot exceed 2033 based on available future financial projections")
    
    # Load and preprocess data
    financial_df, stock_df = load_data(financial_data_path, stock_data_path)
    financial_pivot, stock_df = preprocess_data(financial_df, stock_df)
    
    # Train models
    company_models, company_scalers_X, company_scalers_y = train_models(financial_pivot, stock_df)
    
    # Make predictions
    predictions = predict_stock_prices(
        financial_pivot, company_models, company_scalers_X, company_scalers_y, start_year, end_year
    )
    
    # Format predictions
    results_df = format_predictions(predictions, start_year, end_year)
    
    return results_df

# Example usage:
if __name__ == "__main__":
    # Paths to your CSV files
    financial_data_path = "merged.csv"
    stock_data_path = "stock_historical.csv"
    
    # Define start and end years for prediction
    start_year = 2024
    end_year = 2030
    
    # Get predictions
    try:
        predictions_df = predict_stock_for_timeframe(
            financial_data_path, stock_data_path, start_year, end_year
        )
        
        # Display the predictions
        print("\nStock Price Predictions:")
        print(predictions_df)
        
        # You can also save to CSV
        predictions_df.to_csv("stock_predictions.csv", index=False)
        print("\nPredictions saved to 'stock_predictions.csv'")
        
        # Optional: Load and preprocess data for visualization
        financial_df, stock_df = load_data(financial_data_path, stock_data_path)
        financial_pivot, stock_df = preprocess_data(financial_df, stock_df)
        
        # Train models
        company_models, company_scalers_X, company_scalers_y = train_models(financial_pivot, stock_df)
        
        # Get prediction data in dictionary format for plotting
        predictions = predict_stock_prices(
            financial_pivot, company_models, company_scalers_X, company_scalers_y, start_year, end_year
        )
        
        # Plot predictions
        plt = plot_predictions(predictions, start_year, end_year)
        plt.savefig("stock_predictions.png")
        print("Prediction plot saved as 'stock_predictions.png'")
        plt.show()
        
    except ValueError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

An unexpected error occurred: 'Year'


In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


# Load the datasets
def load_data(financial_data_path, stock_data_path):
    financial_df = pd.read_csv(financial_data_path)
    stock_df = pd.read_csv(stock_data_path)
    return financial_df, stock_df


# Preprocess the data
def preprocess_data(financial_df, stock_df):
    # Reshape financial data
    financial_long = pd.melt(
        financial_df,
        id_vars=["Company_name"],
        var_name="Year",
        value_name="Value"
    )

    # Filter out rows where "Year" contains non-numeric values
    financial_long = financial_long[financial_long["Year"].str.isdigit()]

    # Convert "Year" to integers
    financial_long["Year"] = financial_long["Year"].astype(int)

    # Pivot to get financial parameters as columns
    financial_pivot = financial_long.pivot_table(
        index=["Company_name", "Year"],
        columns="Variable",
        values="Value"
    ).reset_index()

    # Fill missing values
    financial_pivot = financial_pivot.fillna(financial_pivot.mean(numeric_only=True))

    # Process stock data
    stock_pivot = stock_df.rename(columns={"repurchase of common stock": "Stock_price"})

    return financial_pivot, stock_pivot


# Train models for each company
def train_models(financial_pivot, stock_pivot):
    company_models = {}
    company_scalers_X = {}
    company_scalers_y = {}

    companies = financial_pivot["Company_name"].unique()

    for company in companies:
        company_financial = financial_pivot[financial_pivot["Company_name"] == company]
        company_stock = stock_pivot[stock_pivot["Company_name"] == company]

        merged_df = pd.merge(company_financial, company_stock, on=["Company_name", "Year"], how="inner")

        if len(merged_df) >= 3:  # Need at least 3 data points for regression
            X = merged_df.drop(["Company_name", "Year", "Stock_price"], axis=1)
            y = merged_df["Stock_price"]

            scaler_X = StandardScaler()
            scaler_y = StandardScaler()

            X_scaled = scaler_X.fit_transform(X)
            y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

            model = LinearRegression()
            model.fit(X_scaled, y_scaled)

            company_models[company] = model
            company_scalers_X[company] = scaler_X
            company_scalers_y[company] = scaler_y

    return company_models, company_scalers_X, company_scalers_y


# Predict stock prices
def predict_stock_prices(financial_pivot, company_models, company_scalers_X, company_scalers_y, start_year, end_year):
    predictions = {}
    companies = financial_pivot["Company_name"].unique()

    for company in companies:
        if company not in company_models:
            continue

        company_data = financial_pivot[
            (financial_pivot["Company_name"] == company) &
            (financial_pivot["Year"] >= start_year) &
            (financial_pivot["Year"] <= end_year)
        ]

        if len(company_data) > 0:
            X_pred = company_data.drop(["Company_name", "Year"], axis=1)
            X_pred_scaled = company_scalers_X[company].transform(X_pred)

            y_pred_scaled = company_models[company].predict(X_pred_scaled)
            y_pred = company_scalers_y[company].inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

            years = company_data["Year"].values
            predictions[company] = {year: pred for year, pred in zip(years, y_pred)}

    return predictions


# Format predictions
def format_predictions(predictions, start_year, end_year):
    results = []
    years = list(range(start_year, end_year + 1))

    for company, company_predictions in predictions.items():
        row = {"Company_name": company}
        for year in years:
            row[str(year)] = round(company_predictions.get(year, np.nan), 2)
        results.append(row)

    results_df = pd.DataFrame(results)
    return results_df


# Plot predictions
def plot_predictions(predictions, start_year, end_year):
    plt.figure(figsize=(12, 8))
    years = list(range(start_year, end_year + 1))

    for company, company_predictions in predictions.items():
        company_years = [year for year in years if year in company_predictions]
        company_values = [company_predictions[year] for year in company_years]
        plt.plot(company_years, company_values, marker="o", label=company)

    plt.title("Stock Price Predictions")
    plt.xlabel("Year")
    plt.ylabel("Stock Price")
    plt.legend()
    plt.grid(True)
    return plt


# Main function
def predict_stock_for_timeframe(financial_data_path, stock_data_path, start_year, end_year):
    if start_year < 2024:
        raise ValueError("Start year must be 2024 or later as 2023 is already in historical data")
    if end_year < start_year:
        raise ValueError("End year must be greater than or equal to start year")
    if end_year > 2033:
        raise ValueError("End year cannot exceed 2033 based on available future financial projections")

    financial_df, stock_df = load_data(financial_data_path, stock_data_path)
    financial_pivot, stock_pivot = preprocess_data(financial_df, stock_df)

    company_models, company_scalers_X, company_scalers_y = train_models(financial_pivot, stock_pivot)

    predictions = predict_stock_prices(
        financial_pivot, company_models, company_scalers_X, company_scalers_y, start_year, end_year
    )

    results_df = format_predictions(predictions, start_year, end_year)
    return results_df


# Example usage
if __name__ == "__main__":
    financial_data_path = "merged.csv"
    stock_data_path = "stock_historical.csv"
    start_year = 2024
    end_year = 2030

    predictions_df = predict_stock_for_timeframe(financial_data_path, stock_data_path, start_year, end_year)
    print(predictions_df)

    predictions_df.to_csv("stock_predictions.csv", index=False)
    print("Predictions saved to 'stock_predictions.csv'")


KeyError: 'Variable'

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Load the datasets
parameters_df = pd.read_csv('merged.csv')  # Replace with actual path
stock_historical_df = pd.read_csv('stock_historical.csv')  # Replace with actual path

# Select the top 5 parameters for each company
top_5_parameters = parameters_df.groupby('Company_name').head(5)

# Pivot the data to have years as columns and parameters as rows
top_5_pivot = top_5_parameters.pivot(index='Company_name', columns='Parameters')

# Flatten multi-level columns in top_5_pivot
top_5_pivot.columns = ['_'.join(map(str, col)) if isinstance(col, tuple) else col for col in top_5_pivot.columns]
top_5_pivot.reset_index(inplace=True)

# Merge the parameters dataset with historical stock data
merged_data = pd.merge(
    stock_historical_df,
    top_5_pivot,
    on='Company_name',
    suffixes=('_stock', '_params')
)

# Handle missing values by forward-filling and backward-filling
filled_data = merged_data.fillna(method="ffill").fillna(method="bfill")

# Extract input features (parameters and historical stock prices) and outputs
input_columns = [
    col for col in filled_data.columns if ("2019" <= col <= "2023" or "_2019" <= col <= "_2023")
]
X = filled_data[input_columns].values
y = filled_data["2023"].values  # Use 2023 as a sample supervised learning target

# Normalize the input data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the neural network model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(1, activation='linear')  # Output layer for stock price prediction
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=4, verbose=1)

# Predict stock prices for future years (e.g., 2024 to 2033)
future_years = [2024, 2025, 2026, 2027, 2028, 2029, 2030, 2031, 2032, 2033]
future_predictions = []

# Use the last available data as a starting point for future predictions
last_data = X_scaled[-1].reshape(1, -1)

for year in future_years:
    prediction = model.predict(last_data)
    future_predictions.append(prediction[0][0])
    # Update the input for the next prediction (simulating new parameter values if needed)
    last_data[0][-1] = prediction[0][0]  # Replace the last stock price with the predicted value

# Display the future predictions
for year, pred in zip(future_years, future_predictions):
    print(f"Predicted stock price for {year}: {pred}")


  filled_data = merged_data.fillna(method="ffill").fillna(method="bfill")
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 7169328.0000 - mae: 1687.5111 - val_loss: 165731.4531 - val_mae: 371.5173
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step - loss: 7168969.0000 - mae: 1687.5349 - val_loss: 165655.7500 - val_mae: 371.4158
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step - loss: 7168947.5000 - mae: 1687.6205 - val_loss: 165579.5000 - val_mae: 371.3135
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step - loss: 7167837.0000 - mae: 1687.4028 - val_loss: 165497.5000 - val_mae: 371.2029
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step - loss: 7168062.0000 - mae: 1687.4957 - val_loss: 165420.0625 - val_mae: 371.0984
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step - loss: 7167227.0000 - mae: 1687.2562 - val_loss: 165332.0781 - val_mae: 370.9788
Epoch 7

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import numpy as np

# Load the datasets
parameters_df = pd.read_csv('merged.csv')  # Replace with actual path
stock_historical_df = pd.read_csv('stock_historical.csv')  # Replace with actual path

# Select the top 5 parameters for each company
top_5_parameters = parameters_df.groupby('Company_name').head(5)

# Pivot the data to have years as columns and parameters as rows
top_5_pivot = top_5_parameters.pivot(index='Company_name', columns='Parameters')

# Flatten multi-level columns in top_5_pivot
top_5_pivot.columns = ['_'.join(map(str, col)) if isinstance(col, tuple) else col for col in top_5_pivot.columns]
top_5_pivot.reset_index(inplace=True)

# Merge the parameters dataset with historical stock data
merged_data = pd.merge(
    stock_historical_df,
    top_5_pivot,
    on='Company_name',
    suffixes=('_stock', '_params')
)

# Handle missing values by forward-filling and backward-filling
filled_data = merged_data.fillna(method="ffill").fillna(method="bfill")

# Group by company to train and predict separately for each company
companies = filled_data['Company_name'].unique()

# Dictionary to store predictions for each company
predictions_per_company = {}

for company in companies:
    # Filter data for the current company
    company_data = filled_data[filled_data['Company_name'] == company]

    # Extract input features and target output
    input_columns = [
        col for col in company_data.columns if ("2019" <= col <= "2023" or "_2019" <= col <= "_2023")
    ]
    X = company_data[input_columns].values
    y = company_data["2023"].values  # Use 2023 as a supervised learning target

    # Check if the company has sufficient data
    if len(X) < 2:
        print(f"Skipping {company} due to insufficient data (n_samples={len(X)}).")
        continue

    # Normalize the input data
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Define the neural network model
    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dense(1, activation='linear')  # Output layer for stock price prediction
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

    # Train the model
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=4, verbose=0)

    # Predict stock prices for future years (e.g., 2024 to 2033)
    future_years = [2024, 2025, 2026, 2027, 2028, 2029, 2030, 2031, 2032, 2033]
    future_predictions = []

    # Use the last available data as a starting point for future predictions
    last_data = X_scaled[-1].reshape(1, -1)

    for year in future_years:
        prediction = model.predict(last_data)
        future_predictions.append(prediction[0][0])
        # Update the input for the next prediction (simulating new parameter values if needed)
        last_data[0][-1] = prediction[0][0]  # Replace the last stock price with the predicted value

    # Store predictions for the current company
    predictions_per_company[company] = {year: pred for year, pred in zip(future_years, future_predictions)}

# Display the predictions for each company
for company, predictions in predictions_per_company.items():
    print(f"Predictions for {company}:")
    for year, pred in predictions.items():
        print(f"  Year {year}: {pred:.2f}")



Skipping BKR due to insufficient data (n_samples=1).
Skipping FTI due to insufficient data (n_samples=1).
Skipping HAL due to insufficient data (n_samples=1).
Skipping NOV due to insufficient data (n_samples=1).
Skipping SLB due to insufficient data (n_samples=1).
Skipping WHD due to insufficient data (n_samples=1).


  filled_data = merged_data.fillna(method="ffill").fillna(method="bfill")
