In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.losses import Huber

# Load datasets (replace file paths with actual locations of your datasets)
data_2019_2023 = pd.read_excel("top 5+ stock 2019-2023.xlsx")
data_2024_2033 = pd.read_excel("top 5 2024_2033.xlsx")

# Ensure column names are stripped of spaces
data_2019_2023.columns = data_2019_2023.columns.str.strip()
data_2024_2033.columns = data_2024_2033.columns.str.strip()

# Extract relevant data
repurchase_data = data_2019_2023[data_2019_2023['Parameters'].str.strip() == 'repurchase of common stock']
top_5_parameters = data_2019_2023[data_2019_2023['Parameters'].str.strip() != 'repurchase of common stock']

# Debugging: Print dataset shapes
print("Data 2019-2023 Shape:", data_2019_2023.shape)
print("Data 2024-2033 Shape:", data_2024_2033.shape)

# Reshape and align datasets
# Melt datasets to align by year
top_5_data_flattened = top_5_parameters.melt(id_vars=['Company_name', 'Parameters'], 
                                             var_name='Year', 
                                             value_name='Value')
repurchase_flattened = repurchase_data.melt(id_vars=['Company_name', 'Parameters'], 
                                            var_name='Year', 
                                            value_name='Repurchase')

# Debugging: Print dataset shapes after melting
print("Top 5 Data Flattened Shape:", top_5_data_flattened.shape)
print("Repurchase Flattened Shape:", repurchase_flattened.shape)

# Merge datasets
merged_data = pd.merge(top_5_data_flattened, repurchase_flattened, on=['Company_name', 'Year'], how='inner')
merged_data.rename(columns={'Parameters_x': 'Parameters', 'Parameters_y': 'Repurchase_Parameter'}, inplace=True)

# Debugging: Check merge success
print("Merged Data Shape:", merged_data.shape)

# Compute top 5 parameters by correlation
if not merged_data.empty:
    correlations = (
        merged_data.groupby('Parameters')
        .apply(lambda group: group['Value'].corr(group['Repurchase']))
        .dropna()  # Remove NaN correlations
        .sort_values(ascending=False)
        .head(5)
    )
    top_5_selected_params = correlations.index.tolist()
else:
    top_5_selected_params = []

# Debugging: Print correlation results
print("Top 5 Selected Parameters:", top_5_selected_params)

# Ensure there are selected parameters
if len(top_5_selected_params) == 0:
    raise ValueError("No parameters selected based on correlation. Check data consistency.")

# Filter data for top 5 parameters
filtered_data = merged_data[merged_data['Parameters'].isin(top_5_selected_params)]

# Debugging: Check filtered data
print("Filtered Data Shape:", filtered_data.shape)

# Ensure numeric values before pivoting
filtered_data['Value'] = pd.to_numeric(filtered_data['Value'], errors='coerce')
filtered_data.dropna(subset=['Value'], inplace=True)

# Prepare features and target
data_pivoted = filtered_data.pivot_table(index=['Company_name', 'Year'], 
                                         columns='Parameters', 
                                         values='Value')

# Debugging: Check data pivoted
print("Data Pivoted Columns:", data_pivoted.columns.tolist())
print("Data Pivoted Shape:", data_pivoted.shape)

# Prepare target variable
target = repurchase_flattened.drop_duplicates(subset=['Company_name', 'Year']).set_index(['Company_name', 'Year'])['Repurchase']

# Debugging: Check target
print("Target Shape:", target.shape)

# Align features and target
data_pivoted, target = data_pivoted.align(target, join='inner', axis=0)

# Debugging: Check shapes after alignment
print("Aligned Data Shape:", data_pivoted.shape)
print("Aligned Target Shape:", target.shape)

# Ensure data is not empty before splitting
if data_pivoted.shape[0] == 0 or target.shape[0] == 0:
    raise ValueError("No valid training data after preprocessing. Check filtering steps.")

# Fill missing values
data_pivoted.fillna(0, inplace=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data_pivoted, target, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build an enhanced neural network model
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    BatchNormalization(),  # Normalize activations
    Dropout(0.3),  # Add dropout for regularization
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')  # Output layer for regression
])

# Compile the model with Huber loss
model.compile(optimizer='adam', loss=Huber(), metrics=['mae'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=500, batch_size=16, validation_split=0.2, verbose=1)

# Evaluate the model
test_loss, test_mae = model.evaluate(X_test_scaled, y_test, verbose=1)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

# Save predictions to CSV
forecast_features.to_csv("predicted_repurchase_2024_2033.csv", index=False)
print("Predictions saved to 'predicted_2024_2033.csv'")


Data 2019-2023 Shape: (36, 7)
Data 2024-2033 Shape: (30, 12)
Top 5 Data Flattened Shape: (150, 4)
Repurchase Flattened Shape: (30, 4)
Merged Data Shape: (750, 6)
Top 5 Selected Parameters: ['dividends paid', 'net cash flows used in financing activities', 'gross profit', 'other non-operating income', 'other intangible assets, net']
Filtered Data Shape: (200, 6)
Data Pivoted Columns: []
Data Pivoted Shape: (0, 0)
Target Shape: (6,)
Aligned Data Shape: (0, 0)
Aligned Target Shape: (0,)


  .apply(lambda group: group['Value'].corr(group['Repurchase']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Value'] = pd.to_numeric(filtered_data['Value'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.dropna(subset=['Value'], inplace=True)


ValueError: No valid training data after preprocessing. Check filtering steps.

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.losses import Huber

# Load datasets (replace file paths with actual locations of your datasets)
data_2019_2023 = pd.read_excel("top 5+ stock 2019-2023.xlsx")
data_2024_2033 = pd.read_excel("top 5 2024_2033.xlsx")

# Extract relevant data
repurchase_data = data_2019_2023[data_2019_2023['Parameters'] == 'repurchase of common stock']
top_5_parameters = data_2019_2023[data_2019_2023['Parameters'] != 'repurchase of common stock']

# Reshape and align datasets
# Melt datasets to align by year
top_5_data_flattened = top_5_parameters.melt(id_vars=['Company_name', 'Parameters'], 
                                             var_name='Year', 
                                             value_name='Value')
repurchase_flattened = repurchase_data.melt(id_vars=['Company_name', 'Parameters'], 
                                            var_name='Year', 
                                            value_name='Repurchase')

# Merge datasets
merged_data = pd.merge(top_5_data_flattened, repurchase_flattened, on=['Company_name', 'Year'])
merged_data.rename(columns={'Parameters_x': 'Parameters', 'Parameters_y': 'Repurchase_Parameter'}, inplace=True)

# Compute top 5 parameters by correlation
correlations = (
    merged_data.groupby('Parameters')
    .apply(lambda group: group['Value'].corr(group['Repurchase']))
    .sort_values(ascending=False)
    .head(5)
)
top_5_selected_params = correlations.index.tolist()

# Filter data for top 5 parameters
filtered_data = merged_data[merged_data['Parameters'].isin(top_5_selected_params)]

# Prepare features and target
data_pivoted = filtered_data.pivot_table(index=['Company_name', 'Year'], 
                                         columns='Parameters', 
                                         values='Value')
target = filtered_data.drop_duplicates(subset=['Company_name', 'Year']).set_index(['Company_name', 'Year'])['Repurchase']

# Align features and target
data_pivoted, target = data_pivoted.align(target, join='inner', axis=0)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data_pivoted, target, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build an enhanced neural network model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    BatchNormalization(),  # Normalize activations
    Dropout(0.3),  # Add dropout for regularization
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression
])

# Compile the model with Huber loss
model.compile(optimizer='adam', loss=Huber(), metrics=['mae'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=300, batch_size=16, validation_split=0.2, verbose=1)

# Evaluate the model
test_loss, test_mae = model.evaluate(X_test_scaled, y_test, verbose=1)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

# Load forecast data for 2024-2033
forecast_data_flattened = data_2024_2033.melt(id_vars=['Company_name', 'Parameters'], 
                                              var_name='Year', 
                                              value_name='Value')
forecast_filtered = forecast_data_flattened[forecast_data_flattened['Parameters'].isin(top_5_selected_params)]

# Prepare features for prediction
forecast_features = forecast_filtered.pivot_table(index=['Company_name', 'Year'], 
                                                   columns='Parameters', 
                                                   values='Value')

# Standardize forecast features
forecast_features_scaled = scaler.transform(forecast_features)

# Make predictions
predictions = model.predict(forecast_features_scaled)

# Prepare output
forecast_features['Predicted_Repurchase'] = predictions
forecast_features.reset_index(inplace=True)
forecast_features = forecast_features[['Company_name', 'Year', 'Predicted_Repurchase']]

# Save predictions to CSV
forecast_features.to_csv("predicted_repurchase_2024_2033.csv", index=False)
print("Predictions saved to 'predicted_2024_2033.csv'")


Epoch 1/300


  .apply(lambda group: group['Value'].corr(group['Repurchase']))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - loss: 1187.0000 - mae: 1187.5000 - val_loss: 29.6248 - val_mae: 30.0000
Epoch 2/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step - loss: 1186.9990 - mae: 1187.4990 - val_loss: 29.6245 - val_mae: 30.0000
Epoch 3/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 515ms/step - loss: 1186.9980 - mae: 1187.4980 - val_loss: 29.6243 - val_mae: 30.0000
Epoch 4/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step - loss: 1186.9971 - mae: 1187.4971 - val_loss: 29.6240 - val_mae: 30.0000
Epoch 5/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step - loss: 1186.9961 - mae: 1187.4961 - val_loss: 29.6238 - val_mae: 30.0000
Epoch 6/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step - loss: 1186.9951 - mae: 1187.4951 - val_loss: 29.6235 - val_mae: 30.0000
Epoch 7/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0