In [None]:
import pandas as pd
import numpy as np

# ESG score table (per the file name: yearly averages for S&P 500 companies).
# The first CSV column becomes the index and pandas tries to parse it as dates.
esg_scores_csv = "../../data/SP500_EGS_Score_avarage_per_year.csv"
df = pd.read_csv(esg_scores_csv, sep=",", index_col=0, parse_dates=True)

In [None]:
df.head(20)

In [None]:
# Normalise the column labels in one pass: lower-case them, then turn
# hyphens into underscores so they are usable as attribute names.
df.columns = df.columns.str.lower().str.replace("-", "_")

In [None]:
len(df.company_symbol.unique())

In [None]:
import sys

sys.path.append("../")

In [None]:
from helpers.text_preprocessing.preprocess_text import preprocess_text

In [None]:
import pandas as pd

# Text extracted from the sustainability reports; the first CSV column is the index.
# This rebinds `df`, replacing whatever frame the name pointed to before.
reports_csv = "../data/extracted_text_sustainability_reports.csv"
df = pd.read_csv(reports_csv, sep=",", index_col=0, parse_dates=True)

In [None]:
# Preprocess the text and get the preprocessed DataFrame
preprocessed_df = preprocess_text(df)

In [None]:
import optuna
import pandas as pd
import matplotlib.pyplot as plt
import plotly
from gensim import corpora
import mlflow

In [None]:
import sys

sys.path.append("../")

In [None]:
from models.LDA_optuna_tuning.tune_lda_optuna import train_lda, compute_coherence
from models.LDA_optuna_tuning.call_optuna_tune import (
    preprocess_data,
    execute_optuna_study,
)

In [1]:
from sklearn.model_selection import train_test_split, KFold

import tpot2
import sklearn
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("../data/ready_to_model/df_filtered_feature_importance.csv")


In [None]:
df.head()

In [3]:
# Replace every missing value with 0 (rebinding instead of mutating in place).
df = df.fillna(0)

In [4]:
# Column names kept from an earlier draft of this cell (reference only, unused):
#   "e_score", "s_score", "g_score", "unnamed: 0", "filename", "ticker", "year",
#   "preprocessed_content", "ner_entities", "company_symbol"

# The target itself and the "Unnamed: 0" column are excluded from the features.
target_column = "total_score"
columns_to_drop = [target_column, "Unnamed: 0"]

# Feature matrix and target vector.
X = df.drop(columns=columns_to_drop)
y = df[target_column]

In [5]:
# drop the last two rows
# NOTE(review): why these two rows are discarded is not recorded here — confirm.
# This cell rebinds X from itself, so every re-run removes two more rows; the
# matching slice of y is in a separate cell and must be run the same number of times.
X = X.iloc[:-2, :]

In [6]:
# Drop the last two target values so y keeps the same length as the sliced X.
# Rebinds y from itself: re-running this cell on its own removes two more
# values and leaves X and y misaligned.
y = y.iloc[:-2]

In [None]:
X

In [None]:
X

In [None]:
y

In [None]:
# Hold out 25% of the rows as a test set; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.25, random_state=100)
X_train, X_test, y_train, y_test = split

In [None]:
# Scorer handed to TPOT2, looked up from scikit-learn by the name "neg_mean_squared_error".
scorer = sklearn.metrics.get_scorer("neg_mean_squared_error")

# Initialize TPOT2 regressor with K-Fold cross-validation
est = tpot2.TPOTEstimatorSteadyState(
    n_jobs=6,
    cv=KFold(n_splits=5),  # 5-Fold cross-validation
    verbose=2,
    classification=False,
    scorers=[scorer],
    scorers_weights=[1],
    max_eval_time_seconds=60 * 10,  # 600 seconds
    max_time_seconds=60 * 90,  # 5400 seconds
)

# Fit the model
# This is the long-running step of the cell (budgets are set above).
est.fit(X_train, y_train)
print('Done fitting/training TPOT2 session.')


# Table of the pipelines evaluated during the search.
# NOTE(review): the assignment below writes into this object; whether it is a
# copy or the estimator's own frame is not visible here.
df_individuals = est.evaluated_individuals

# Convert the 'mean_squared_error' column to numeric, errors='coerce' will replace non-numeric with NaN
df_individuals['mean_squared_error'] = pd.to_numeric(df_individuals['mean_squared_error'], errors='coerce')

# Drop NaN values
filtered_df = df_individuals.dropna(subset=['mean_squared_error'])

# Sort the DataFrame by 'mean_squared_error' and get the top 10
# NOTE(review): nlargest keeps the rows with the HIGHEST column values. That is
# the best set only if the column stores the negated scorer output (higher =
# better); if it stores raw MSE this selects the worst pipelines — confirm the
# sign convention.
top_10_mse = filtered_df.nlargest(10, 'mean_squared_error')

print(est.pareto_front)
print(top_10_mse)
print(est.get_params())



In [None]:
top_10_mse

In [None]:
top_30_mse = filtered_df.nlargest(30, 'mean_squared_error')


In [None]:
# save the top 30 models to csv
top_30_mse.to_csv("../data/model_data/top_30_mse.csv")

In [None]:
top_30_mse

In [7]:
import sys

sys.path.append("../")

In [8]:
from models.XGBoost.train_and_evaluate_model import train_and_evaluate_model
from models.XGBoost.tune_xgb_hyperparameters import tune_xgb_hyperparameters
from models.Random_Forest.tune_rf_hyperparameters import tune_rf_hyperparameters
from models.Lasso.tune_lasso_hyperparameters import tune_lasso_hyperparameters
from models.Neural_Network.tune_nn_hyperparameters import tune_nn_hyperparameters
from models.Ridge.tune_ridge_hyperparameters import tune_ridge_hyperparameters
from models.perform_stacking import perform_stacking
from models.optimize_stacking import optimize_stacking

In [None]:
print(X_train.dtypes)


In [None]:
def _to_numeric_or_keep(column):
    """Return ``column`` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Convert every column that parses as numeric and leave the rest untouched.
# `pd.to_numeric(..., errors='ignore')` did this implicitly but is deprecated in
# recent pandas releases, so the fallback is spelled out in the helper above.
X_train = X_train.apply(_to_numeric_or_keep)


In [None]:
best_params_rf = tune_rf_hyperparameters(X_train, y_train, 25)

In [None]:
#Best hyperparameters: {'n_estimators': 266, 'max_depth': 32, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': None}
# Best RMSE: 5.883527888313719

In [None]:
# NOTE(review): the test split is passed into the tuner, and the log kept in the
# next cell shows the best trial value equal to the reported Test RMSE. The
# hyper-parameters therefore appear to be selected on the test set, which makes
# that RMSE optimistic — consider tuning against a separate validation split.
best_params_lasso = tune_lasso_hyperparameters(X_train, y_train, X_test, y_test, 100)

In [None]:
# [I 2023-10-04 17:44:22,225] Trial 23 finished with value: 4.497792051287519 and parameters: {'alpha': 0.08706251396825564}. Best is trial 23 with value: 4.497792051287519.
# Training RMSE: 5.277817560823488, Test RMSE: 4.497792051287519

In [None]:
best_params_ridge = tune_ridge_hyperparameters(X_train, y_train, X_test, y_test, 100)

In [None]:
# Best hyperparameters: {'alpha': 0.9992235606570956}
# Best Test RMSE: 5.210845073517565

In [None]:
best_params_nn = tune_nn_hyperparameters(X_train, y_train, X_test, y_test, 30)

In [None]:
# Best hyperparameters: {'hidden_layer_sizes': (50, 50), 'activation': 'tanh', 'alpha': 0.00010434024177879637}
# Best Test RMSE: 6.9895163966718235

In [None]:
best_params = tune_xgb_hyperparameters(X_train, y_train, n_trials=30)


In [None]:
# Best hyperparameters: {'learning_rate': 0.02998822459568964, 'max_depth': 5, 'subsample': 0.7095254982521659, 'colsample_bytree': 0.6113315193397809, 'min_child_weight': 13}
# Best RMSE: 5.098858708449277

In [9]:
import pickle

In [10]:
def _load_pickle(pickle_path):
    """Open ``pickle_path`` in binary mode and return the unpickled object."""
    # pickle can execute arbitrary code while loading: only use trusted files.
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)


# Previously saved best-parameter objects, one per model family.
best_params_xgb = _load_pickle('../models/XGBoost/best_params_features_cleaned.pkl')
best_params_lasso = _load_pickle('../models/Lasso/best_params_lasso.pkl')
best_params_ridge = _load_pickle('../models/Ridge/best_params_ridge.pkl')
best_params_rf = _load_pickle('../models/Random_Forest/best_params_rf.pkl')
best_params_nn = _load_pickle('../models/Neural_Network/best_params_nn.pkl')

In [None]:
# NOTE(review): this rebinds `best_params`, which an earlier cell assigns from
# tune_xgb_hyperparameters and a later cell passes to train_and_evaluate_model.
# Which value that later call receives depends on which cells were run — confirm
# the intended parameter set.
best_params = optimize_stacking(X, y, n_trials=25)


In [11]:
stacking_model, test_rmse = perform_stacking(X, y, best_params_lasso, best_params_rf, best_params_xgb)



In [None]:
trained_model, validation_rmse, feature_importances_df = train_and_evaluate_model(X_train, y_train, X_test, y_test, best_params)

In [None]:
feature_importances_df

In [None]:
# Keep only the features whose reported importance is non-zero.
has_importance = feature_importances_df['Importance'].ne(0)
filtered_feature_importances_df = feature_importances_df[has_importance]


In [None]:
df

In [None]:
filtered_feature_importances_df

In [None]:
# Restrict the main frame to the retained feature columns plus the target.
selected_columns = [*filtered_feature_importances_df['Feature'], 'total_score']
filtered_main_df = df[selected_columns]



In [None]:
filtered_main_df

In [None]:
filtered_main_df.to_csv("../data/ready_to_model/filtered_feature_importance_df.csv")

In [None]:
from concurrent.futures import ThreadPoolExecutor
import yfinance as yf
import pandas as pd


def fetch_data_for_row(named_tuple_row):
    """Fetch yfinance data for one (ticker, year) pair as a flat, prefixed frame.

    Parameters
    ----------
    named_tuple_row : object with ``ticker`` and ``year`` attributes
        Typically a row produced by ``DataFrame.itertuples``.

    Returns
    -------
    pandas.DataFrame
        Columns are ``financials_*``, ``cashflow_*``, ``balance_*`` (statement
        values whose column date falls in ``year``), ``info_*`` (the numeric
        entries of ``Ticker.info``), plus ``ticker`` and ``year``. The statement
        values and the info values sit on the same row. An empty frame is
        returned if anything goes wrong; the error is printed, not raised.
    """
    ticker = "Unknown"  # Fallbacks so the error message below is always printable
    year = "Unknown"

    try:
        ticker = named_tuple_row.ticker
        year = named_tuple_row.year

        yf_ticker = yf.Ticker(ticker)
        statements = {
            'financials': yf_ticker.financials,
            'cashflow': yf_ticker.cashflow,
            'balance': yf_ticker.balance_sheet,
        }
        # Only numeric info entries are kept.
        info = {k: v for k, v in yf_ticker.info.items() if isinstance(v, (int, float))}

        frames = []
        for prefix, statement in statements.items():
            # Statement columns are report dates; keep the one(s) in the requested year.
            in_year = pd.to_datetime(statement.columns).year == year
            yearly = statement.loc[:, in_year].transpose()
            yearly.columns = f'{prefix}_' + yearly.columns.astype(str)
            # Drop the date index: otherwise concat(axis=1) aligns on it and the
            # 0-indexed info row lands on a different row than the statements.
            frames.append(yearly.reset_index(drop=True))

        info_df = pd.DataFrame([info])
        info_df.columns = 'info_' + info_df.columns.astype(str)
        frames.append(info_df)

        merged_data = pd.concat(frames, axis=1)
        merged_data['ticker'] = ticker
        merged_data['year'] = year

        return merged_data.reset_index(drop=True)
    except Exception as e:
        # Best effort: one failing ticker must not abort the whole batch.
        print(f"An error occurred in fetch_data_for_row for ticker: {ticker} and year: {year}. Error: {e}")
        return pd.DataFrame()



def fetch_and_merge_data(df):
    """Enrich ``df`` with yfinance data, matched on ``ticker`` and ``year``.

    Each distinct (ticker, year) pair in ``df`` is fetched once, in a thread
    pool, through ``fetch_data_for_row``. The first row of every non-empty
    result is kept and left-merged onto ``df``, so the output has exactly one
    row per input row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ticker`` and ``year`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the fetched columns appended. If nothing could be fetched,
        or any error occurs, the original ``df`` is returned unchanged and a
        message is printed.
    """
    try:
        # Fetch once per distinct key. Besides saving requests, this keeps the
        # right-hand side of the merge unique; duplicated keys on both sides
        # would multiply the rows of `df`.
        keys = df[['ticker', 'year']].drop_duplicates()

        with ThreadPoolExecutor() as executor:
            fetched = list(executor.map(fetch_data_for_row, keys.itertuples(index=False)))

        # Failed fetches come back as empty frames and are skipped.
        first_rows = [data.iloc[[0]] for data in fetched if not data.empty]
        if not first_rows:
            print("An error occurred in fetch_and_merge_data: no data could be fetched")
            return df

        new_data = pd.concat(first_rows, ignore_index=True)

        # Debug: Print the shape and columns of new_data
        print(f"new_data shape: {new_data.shape}, columns: {new_data.columns}")

        # Merge new_data with df based on 'ticker' and 'year'
        return pd.merge(df, new_data, on=['ticker', 'year'], how='left')
    except Exception as e:
        print(f"An error occurred in fetch_and_merge_data: {e}")
        return df  # Return the original DataFrame as a fallback




# Example usage
# df = pd.DataFrame({'ticker': ['AAPL', 'GOOGL'], 'year': [2020, 2021]})
# final_df = fetch_and_merge_data(df)


In [None]:
df

In [None]:
final_df = fetch_and_merge_data(df)

In [None]:
df = final_df

In [None]:
final_df.sample(10)

In [None]:
# Calculate the percentage of missing values for each column
# NOTE(review): `df_cleaned` is only assigned in a later cell (from
# remove_columns_with_nans), so on a fresh top-to-bottom run this line raises
# NameError. Move this cell below that assignment, or point it at the frame
# that is meant to be inspected here.
missing_percent = df_cleaned.isnull().mean() * 100

# Sort the columns by percentage of missing values in descending order
missing_percent_sorted = missing_percent.sort_values(ascending=False)

# Show the sorted series
print(missing_percent_sorted)


In [None]:
nan_count = final_df.isna().sum()


In [None]:
def remove_columns_with_nans(df, threshold=800):
    """Return a copy of ``df`` without the columns holding more than ``threshold`` NaNs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is not modified.
    threshold : int, default 800
        A column is dropped when its NaN count is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        ``df`` minus the columns that exceeded the threshold.
    """
    # Count NaNs on the frame that was passed in, not on a notebook global.
    nan_count = df.isna().sum()
    columns_to_remove = nan_count[nan_count > threshold].index.tolist()
    return df.drop(columns=columns_to_remove)

In [None]:
df_cleaned = remove_columns_with_nans(df, threshold=300)


In [None]:
df_cleaned



In [None]:
df_cleaned.to_csv("../data/ready_to_model/df_cleaned_with_yfinance.csv")

In [None]:
# Sorting columns by the number of NaN values (in descending order)
# `nan_count` must already exist (an earlier cell computes it from final_df).
sorted_nan_count = nan_count.sort_values(ascending=False)

# value_counts() tallies how many columns share each NaN count; the printed
# result is ordered by that tally, so the sort above does not affect it.
print(sorted_nan_count.value_counts())

In [None]:
import pandas as pd

In [None]:
# Reload the cleaned, yfinance-enriched dataset from disk; the first CSV column is the index.
df_topics = pd.read_csv(
    "../data/ready_to_model/df_cleaned_with_yfinance.csv",
    sep=",",
    parse_dates=True,
    index_col=0,
)

In [None]:
df_topics.head()