In [7]:
from capfourpy.databases import Database
from langchain_ollama import OllamaLLM
from tqdm import tqdm

import json
import numpy as np
import os
import pandas as pd
import re
import sys

# Make the project root (the parent of the notebook's working directory)
# importable, so that the `src` package can be imported from this notebook.
notebook_dir = os.getcwd()  # Current working directory of the notebook
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))

# Re-running this cell must not stack duplicate entries on sys.path: drop any
# existing copies first, then put the project root at the front so it keeps
# import priority.
sys.path[:] = [entry for entry in sys.path if entry != project_root]
sys.path.insert(0, project_root)

from src.data_collection.database_utils import get_fundamental_score, get_findox_mapping_with_rms

In [9]:
# Create handles for the two databases this notebook reads from:
# "CfRms_prod" (created with azure=True) and "C4DW".
db = Database(database="CfRms_prod", azure=True)
db_c4dw = Database(database="C4DW")

# Load both inputs through the project helpers in
# src.data_collection.database_utils.
# NOTE(review): presumably `fundamental_score` carries at least RmsId and
# ScoringDate (both are used as keys further down) — confirm against the helper.
fundamental_score = get_fundamental_score(db)
findox_mapping_with_rms = get_findox_mapping_with_rms(db_c4dw)

# Preview the mapping; as the last expression it is rendered as the cell output.
findox_mapping_with_rms.head()

Unnamed: 0,ExtIssuerId,FinDoxIssuerId,AbbrevName,RmsId
0,8443,12583,BNP Paribas,319
1,12946,17655,Nexans,1511
2,13690,17403,Permanent TSB Group,1651
3,13703,5288,Heathrow,139
4,14086,17402,Sabadell,262


In [16]:
# Build one row per (RmsId, ScoringDate) in a single chained pipeline:
#   1. left-join the FinDox mapping onto the fundamental scores via RmsId,
#   2. collapse each (RmsId, ScoringDate) group with GroupBy.first(), which
#      keeps the first non-null value of every remaining column,
#   3. turn the group keys back into columns and order by RmsId, ScoringDate.
fundamental_score_with_findox_rms = (
    fundamental_score
    .merge(right=findox_mapping_with_rms, on='RmsId', how='left')
    .groupby(by=['RmsId', 'ScoringDate'])
    .first()
    .reset_index()
    .sort_values(by=['RmsId', 'ScoringDate'])
)

# Show the identifying columns of every remaining row
fundamental_score_with_findox_rms[['RmsId', 'AbbrevName', 'ScoringDate']]

Unnamed: 0,RmsId,AbbrevName,ScoringDate
0,1,,2021-01-04
1,1,,2021-01-15
2,2,,2021-01-01
3,5,Adler Group,2022-06-29
4,5,Adler Group,2022-08-10
...,...,...,...
852,2216,,2024-11-12
853,2234,,2024-11-18
854,2235,,2024-11-15
855,2236,,2024-11-14


**Find list of all prospectuses that we can try to extract manually**:
- Find all RmsId with Fundamental Score
- (From that list, extract all RmsId that are bonds?)
- From the list of RmsIds with a Fundamental Score, find all that either have no folder or have no file in "./data/raw/<RmsId>/" (the folder is named `str(RmsId)`).


In [1]:
# Hard-coded snapshot of RmsIds without a folder, copied from the output of the
# "run_sharepoint_download.py" script. Re-run that script to refresh the list.
# NOTE(review): presumably "folder" means ./data/raw/<RmsId> — confirm against the script.
rms_id_without_folder = [1, 2, 5, 10, 12, 13, 14, 17, 20, 24, 32, 33, 35, 37, 40, 41, 46, 51, 52, 55, 56, 63, 65, 66, 68, 74, 75, 78, 80, 81, 83, 84, 86, 89, 95, 98, 105, 112, 116, 120, 128, 132, 134, 135, 140, 143, 145, 147, 149, 153, 159, 160, 161, 168, 171, 174, 178, 183, 185, 196, 200, 209, 218, 223, 224, 225, 227, 229, 232, 236, 238, 241, 242, 246, 249, 257, 263, 265, 268, 271, 273, 274, 275, 278, 279, 280, 282, 291, 293, 295, 296, 300, 301, 302, 305, 307, 309, 311, 312, 316, 319, 323, 324, 327, 329, 331, 333, 335, 339, 342, 344, 345, 349, 355, 356, 358, 360, 362, 370, 372, 374, 376, 384, 385, 400, 413, 414, 420, 428, 431, 432, 438, 451, 453, 454, 488, 489, 496, 499, 508, 512, 517, 518, 593, 613, 614, 615, 616, 621, 622, 626, 634, 639, 645, 646, 647, 648, 649, 653, 654, 655, 659, 662, 663, 664, 672, 673, 674, 675, 679, 680, 682, 683, 684, 765, 767, 768, 772, 816, 839, 845, 884, 901, 904, 905, 906, 907, 908, 911, 913, 917, 920, 924, 935, 936, 945, 946, 947, 948, 949, 950, 951, 953, 954, 976, 986, 987, 988, 990, 991, 993, 996, 998, 1003, 1004, 1006, 1009, 1015, 1016, 1020, 1022, 1028, 1029, 1032, 1034, 1037, 1041, 1045, 1046, 1047, 1049, 1051, 1054, 1055, 1056, 1057, 1058, 1060, 1063, 1064, 1066, 1067, 1069, 1070, 1071, 1072, 1073, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1095, 1097, 1098, 1099, 1101, 1104, 1105, 1106, 1109, 1126, 1127, 1130, 1132, 1133, 1134, 1138, 1141, 1146, 1150, 1153, 1159, 1182, 1188, 1197, 1220, 1221, 1222, 1247, 1249, 1261, 1281, 1283, 1284, 1285, 1314, 1319, 1355, 1365, 1456, 1474, 1475, 1480, 1490, 1495, 1512, 1524, 1525, 1529, 1535, 1536, 1539, 1540, 1542, 1547, 1555, 1558, 1560, 1562, 1563, 1564, 1565, 1568, 1592, 1609, 1613, 1615, 1616, 1617, 1618, 1634, 1639, 1654, 1662, 1666, 1669, 1670, 1691, 1700, 1709, 1710, 1720, 1730, 1733, 1749, 1750, 1766, 1767, 1768, 1778, 1792, 1796, 1797, 1801, 1802, 1803, 1807, 1809, 1845, 1868, 1889, 1900, 1916, 1920, 1923, 1931, 1960, 
1961, 1994, 2101, 2118, 2124, 2125, 2128, 2140, 2145, 2178, 2187]

In [None]:
# Keep only the scored (RmsId, ScoringDate) rows whose RmsId has no folder.
# The deduplicated score frame is filtered here because no `df` is defined in
# this notebook; the original reference would raise NameError on a fresh kernel.
# NOTE(review): confirm that `fundamental_score_with_findox_rms` is the frame
# `df` was meant to stand for.
filtered_df = fundamental_score_with_findox_rms[
    fundamental_score_with_findox_rms["RmsId"].isin(rms_id_without_folder)
]
