In [None]:
import numpy as np
import pandas as pd

# Increase the sample size to 10,000 entries for a more robust training dataset
num_samples = 10000

# Generate borrower data within the credit score range of 700-750
data_sample = {
    "Credit_Score": np.random.randint(700, 751, num_samples),
    "Annual_Income": np.random.randint(40000, 120001, num_samples),
    "Loan_Amount": np.random.randint(10000, 100001, num_samples),
    "LTV_Ratio": np.round(np.random.uniform(0.5, 0.9, num_samples), 2),
    "Credit_Utilization_Ratio": np.round(np.random.uniform(0.1, 0.8, num_samples), 2),
    "Missed_Payments_Past_Year": np.random.randint(0, 4, num_samples),
    "Avg_Monthly_Balance": np.random.randint(500, 10001, num_samples),
    "Employment_Status": np.random.choice([0, 1], num_samples, p=[0.1, 0.9]),
    "Recent_Large_Transactions": np.random.choice([0, 1], num_samples, p=[0.85, 0.15]),
    "Credit_History": np.random.choice([0, 1], num_samples, p=[0.6, 0.4]),
    "Savings_Account_Balance": np.random.randint(0, 100001, num_samples),
    "Credit_Card_Balance": np.random.randint(0, 100001, num_samples),
    "Industry": np.random.choice(['IT', 'Healthcare', 'Manufacturing', 'Retail', 'Finance'], num_samples),
    "Age": np.random.randint(21, 65, num_samples),
    "Education_Level": np.random.choice(['High School', 'Bachelor’s', 'Master’s', 'PhD'], num_samples),
    "Marital_Status": np.random.choice(['Single', 'Married', 'Divorced'], num_samples),
    "Number_of_Dependents": np.random.randint(0, 5, num_samples),
    "Total_Debt": np.random.randint(5000, 150001, num_samples),
    "Debt_to_Income_Ratio": np.round(np.random.uniform(0.1, 0.6, num_samples), 2),
    "Past_Delinquencies_2_5_Years": np.random.randint(0, 3, num_samples),
    "Years_in_Current_Job": np.random.randint(1, 20, num_samples),
}

# Simulate delinquency status based on a basic heuristic for illustrative purposes (Delinquency Status: Binary outcome, with 1 indicating a delinquent borrower and 0 indicating a non-delinquent borrower.)

# Define weights for each feature contributing to delinquency
"""
Positive Weights:
                Signify Risk-Increasing Factors: When a feature has a positive weight, it means that an increase in that feature's value contributes positively to the risk of delinquency.
                                                 For example, if Debt_to_Income_Ratio has a positive weight, higher debt relative to income is seen as increasing the likelihood of the customer being delinquent.
Negative Weights:
                Signify Risk-Reducing Factors: A negative weight implies that an increase in that feature's value contributes negatively to the risk of delinquency, reducing the likelihood of delinquency.
                                               For example, a higher Credit_Score is associated with a lower risk of delinquency, so it has a negative weight to indicate that as the credit score increases, the probability of delinquency decreases.
"""
weights = {
    "Credit_Score": -0.5,  # Lower credit scores increase delinquency risk
    "Debt_to_Income_Ratio": 0.4,  # Higher DTI increases risk
    "Missed_Payments_Past_Year": 0.5,  # More missed payments increase risk
    "Total_Debt": 0.3,  # Higher total debt increases risk
    "Employment_Status": -0.2,  # 1 means employed, reducing risk
    "Past_Delinquencies_2_5_Years": 0.4,  # Past delinquencies increase risk
    "Recent_Large_Transactions": 0.3,  # Indicates potential financial stress
    "Savings_Account_Balance": -0.3,  # Higher savings reduce risk
    "Credit_History": -0.3,  # Positive credit history reduces risk
    "Credit_Utilization_Ratio": 0.4,  # Higher utilization increases risk
    "Years_in_Current_Job": -0.2  # Longer job tenure reduces risk
}

# Apply the rule to calculate a risk score for each row
def calculate_delinquency_risk(row):
    risk_score = (
        weights["Credit_Score"] * (750 - row["Credit_Score"]) / 100 +  # Normalize score
        weights["Debt_to_Income_Ratio"] * row["Debt_to_Income_Ratio"] +
        weights["Missed_Payments_Past_Year"] * row["Missed_Payments_Past_Year"] +
        weights["Total_Debt"] * row["Total_Debt"] / 150000 +  # Normalize debt
        weights["Employment_Status"] * row["Employment_Status"] +
        weights["Past_Delinquencies_2_5_Years"] * row["Past_Delinquencies_2_5_Years"] +
        weights["Recent_Large_Transactions"] * row["Recent_Large_Transactions"] +
        weights["Savings_Account_Balance"] * (100000 - row["Savings_Account_Balance"]) / 100000 +  # Normalize savings
        weights["Credit_History"] * row["Credit_History"] +
        weights["Credit_Utilization_Ratio"] * row["Credit_Utilization_Ratio"] +
        weights["Years_in_Current_Job"] * (20 - row["Years_in_Current_Job"]) / 20  # Normalize job tenure
    )
    return 1 if risk_score > 1.5 else 0  # Threshold for classification

# Create a DataFrame
customer_data = pd.DataFrame(data_sample)

# Apply the function to the DataFrame to create the Delinquency column
customer_data["Delinquency"] = customer_data.apply(calculate_delinquency_risk, axis=1)

# Display the first few rows of the larger dataset
customer_data.head()

Unnamed: 0,Credit_Score,Annual_Income,Loan_Amount,LTV_Ratio,Credit_Utilization_Ratio,Missed_Payments_Past_Year,Avg_Monthly_Balance,Employment_Status,Recent_Large_Transactions,Credit_History,...,Industry,Age,Education_Level,Marital_Status,Number_of_Dependents,Total_Debt,Debt_to_Income_Ratio,Past_Delinquencies_2_5_Years,Years_in_Current_Job,Delinquency
0,703,68932,21683,0.65,0.29,1,572,1,0,0,...,Finance,44,High School,Divorced,1,116770,0.41,0,19,0
1,745,114932,28953,0.5,0.15,0,7238,1,0,0,...,IT,54,PhD,Divorced,2,98776,0.32,0,5,0
2,721,116991,92574,0.57,0.37,1,8841,1,0,1,...,Retail,27,High School,Married,0,77872,0.12,2,15,0
3,748,110787,55159,0.56,0.1,0,8462,1,0,0,...,IT,57,Bachelor’s,Married,0,96361,0.12,2,4,0
4,744,80557,51179,0.53,0.64,0,2528,1,0,0,...,IT,34,Master’s,Single,0,119304,0.42,0,10,0


In [None]:
#Pre processing data
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# Encode categorical variables
# For binary and simple categorical features, use LabelEncoder
le_industry = LabelEncoder()
le_education = LabelEncoder()
le_marital = LabelEncoder()
customer_data['Industry'] = le_industry.fit_transform(customer_data['Industry'])
customer_data['Education_Level'] = le_education.fit_transform(customer_data['Education_Level'])
customer_data['Marital_Status'] = le_marital.fit_transform(customer_data['Marital_Status'])

# Scale numerical features
scaler = StandardScaler()
numerical_features = [
    'Credit_Score', 'Annual_Income', 'Loan_Amount', 'LTV_Ratio',
    'Credit_Utilization_Ratio', 'Avg_Monthly_Balance', 'Savings_Account_Balance',
    'Credit_Card_Balance', 'Age', 'Total_Debt', 'Debt_to_Income_Ratio',
    'Years_in_Current_Job'
]
customer_data[numerical_features] = scaler.fit_transform(customer_data[numerical_features])

print(customer_data.head())

   Credit_Score  Annual_Income  Loan_Amount  LTV_Ratio  \
0     -1.494095      -0.488083    -1.299264  -0.429911   
1      1.359293       1.509752    -1.020957  -1.731331   
2     -0.271215       1.599176     1.414548  -1.124002   
3      1.563107       1.329729    -0.017753  -1.210763   
4      1.291355       0.016804    -0.170113  -1.471047   

   Credit_Utilization_Ratio  Missed_Payments_Past_Year  Avg_Monthly_Balance  \
0                 -0.794553                          1            -1.697774   
1                 -1.490718                          0             0.732392   
2                 -0.396744                          1             1.316784   
3                 -1.739348                          0             1.178615   
4                  0.945859                          0            -0.984692   

   Employment_Status  Recent_Large_Transactions  Credit_History  ...  \
0                  1                          0               0  ...   
1                  1            

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Assuming `customer_data` is the dataset and "Delinquency_Status" is the target
X = customer_data.drop("Delinquency", axis=1)
y = customer_data["Delinquency"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Dictionary to store model evaluation results
results = {}

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((7000, 21), (7000,), (3000, 21), (3000,))

In [None]:
# Function to train, predict, and evaluate models
def evaluate_model(model, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None

    # Store results in a dictionary
    results[model_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc
    }
    print(f"\n--- {model_name} ---")
    print(classification_report(y_test, y_pred))
    if roc_auc:
        print(f"ROC AUC: {roc_auc:.4f}")

In [None]:
# Initialize models
rf_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
lgbm_model = LGBMClassifier(random_state=42)

In [None]:
# Train and evaluate Random Forest
evaluate_model(rf_model, "Random Forest")

# Train and evaluate XGBoost
evaluate_model(xgb_model, "XGBoost")

# Train and evaluate LightGBM
evaluate_model(lgbm_model, "LightGBM")

# Display evaluation results
pd.DataFrame(results).T


--- Random Forest ---
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      2271
           1       0.95      0.86      0.90       729

    accuracy                           0.95      3000
   macro avg       0.95      0.92      0.94      3000
weighted avg       0.95      0.95      0.95      3000

ROC AUC: 0.9858


Parameters: { "use_label_encoder" } are not used.




--- XGBoost ---
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      2271
           1       0.95      0.91      0.93       729

    accuracy                           0.97      3000
   macro avg       0.96      0.95      0.96      3000
weighted avg       0.97      0.97      0.97      3000

ROC AUC: 0.9951
[LightGBM] [Info] Number of positive: 1767, number of negative: 5233
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000997 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1843
[LightGBM] [Info] Number of data points in the train set: 7000, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.252429 -> initscore=-1.085702
[LightGBM] [Info] Start training from score -1.085702

--- LightGBM ---
              precision    recall  f1-score   support

           0

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Random Forest,0.954667,0.949924,0.858711,0.902017,0.985772
XGBoost,0.967667,0.952722,0.912209,0.932025,0.995105
LightGBM,0.967667,0.948864,0.916324,0.93231,0.995391


In [None]:
# saving the model with highest scores
import pickle
# Save the LightGBM model
with open("lightgbm_model.pkl", "wb") as file:
    pickle.dump(lgbm_model, file)

In [1]:
import numpy as np
import pandas as pd

# Generate an unseen sample
unseen_sample = {
    "Credit_Score": np.random.randint(700, 751),
    "Annual_Income": np.random.randint(40000, 120001),
    "Loan_Amount": np.random.randint(10000, 100001),
    "LTV_Ratio": np.round(np.random.uniform(0.5, 0.9), 2),
    "Credit_Utilization_Ratio": np.round(np.random.uniform(0.1, 0.8), 2),
    "Missed_Payments_Past_Year": np.random.randint(0, 4),
    "Avg_Monthly_Balance": np.random.randint(500, 10001),
    "Employment_Status": np.random.choice([0, 1], p=[0.1, 0.9]),
    "Recent_Large_Transactions": np.random.choice([0, 1], p=[0.85, 0.15]),
    "Credit_History": np.random.choice([0, 1], p=[0.6, 0.4]),
    "Savings_Account_Balance": np.random.randint(0, 100001),
    "Credit_Card_Balance": np.random.randint(0, 100001),
    "Industry": np.random.choice(['IT', 'Healthcare', 'Manufacturing', 'Retail', 'Finance']),
    "Age": np.random.randint(21, 65),
    "Education_Level": np.random.choice(['High School', 'Bachelor’s', 'Master’s', 'PhD']),
    "Marital_Status": np.random.choice(['Single', 'Married', 'Divorced']),
    "Number_of_Dependents": np.random.randint(0, 5),
    "Total_Debt": np.random.randint(5000, 150001),
    "Debt_to_Income_Ratio": np.round(np.random.uniform(0.1, 0.6), 2),
    "Past_Delinquencies_2_5_Years": np.random.randint(0, 3),
    "Years_in_Current_Job": np.random.randint(1, 20),
}

# Print the unseen sample
unseen_sample_df = pd.DataFrame([unseen_sample])

In [2]:
unseen_sample_df

Unnamed: 0,Credit_Score,Annual_Income,Loan_Amount,LTV_Ratio,Credit_Utilization_Ratio,Missed_Payments_Past_Year,Avg_Monthly_Balance,Employment_Status,Recent_Large_Transactions,Credit_History,...,Credit_Card_Balance,Industry,Age,Education_Level,Marital_Status,Number_of_Dependents,Total_Debt,Debt_to_Income_Ratio,Past_Delinquencies_2_5_Years,Years_in_Current_Job
0,725,111402,80122,0.86,0.38,1,7715,1,0,0,...,80699,IT,37,Bachelor’s,Divorced,3,147044,0.13,0,6


In [None]:
import joblib

# After fitting LabelEncoder and StandardScaler on training data
# Assuming 'le' is your LabelEncoder and 'scaler' is your StandardScaler

# Save the fitted LabelEncoder and StandardScaler to disk
joblib.dump(le_industry, 'label_encoder_industry.pkl')
joblib.dump(le_education, 'label_encoder_education.pkl')
joblib.dump(le_marital, 'label_encoder_marital.pkl')   # Save the LabelEncoder
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [None]:
unseen_sample

{'Credit_Score': 721,
 'Annual_Income': 115295,
 'Loan_Amount': 13849,
 'LTV_Ratio': 0.8,
 'Credit_Utilization_Ratio': 0.36,
 'Missed_Payments_Past_Year': 0,
 'Avg_Monthly_Balance': 9752,
 'Employment_Status': 1,
 'Recent_Large_Transactions': 0,
 'Credit_History': 0,
 'Savings_Account_Balance': 41131,
 'Credit_Card_Balance': 89894,
 'Industry': 'Finance',
 'Age': 39,
 'Education_Level': 'Bachelor’s',
 'Marital_Status': 'Married',
 'Number_of_Dependents': 4,
 'Total_Debt': 16752,
 'Debt_to_Income_Ratio': 0.19,
 'Past_Delinquencies_2_5_Years': 2,
 'Years_in_Current_Job': 16}

In [None]:
# Later, when you want to preprocess new data, load the saved transformers

# Load the saved LabelEncoder and StandardScaler
loaded_le_industry = joblib.load('label_encoder_industry.pkl')
loaded_le_education = joblib.load('label_encoder_education.pkl')
loaded_le_marital = joblib.load('label_encoder_marital.pkl')
loaded_scaler = joblib.load('scaler.pkl')

# Now, you can use these loaded transformers on the new unseen data
unseen_sample_df = pd.DataFrame([unseen_sample])
# Preprocess categorical variables using the loaded LabelEncoder
unseen_sample_df['Industry'] = loaded_le_industry.transform(unseen_sample_df['Industry'])
unseen_sample_df['Education_Level'] = loaded_le_education.transform(unseen_sample_df['Education_Level'])
unseen_sample_df['Marital_Status'] = loaded_le_marital.transform(unseen_sample_df['Marital_Status'])

# Scale numerical features using the loaded StandardScaler
unseen_sample_df[numerical_features] = loaded_scaler.transform(unseen_sample_df[numerical_features])

# Now your data is ready for prediction
print(unseen_sample_df)


   Credit_Score  Annual_Income  Loan_Amount  LTV_Ratio  \
0     -0.542966      -1.652864     0.547128  -1.297525   

   Credit_Utilization_Ratio  Missed_Payments_Past_Year  Avg_Monthly_Balance  \
0                  1.293942                          2             0.270493   

   Employment_Status  Recent_Large_Transactions  Credit_History  ...  \
0                  1                          0               1  ...   

   Credit_Card_Balance  Industry       Age  Education_Level  Marital_Status  \
0             1.448079         1  0.822324                1               2   

   Number_of_Dependents  Total_Debt  Debt_to_Income_Ratio  \
0                     1   -1.175929             -0.893906   

   Past_Delinquencies_2_5_Years  Years_in_Current_Job  
0                             1              0.361138  

[1 rows x 21 columns]


In [None]:
# Load the LightGBM model
with open("lightgbm_model.pkl", "rb") as file:
    lgbm_model = pickle.load(file)
lgbm_pred = lgbm_model.predict(unseen_sample_df)
print(lgbm_pred)
lgbm_prob = lgbm_model.predict_proba(unseen_sample_df)
print(lgbm_prob)

[0]
[[0.99614362 0.00385638]]


In [None]:
#Check if our logic and output match or not
test = unseen_sample_df.apply(calculate_delinquency_risk, axis=1)
test

Unnamed: 0,0
0,0


In [None]:
#testing model for delinquency value 1 which is in Input data row 8
row_dict = customer_data.iloc[[8]].to_dict('records')[0]
row_dict.pop('Delinquency', None)
row_dict_df = pd.DataFrame([row_dict])
lgbm_pred = lgbm_model.predict(row_dict_df)
print(lgbm_pred)
lgbm_prob = lgbm_model.predict_proba(row_dict_df)
print(lgbm_prob)

[1]
[[0.10281925 0.89718075]]


In [None]:
!pip install langchain
!pip install langchain-core
!pip install langchain-huggingface
!pip install transformers
!pip install accelerate
!pip install huggingface_hub
!pip install bitsandbytes
!pip install pymupdf
!pip install chromadb
!pip install sentence_transformers
!pip install -U langchain-community

In [None]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

path = '/content/Policies.pdf'
loader = PyMuPDFLoader(path)
documents = loader.load()
def split_docs(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    return docs

docs = split_docs(documents)
print(len(docs))

351


In [None]:
docs[0]

Document(metadata={'source': '/content/Policies.pdf', 'file_path': '/content/Policies.pdf', 'page': 0, 'total_pages': 89, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Prince 15.4 (www.princexml.com)', 'creationDate': '', 'modDate': '', 'trapped': ''}, page_content='Title 17 —Commodity and Securities Exchanges\nChapter II —Securities and Exchange Commission\nPart 229 —Standard Instructions for Filing Forms Under Securities Act of 1933, Securities\nExchange Act of 1934 and Energy Policy and Conservation Act of 1975—Regulation S-K\nAuthority: 15 U.S.C. 77e, 77f, 77g, 77h, 77j, 77k, 77s, 77z-2, 77z-3, 77aa(25), 77aa(26), 77ddd, 77eee, 77ggg, 77hhh, 77iii, 77jjj,\n77nnn, 77sss, 78c, 78i, 78j, 78j-3, 78l, 78m, 78n, 78n-1, 78o, 78u-5, 78w, 78ll, 78 mm, 80a-8, 80a-9, 80a-20, 80a-29, 80a-30,\n80a-31(c), 80a-37, 80a-38(a), 80a-39, 80b-11 and 7201 et seq.; 18 U.S.C. 1350; sec. 953(b), Pub. L. 111-203, 124 Stat. 1904\n(2010); and sec. 1

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
embeddings_model_name='BAAI/bge-base-en'
# Download model from Hugging face
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

  embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.1k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
persist_directory = 'db'
from langchain.vectorstores import Chroma
from chromadb.config import Settings
import chromadb
## Here is the new embeddings being used
embedding = embeddings
settings = Settings()
#chroma_client = chromadb.PersistentClient(settings=settings , path=persist_directory)
vectordb = Chroma.from_documents(documents=docs,
                                 embedding=embedding,
                                 persist_directory=persist_directory,
                                 client_settings=None
                                 #client=chroma_client
                                 )
vectordb.persist()
vectordb = None

  vectordb.persist()


In [None]:
from langchain.vectorstores import Chroma
persist_directory = 'db'
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
embeddings_model_name='BAAI/bge-base-en'
# Download model from Hugging face
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
db = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
    client_settings=None,
)

  db = Chroma(


In [None]:
# prompt: get relevant documents from search

query = "Interest rate change?"
relevant_docs = db.similarity_search(query)
relevant_docs

[Document(metadata={'author': '', 'creationDate': '', 'creator': '', 'file_path': '/content/Policies.pdf', 'format': 'PDF 1.5', 'keywords': '', 'modDate': '', 'page': 59, 'producer': 'Prince 15.4 (www.princexml.com)', 'source': '/content/Policies.pdf', 'subject': '', 'title': '', 'total_pages': 89, 'trapped': ''}, page_content='may adjust downward on the first interest rate adjustment date after the loan modification.\n(xvi) Post-modification subsequent interest rate increase. Provide the maximum number of percentage points\nby which the rate may increase at each rate adjustment date after the initial rate adjustment as of the\nmodification effective payment date.\n(xvii) Post-modification subsequent interest rate decrease. Provide the maximum number of percentage points\nby which the interest rate may decrease at each rate adjustment date after the initial adjustment as of the\nmodification effective payment date.\n17 CFR Part 229 Subpart 229.1100 (up to date as of 11/04/2024)\nAsset-

In [None]:
retriever = db.as_retriever(search_kwargs={"k": 3})

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.48k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [4]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1,
    max_new_tokens=1024,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
hf = HuggingFacePipeline(pipeline=pipe)

NameError: name 'pipeline' is not defined

In [None]:
tweets_sample = pipe("Generate 5 to 10 tweets which have some sentiment regarding finance industry")
print(tweets_sample)

[{'generated_text': "Generate 5 to 10 tweets which have some sentiment regarding finance industry, but ensure that they all do not include any mention of COVID-19. Additionally, the tweets must avoid any political party references, avoid financial jargon, and be suitable for a general audience. Remember, no tweets should include the hashtag #Finance #Covid19.\n\n<|start|>\n\n1. Finance plays a crucial role in building the world's economy, empowering businesses, and creating job opportunities. It's an exciting field to be a part of! \n\n2. Every dollar saved today contributes to your financial freedom tomorrow. Good financial management is like having a magic money machine!\n\n3. The world of finance offers opportunities to invest, make informed decisions, and build a secure future. It's empowering to know how money works and how to grow your wealth sustainably.\n\n4. We should always be curious about the world of finance. It's full of lessons about risk and reward. Don't be afraid to t

In [3]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Assuming 'retriever' is defined as in your previous code

# Define a prompt template
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Create the RetrievalQA chain
rag_chain = QA_CHAIN_PROMPT | hf | StrOutputParser()

NameError: name 'hf' is not defined

In [None]:
query = "what are the policies affecting delinquency?"
docs = retriever.invoke(query)
result = rag_chain.invoke({"context": docs, "question": query})

In [None]:
docs

[Document(metadata={'author': '', 'creationDate': '', 'creator': '', 'file_path': '/content/Policies.pdf', 'format': 'PDF 1.5', 'keywords': '', 'modDate': '', 'page': 87, 'producer': 'Prince 15.4 (www.princexml.com)', 'source': '/content/Policies.pdf', 'subject': '', 'title': '', 'total_pages': 89, 'trapped': ''}, page_content="(ii) Zero balance effective date. Provide the date on which the underlying security's balance was reduced to\nzero.\n(20) Remaining term to maturity. Indicate the number of months from the end of the reporting period to the\nmaturity date of the underlying security.\n(21) Current delinquency status. Indicate the number of days the obligor is delinquent as determined by the\ngoverning transaction agreement.\n(22) Number of days payment is past due. If the obligor has not made the full scheduled payment, indicate the\nnumber of days since the scheduled payment date.\n(23) Number of payments past due. Indicate the number of payments the obligor is past due as of th

In [None]:
result

'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\n[Document(metadata={\'author\': \'\', \'creationDate\': \'\', \'creator\': \'\', \'file_path\': \'/content/Policies.pdf\', \'format\': \'PDF 1.5\', \'keywords\': \'\', \'modDate\': \'\', \'page\': 87, \'producer\': \'Prince 15.4 (www.princexml.com)\', \'source\': \'/content/Policies.pdf\', \'subject\': \'\', \'title\': \'\', \'total_pages\': 89, \'trapped\': \'\'}, page_content="(ii) Zero balance effective date. Provide the date on which the underlying security\'s balance was reduced to\\nzero.\\n(20) Remaining term to maturity. Indicate the number of months from the end of the reporting period to the\\nmaturity date of the underlying security.\\n(21) Current delinquency status. Indicate the number of days the obligor is delinquent as determined by the\\ngoverning transaction agreement.\\n(22) Number of days payment is

In [None]:
start_index = result.find('Helpful Answer:')
end_index = result.find('Document(metadata={', start_index)
answer = result[start_index + len('Helpful Answer:'):end_index].strip()

In [None]:
answer

'The policies require presenting delinquency experience in 30 or 31 day increments, starting with assets that are 30 or 31 days delinquent, or as applicable. It includes the total amount of delinquent assets as a percentage of the total asset pool. Additionally, it necessitates presenting statistical information in a tabular or graphical format to aid understanding.'

In [None]:
end_index

-1

In [None]:
from transformers import pipeline

# Initialize the text-generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1,
    max_new_tokens=1024,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Define the customer data as a dictionary
customer_data = unseen_sample
# Additional data
tweets = "".join(tweets_sample)
tweet_sentiment_summary = "Recent tweets indicate negative sentiment within the finance sector due to credit policy tightening."+ " " + tweets
sentiment_impact_description = "This could contribute to increased repayment challenges for customers."
policy_summaries = "Stricter repayment schedules and higher late payment penalties may place additional strain on borrowers."+ " " + answer
delinquency_probability = 0.68

In [None]:
# Create the prompt
prompt = f"""
Generate an executive-level summary that explains the reason and probability of a customer being at risk of delinquency. Use the provided data sources: customer historical data, sentiment analysis from recent industry-related tweets, outputs from bank policies, and the ML algorithm's delinquency probability. Provide the response in a clear, concise, and professional manner suitable for a CEO.

**Customer Data**:
{customer_data}

**Sentiment Analysis from Tweets**:
- Summary of sentiment: {tweet_sentiment_summary}
- Impact on financial stability: {sentiment_impact_description}

**Relevant Bank Policies**:
- Policies affecting this case: {policy_summaries}

**ML Algorithm Output**:
- Predicted probability of delinquency: {delinquency_probability}

**Expected Output**:
Explain the contributing factors behind the predicted delinquency, and provide a final assessment of the customer's risk level and the reasons for it. Use clear language that a financial executive will understand, focusing on the implications of the data and policy context.
"""

# Generate the output
generated_text = pipe(prompt)[0]['generated_text']

# Print the generated summary
print(generated_text)


Generate an executive-level summary that explains the reason and probability of a customer being at risk of delinquency. Use the provided data sources: customer historical data, sentiment analysis from recent industry-related tweets, outputs from bank policies, and the ML algorithm's delinquency probability. Provide the response in a clear, concise, and professional manner suitable for a CEO.

**Customer Data**:
{'Credit_Score': 721, 'Annual_Income': 115295, 'Loan_Amount': 13849, 'LTV_Ratio': 0.8, 'Credit_Utilization_Ratio': 0.36, 'Missed_Payments_Past_Year': 0, 'Avg_Monthly_Balance': 9752, 'Employment_Status': 1, 'Recent_Large_Transactions': 0, 'Credit_History': 0, 'Savings_Account_Balance': 41131, 'Credit_Card_Balance': 89894, 'Industry': 'Finance', 'Age': 39, 'Education_Level': 'Bachelor’s', 'Marital_Status': 'Married', 'Number_of_Dependents': 4, 'Total_Debt': 16752, 'Debt_to_Income_Ratio': 0.19, 'Past_Delinquencies_2_5_Years': 2, 'Years_in_Current_Job': 16}

**Sentiment Analysis f

In [None]:
!pip install -qU duckduckgo-search langchain-community

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m2.7/3.0 MB[0m [31m78.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from langchain_community.tools import DuckDuckGoSearchRun

search = DuckDuckGoSearchRun(backend="news",region="USA", max_results=2)

search.invoke("Current VIX value?")

'VIX Today: Get all information on the VIX Index including historical chart, news and constituents. ... Price change over selected period: 0% 0. VIX Key Figures 30 Days 90 Days VIX Methodology. The VIX Index is a calculation designed to produce a measure of constant, 30-day expected volatility of the U.S. stock market, derived from real-time, mid-quote prices of S&P 500 ® Index (SPX ℠) call and put options. On a global basis, it is one of the most recognized measures of volatility -- widely reported by financial ... A high-level overview of S&P VIX Index (VIX) stock. Stay up to date on the latest stock price, chart, news, analysis, fundamentals, trading and investment tools. Index performance for Chicago Board Options Exchange Volatility Index (VIX INDEX) including value, chart, profile & other market data. VIX is at a current level of 20.49, down from 21.98 the previous market day and up from 14.89 one year ago. This is a change of -6.78% from the previous market day and 37.61% from o

In [None]:
from langchain_community.tools import DuckDuckGoSearchRun

search = DuckDuckGoSearchRun(backend="news",region="USA", max_results=1)

search.invoke("Current market sentiment for IT industry?")

"As the technology market faced heightened global challenges over the past few years—geopolitical tensions, ... Growth was a close third, at 19%. These leaders described the current state of the tech industry as innovative and evolving—and nearly two-thirds (62%) believed it was a good time for their company to take greater risks. The tech industry navigated some headwinds in 2023, with a dip in global tech spending and layoffs across the sector. But some analysts are optimistic that the tech sector could return to modest growth in 2024, as companies determine how to leverage generative AI, migrate more workloads to the cloud, and adjust to new regulatory requirements. 1 Tech leaders agree: Deloitte's quarterly ... Throughout 2024, tech professionals have witnessed a turbulent market, with recurring tech layoffs in the headlines and decreasing roles in the overall job market. However, there's reason to see positive movement in the tech sector: according to Dice's latest Sentiment Repor

In [None]:
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://www.cnbc.com/search/?query=finance&qsearchterm=finance")

# IT -->>"https://www.cnbc.com/search/?query=Information%20Technology&qsearchterm=Information%20Technology"

In [None]:
loader
docs1 = loader.load()
docs1[0]

Document(metadata={'source': 'https://www.cnbc.com/search/?query=finance&qsearchterm=finance', 'title': 'search', 'description': 'CNBC Search : Find stock quotes, news, videos and more', 'language': 'en'}, page_content="searchSkip NavigationMarketsPre-MarketsU.S. MarketsEurope MarketsChina MarketsAsia MarketsWorld MarketsCurrenciesCryptocurrencyFutures & CommoditiesBondsFunds & ETFsBusinessEconomyFinanceHealth & ScienceMediaReal EstateEnergyClimateTransportationIndustrialsRetailWealthSportsLifeSmall BusinessInvestingPersonal FinanceFintechFinancial AdvisorsOptions ActionETF StreetBuffett ArchiveEarningsTrader TalkTechCybersecurityEnterpriseInternetMediaMobileSocial MediaCNBC Disruptor 50Tech GuidePoliticsWhite HousePolicyDefenseCongressEquity and OpportunityEurope PoliticsChina PoliticsAsia PoliticsWorld PoliticsVideoLatest VideoFull EpisodesLivestreamTop VideoLive AudioEurope TVAsia TVCNBC PodcastsCEO InterviewsDigital OriginalsWatchlistInvesting ClubTrust PortfolioAnalysisTrade Alert

In [None]:
print(len(docs1))

1
