In [138]:
%pip install https://github.com/ydataai/pandas-profiling/archive/master.zip

import pandas as pd
import numpy as np
import sys
from pandas_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder 

# Data Modeling Libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score

# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import re

#Save the model
#from joblib import dump, load

Collecting https://github.com/ydataai/pandas-profiling/archive/master.zip
  Using cached https://github.com/ydataai/pandas-profiling/archive/master.zip
Note: you may need to restart the kernel to use updated packages.


### **2. Perfilamiento y entendimiento de los datos**



#### Lectura de los datos
Primero, se leen los datos y se visualizan que se hayan leido correctamente.

In [53]:
source = 'https://raw.githubusercontent.com/nparis87/MachineLearning/main/SearchBonds.csv'

data = pd.read_csv(source, sep=',')
data.head()

Unnamed: 0,Issuer,Ticker,Coupon,Maturity,Issue Date,ISIN,Principal Currency,Country of Issue,Issuer Type,Instrument Type,...,Yield to Maturity,Option Adjusted Spread,Amount Outstanding,Parent Domicile,Sector,Rating,Actual,Outlook,Watch,Scope
0,Telecom Argentina SA,TECOBT,7.25,30/06/2002,30/06/1999,XS0099123712,Euro,Eurobond,Corporate,Bond,...,--,--,250000000,,Telecommunications,B-,Actual,Rating Outlook Stable,,Foreign
1,Telecom Argentina SA,TECOBT,12.5,1/03/2003,1/03/2000,US12686NAD75,US Dollar,United States,Corporate,Note,...,--,--,100000000,,Telecommunications,B-,Actual,Rating Outlook Stable,,Foreign
2,Telecom Argentina SA,TECOBT,12.5,1/03/2003,1/03/2000,USP19157AB50,US Dollar,Eurobond,Corporate,Note,...,--,--,100000000,,Telecommunications,B-,Actual,Rating Outlook Stable,,Foreign
3,Telecom Argentina SA,TECOBT,7.625,6/04/2003,6/04/2000,XS0109260686,Euro,Eurobond,Corporate,Note,...,--,--,250000000,,Telecommunications,B-,Actual,Rating Outlook Stable,,Foreign
4,Banco Hipotecario SA,BHIP,10.0,16/04/2003,16/04/1998,US05959FAA49,US Dollar,Eurobond,Corporate,Inhaberschuldverschreibung,...,--,--,300000000,Argentina,Banking,WD,Actual,,,Foreign


In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Issuer                  10000 non-null  object
 1   Ticker                  9977 non-null   object
 2   Coupon                  10000 non-null  object
 3   Maturity                10000 non-null  object
 4   Issue Date              10000 non-null  object
 5   ISIN                    9994 non-null   object
 6   Principal Currency      10000 non-null  object
 7   Country of Issue        9989 non-null   object
 8   Issuer Type             10000 non-null  object
 9   Instrument Type         10000 non-null  object
 10  Coupon Type             10000 non-null  object
 11  Amount Issued (USD)     10000 non-null  object
 12  Bond Grade              10000 non-null  object
 13  Yield to Maturity       10000 non-null  object
 14  Option Adjusted Spread  10000 non-null  object
 15  Am

In [55]:
data.Watch.fillna('Neutral', inplace=True)

In [61]:
columnas1 = ['Coupon', 'Yield to Maturity', 'Option Adjusted Spread', 'Amount Outstanding']
data.dropna(inplace=True)
for col in columnas1:
    data[col] = data[col].apply(lambda x: x.replace('--','0'))
    data[col] = data[col].apply(lambda x: float(x.replace(',','')))
        

In [65]:
rating = ['AAA','AA+','AA','AA-','A+','A', 'A-', 'BBB+', 'BBB','BBB-', 'BB+','BB', 'BB-','B+','B','B-','CCC+', 'CCC','CCC-','CC' , 'C','D','RD','WD']
rating2 = dict(zip(rating, range(1, len(rating)+1)))

In [66]:
data.Rating = data.Rating.map(rating2)
data.Maturity = pd.to_datetime(data.Maturity)

In [67]:
data.iloc[0,:]

Issuer                    CESP Companhia Energetica de Sao Paulo
Ticker                                                    AURECA
Coupon                                                      10.5
Maturity                                     2004-04-03 00:00:00
Issue Date                                             4/03/2001
ISIN                                                US12517GAA31
Principal Currency                                     US Dollar
Country of Issue                                   United States
Issuer Type                                            Corporate
Instrument Type                                             Note
Coupon Type                           Plain Vanilla Fixed Coupon
Amount Issued (USD)                                  300,000,000
Bond Grade                                            High Yield
Yield to Maturity                                            0.0
Option Adjusted Spread                                       0.0
Amount Outstanding       

In [118]:
curr = data['Principal Currency'].value_counts().index[:5].tolist() # detectamos las 5 principales monedas y las dejamos como lista
dom = data['Parent Domicile'].value_counts().index[:5].tolist() # detectamos las 5 principales paises de riesgo y las dejamos como lista
sector = data['Sector'].value_counts().index[:5].tolist() # detectamos las 5 principales paises de riesgo y las dejamos como lista

data.loc[:,'Maturity'] = (data.Maturity - pd.datetime.today()).dt.days/365 # convertimos MAturuty a años al vencimiento
data = data[data['Maturity'] > 0.5]                                        # filtramos maturity > 0.5 años

  data.loc[:,'Maturity'] = (data.Maturity - pd.datetime.today()).dt.days/365


In [119]:
select = data['Parent Domicile'].isin(dom) & data['Principal Currency'].isin(curr) & data['Sector'].isin(sector)

data = data.loc[select,:]

In [137]:
columnasX = ['Maturity','Principal Currency', 'Issuer Type', 'Bond Grade','Amount Outstanding','Parent Domicile', 'Sector', 'Rating', 'Outlook', 'Watch']
X = pd.get_dummies(data[columnasX])
Y = data['Yield to Maturity']

In [133]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, random_state = 20, test_size=0.2)

In [134]:
lr = LinearRegression().fit(X_train, Y_train)

In [135]:
pd.options.display.float_format = '{:,.2f}'.format
pd.DataFrame(lr.coef_, index=X.columns)

Unnamed: 0,0
Maturity,21.09
Amount Outstanding,0.0
Rating,5.38
Principal Currency_British Pound,32.01
Principal Currency_Chinese Yuan,5.85
Principal Currency_Euro,-7.04
Principal Currency_Japanese Yen,-37.97
Principal Currency_US Dollar,7.15
Issuer Type_Corporate,-0.0
Bond Grade_High Yield,154.69


In [136]:
Y_new  = lr.predict(X_test)

print('Mean Squared Error:', mean_squared_error(Y_test, Y_new)) 
print('Mean Absolute Error:', median_absolute_error(Y_test, Y_new))
print('R2 score:', r2_score(Y_test, Y_new))

Mean Squared Error: 34419.2254265878
Mean Absolute Error: 43.01804786134352
R2 score: 0.10718604340601989


#### Modelo de Clasificación


In [145]:
columnasX = ['Maturity','Principal Currency', 'Issuer Type', 'Bond Grade','Amount Outstanding','Parent Domicile', 'Sector', 'Rating', 'Outlook', 'Watch', 'Yield to Maturity']

X = pd.get_dummies(data[columnasX])

le = LabelEncoder().fit(data['Bond Grade'])
Y = le.transform(data['Bond Grade'])


In [148]:
from sklearn.linear_model import LogisticRegression

In [147]:
# Trabajo en clase....