# Evaluating credit worthiness of a company

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
credit_data = pd.read_csv('./datasets/DNB_SIRA.csv')
credit_data['datepll'] = pd.to_datetime(credit_data['datepll'], format='%Y%m')
credit_data = credit_data.set_index(['datepll']).sort_index().reset_index()
credit_data = credit_data.set_index(['datepll','ticker'])

credit_data = credit_data.drop(columns=['businessname', 'isin', 'cusip6'])

credit_data_f_company = credit_data[credit_data['basecat'] == 'R'] # only keep data about full company
prop_missing = credit_data_f_company.isna().sum(axis=0) / credit_data_f_company.shape[0]

columns_keep = prop_missing[prop_missing < 0.1].index.tolist() # Threshold
credit_data_low_missing = credit_data_f_company[columns_keep]

prop_missing_low = credit_data_low_missing.isna().sum(axis=0) / credit_data_low_missing.shape[0]

low_missing_cols = prop_missing_low[prop_missing_low > 0].index.tolist()

final_cols = credit_data_low_missing[low_missing_cols].dtypes[~(credit_data_low_missing[low_missing_cols].dtypes == 'object')].index.tolist()

credit_data_low_missing = credit_data_low_missing[final_cols]

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Use mean to fill missing values
credit_data_clean = credit_data_low_missing.fillna(credit_data_low_missing.mean())

In [5]:
included_tickers = pd.read_csv('correct_tickers.csv')
included_tickers = list(included_tickers.T.values[0])

In [6]:
available_tickers = []
for ticker, sub_df in credit_data_clean.reset_index().groupby('ticker'):
    if ticker in included_tickers:
        available_tickers.append(ticker)
        sub_df.to_csv('./credit/%s.csv' % ticker)

In [9]:
max_size = 0
for ticker in available_tickers:
    size = pd.read_csv('./credit/%s.csv' % ticker).shape[0]
    if size > max_size:
        max_size = size
        
print(max_size)

177


In [10]:
temp = credit_data_clean.reset_index()
temp = temp[temp['ticker'].isin(included_tickers)]
credit_data_clean = temp.set_index(['datepll','ticker'])

In [11]:
credit_data_clean.to_csv('credit_data_reduced.csv')

In [10]:
def do_pca(ticker):
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA, KernelPCA
    # Apply the KPCA transformation to the ADL dataset = GT variables and their one-month lags

    df_google_only = pd.read_csv('./credit/%s.csv' % ticker).set_index(['datepll'])
    df_google_only = df_google_only.drop(columns=['ticker'])

    # Standard Scale the data
    df_google_only_st = StandardScaler().fit_transform(df_google_only)

    transformer = KernelPCA(kernel='rbf', gamma=10)
    transformer = transformer.fit(df_google_only_st)

    train_img = transformer.transform(df_google_only_st)

    train_img = pd.DataFrame(data = train_img)
    train_img.index = df_google_only.index
    # gather data into a csv
    train_img.to_csv('./pca_out/%s.csv' % ticker)

In [None]:
for ticker in available_tickers:
    do_pca(ticker)

In [11]:
def get_days_in_month(month):
    if month == 2:
        return 29
    else:
        if month % 2 == 0:
            return 31
        else:
            return 30

for ticker in available_tickers:
    expanded_ticker = []
    pca_data = pd.read_csv('./pca_out/%s.csv' % ticker)
    price_data = pd.DataFrame(pd.read_csv('./price_data/%s.csv' % ticker)[['Date','Adj_Close']])  
    pca_data['datepll'] = pd.to_datetime(pca_data['datepll'])
    price_data['Date'] = pd.to_datetime(price_data['Date'])

    for i in range(pca_data.shape[0]):
        row = pca_data.iloc[i,:]
        month = pca_data.iloc[i,0].month
        multiple = get_days_in_month(month)
        dates = pd.date_range(start=pca_data.iloc[i,0], periods=multiple, freq="1d")

        repeated_values = pd.concat([pd.DataFrame(pca_data.iloc[i, :]).T]*multiple)
        repeated_values.iloc[:,0] = dates

        expanded_ticker.append(repeated_values)


    interpolated_data = pd.concat(expanded_ticker)
    merged_data = interpolated_data.merge(price_data, how='inner', left_on='datepll', right_on='Date')
    merged_data = merged_data.drop(columns=['Date'])
    merged_data.to_csv('./interpolated_pca_data/%s.csv' % ticker)
    

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn import linear_model
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.metrics import mean_squared_error
from math import sqrt
from itertools import combinations

selected_features = []
    
def select_kpca_components(ticker):

    interpolated_data = pd.read_csv('./interpolated_pca_data/%s.csv' % ticker)
    adj_close = interpolated_data['Adj_Close']
    adj_close_diff = adj_close.diff(1)
    interpolated_data['Adj_Close_Lag'] = adj_close_diff.shift(1)

    def get_y_and_date(df_param):
        df_param = df_param.drop([0], axis=0)
        df_param = df_param.dropna()
        adj_close_diff = df_param['Adj_Close'].diff(1)
        adj_close_diff = adj_close_diff.dropna()
        return df_param['datepll'], adj_close_diff

    def evaluate_model_performance(true_consumption, predictions):
        from sklearn.metrics import mean_squared_error
        mean_squared_error = mean_squared_error(true_consumption, predictions)
        root_mean_squared_error = sqrt(mean_squared_error) 
        return root_mean_squared_error

    def forecaster(X, y, date):
        tscv = TimeSeriesSplit(n_splits=int(y.shape[0]*0.85))
        predictions = []
        true_consumption = []
        dates = []

        for train_index, test_index in tscv.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)

            X_train, X_test = X[:len(train_index) - 1], X[len(train_index): len(train_index) + 1]
            y_train, y_test = y[:len(train_index) - 1], y[len(train_index): len(train_index) + 1]
            date_train, date_test = date[:len(train_index)], date[len(train_index): len(train_index) +1]

            clf = linear_model.LinearRegression()
            clf.fit(X_train, y_train) # training is conducted on the sample before t
            # print(clf.coef_)
    #         print("reached")
    #         print(X_test.head(1))
    #         print(y_test.head(1))
            prediction = clf.predict(X_test) # nowcast and forecast are distinguished by having different X sets, the caster is the same!!!
            # print(prediction.item(0))
    #         print(y_test.values[0])
            predictions.append(prediction.item(0))
            true_consumption.append(y_test.values[0])
            # print("THE TRUE CONS")
    #         print(y_test.values[0])
    #         print(true_consumption)
            dates.append(date_test.values[0])
            # print(y_test.values[0])
    #         print(prediction.item(0))
            return true_consumption, predictions, dates

    def recursive_kpca_iterator(X_used, X_unused, y, date, df_param, root_mean_sq_error_param):
    #     print("Starting the recursive iterative procedure")
        root_mean_sq_error = root_mean_sq_error_param # as a reference point
        useful_x = ''
        global selected_features 
        
        if len(X_unused) == 0:
    #         print("X_unused is 0")
    #         print(X_used)
            return X_used

        for x in X_unused:
            combo = X_used
            combo.append(x)
            # print(combo)
            X = df_param[combo]
            X = X.dropna()

            true_consumption, predictions, dates = forecaster(X, y, date)

            if root_mean_sq_error > evaluate_model_performance(true_consumption, predictions):
                root_mean_sq_error = evaluate_model_performance(true_consumption, predictions)
                useful_x = x
            combo.remove(x)

        if root_mean_sq_error < root_mean_sq_error_param:
    #         print(useful_x)
            X_used_new = X_used.copy()
            X_used_new.append(useful_x)
            X_unused_new = X_unused.copy()
            X_unused_new.remove(useful_x)
    #         print(X_used_new) 
    #         print(X_unused_new)
    #         print(root_mean_sq_error)
            recursive_kpca_iterator(X_used_new, X_unused_new, y, date, df_param, root_mean_sq_error)
        else:
    #         print("The error term did not improve!")
            selected_features = list(set(X_used))
            print(selected_features)
            print(root_mean_sq_error)
        return X_used

    def generate_lag_consumption_and_composite_X_baseline(df_param):
    #     print(df_param.head())

        return df_param['Adj_Close_Lag']

    def perform_recursive_kpca_iterator(df_param):

        date, y = get_y_and_date(df_param)
        X_baseline = generate_lag_consumption_and_composite_X_baseline(df_param)
        X_baseline = X_baseline.dropna()
        del df_param['Adj_Close']
        del df_param['datepll']

        X_used = ['Adj_Close_Lag']
        X_unused = df_param.columns

    #     print(X_used)
    #     print(X_unused)

        recursive_kpca_iterator(list(X_used), list(X_unused), y, date, df_param, 10000.0) # instead of saving, just print the result

        # At the end of the procedure the best performing KPCA components will be printed with the RMSFE of the best GT-based model
    
    perform_recursive_kpca_iterator(interpolated_data)
#     print(interpolated_data.columns.tolist()[0],type(interpolated_data.columns.tolist()[0]))
    print(selected_features)
#     print(selected_features[0], type(selected_features[0]))
#     print(set(interpolated_data.columns.tolist()).intersection(set(selected_features)))
    selected_data = interpolated_data[selected_features]
    selected_data.to_csv('./selected_kpca_components/%s.csv' % ticker)

In [10]:
for ticker in available_tickers:
    print(ticker)
    select_kpca_components(ticker)

AA
['15', '69', 'Adj_Close_Lag']
2.9426733756166e-05
['15', '69', 'Adj_Close_Lag']
AAPL
['156', '44', 'Adj_Close_Lag', '136', '161', '77', '165', '4']
1.077009805922069e-07
['156', '44', 'Adj_Close_Lag', '136', '161', '77', '165', '4']
ABBV
['79', '3', '39', 'Adj_Close_Lag', '11', '68', '81', '5']
1.1409665503947508e-05
['79', '3', '39', 'Adj_Close_Lag', '11', '68', '81', '5']
ABC
['123', '41', '39', '44', 'Adj_Close_Lag', '147', '84', '43']
3.47375289516845e-07
['123', '41', '39', '44', 'Adj_Close_Lag', '147', '84', '43']
ABT
['Adj_Close_Lag', '158', '96']
1.0363104420613861e-05
['Adj_Close_Lag', '158', '96']
ACRX
['103', '59', '68', 'Adj_Close_Lag', '55']
6.383000257442761e-05
['103', '59', '68', 'Adj_Close_Lag', '55']
ADBE
['90', '171', '166', '173', '129', 'Adj_Close_Lag']
6.872868678460542e-05
['90', '171', '166', '173', '129', 'Adj_Close_Lag']
ADI
['164', '79', '38', '123', '175', '133', '138', '159', '170', '39', 'Adj_Close_Lag', '145', '88', '13', '76', '141', '81', '96', '64']

In [73]:
interpolated_data.shape

(3699, 179)

In [51]:
sanity = 0

def trythis():
    global sanity
    sanity = 1

In [52]:
trythis()

In [53]:
sanity 

1

In [14]:
test = pd.read_csv('../test.csv')
test

Unnamed: 0,Date,Headline
0,01/01/2005,This is
1,01/01/2005,A Test
2,02/01/2005,To See
3,03/01/2005,If this
4,03/01/2005,is working


In [20]:
test.groupby('Date')['Headline'].apply(lambda x: ' '.join(x))

Date
01/01/2005       This is  A Test
02/01/2005               To See 
03/01/2005    If this is working
Name: Headline, dtype: object