# The logic flow

1. select one company
2. find the corresponding state temp data
3. apply a dimension reduction method (FPCA or ICA) to the temperature data
4. add a random term
5. fit the model $CFP \sim temp + bushfire$


# Set up

In [3]:
import numpy as np
import os
import pandas as pd
import json
import datetime
import calendar

from sklearn.decomposition import FastICA as ICA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from skfda.preprocessing.dim_reduction.feature_extraction import FPCA
from skfda.representation.grid import FDataGrid
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Generate Data and Present Results

In [21]:
CFP = ['CurrentRatio', 'NetMargin', 'OperatingMargin', 'ROA1', 'ROE1']

def read_cfp():
    """
    Load the per-company financial ratio (CFP) tables and the state of each company.

    Paths are resolved from the parent of the current working directory:
    `<parent>/data/us_state.json` and every `*.csv` file inside
    `<parent>/data/SEC_FIN_data/agriculture_individual_with_ratios`.
    Every csv must contain the columns 'location', 'period.end' and each name in CFP.

    output = cfp_data of agriculture, state_data
        cfp_data[cik][cfp] = one-row DataFrame (index = [cfp], columns = the 'period.end'
                             labels of the file) without the missing observations
        state_data[cik]    = us_state.json entry of the first 'location' value of the file
        (cik = the csv file name without its extension)
    """
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    data_path = os.path.join(root_path, 'data')

    # load us state data
    with open(os.path.join(data_path, 'us_state.json'), 'r') as f:
        us_state_dict = json.load(f)

    # load cfp data, one DataFrame per company file
    cfp_dir = os.path.join(data_path, 'SEC_FIN_data/agriculture_individual_with_ratios')
    data = {}
    for file in os.listdir(cfp_dir):
        if not file.endswith('.csv'):
            continue
        # splitext removes only the extension; str.strip('.csv') would also eat any
        # leading/trailing '.', 'c', 's' or 'v' characters of the name itself
        cik = os.path.splitext(file)[0]
        with open(os.path.join(cfp_dir, file), 'r') as f:
            data[cik] = pd.read_csv(f)

    cfp_data = {}
    state_data = {}

    for cik, frame in data.items():
        cfp_data[cik] = {}
        state_data[cik] = us_state_dict[frame['location'].values[0]]
        for cfp in CFP:
            values = list(frame[cfp])
            # one row named after the ratio, one column per reporting period
            df = pd.DataFrame(columns = frame['period.end'], index = [cfp])
            df.loc[cfp] = values
            # keep only the periods that actually have an observation
            observed = [not pd.isna(value) for value in values]
            cfp_data[cik][cfp] = df.loc[:, observed]

    return cfp_data, state_data

def read_climate(climate:str):
    """
    Read one of the two climate csv files into a DataFrame whose index is the first csv column.

    :climate = 'bushfire' or 'temp'
        'bushfire' -> <parent of cwd>/data/climate/BushFireData_ByState.csv
        'temp'     -> <parent of cwd>/data/climate/US_temperature_data/Monthly_state_ave_Temp(1980-2020).csv
    output = the loaded DataFrame
    Any other value fails the assertion below.
    """
    assert climate in ('bushfire', 'temp'), "plz type 'bushfire' or 'temp'"

    project_root = os.path.abspath(os.path.dirname(os.getcwd()))
    climate_dir = os.path.join(project_root, 'data', 'climate')

    if climate == 'bushfire':
        csv_path = os.path.join(climate_dir, 'BushFireData_ByState.csv')
    else:
        csv_path = os.path.join(climate_dir, 'US_temperature_data',
                                'Monthly_state_ave_Temp(1980-2020).csv')

    # the first column becomes the index
    return pd.read_csv(csv_path, index_col=0)

def _clamped_date(year, month, day):
    """Build a date, moving `day` back to the last day of the month when it does not exist (29 Feb)."""
    return datetime.date(year, month, min(day, calendar.monthrange(year, month)[1]))


def _window_values(monthly, start, end):
    """Return the values of the (month_end_date, value) pairs whose date lies in [start, end)."""
    return [value for month_end, value in monthly if start <= month_end < end]


def generate_temp_pairs(cfp_data, state_data, temp_data, cfp_diff:int, climate_diff:int):
    """
    Pair the CFP changes of every company with the temperature changes of its state.

    :cfp_data = {company: {cfp: one-row DataFrame}}, the column labels are period ends written
                as 'day/month/yy'; column k + 1 is treated as the period preceding column k
    :state_data = {company: state}, the state is looked up as a column of temp_data
    :temp_data = DataFrame with 'year/month' row labels and one column per state
    :cfp_diff = number of following columns averaged as CFP baseline (0 = baseline is 0)
    :climate_diff = number of previous years whose same interval is averaged as temperature
                    baseline (0 = baseline is 0)
    output = a nested dict, the key is the company name; result[company]['state'] is the state and
                result[company][cfp] is a dictionary with the keys
                "cfp_data", "climate_data", "cfp_year", "climate_year", and "state",
                the "cfp_data" and "climate_data" will feed to correlation analysis

    For every pair of consecutive periods (past, now) the temperature value is the mean of the
    months whose last day lies in [past, now) minus the baseline; periods ending in 2021 or later
    and periods without any temperature (or baseline) month are skipped.
    A company whose state is missing from temp_data gets empty lists for every cfp.
    """
    result = {}

    # shape data as year, cfp, temp
    for comp in cfp_data.keys():
        state = state_data[comp]
        result[comp] = {'state': state}
        try:
            state_temp = temp_data.loc[:, [state]] # get state temp data
        except KeyError:
            print("can't find the corresponding state in temperature dataset.")
            # never fall through: the temperatures of the previously processed
            # company would silently be reused for this one
            for c in CFP:
                result[comp][c] = {"cfp_data": [], "climate_data": [], "cfp_year": [],
                                   "climate_year": [], "state": state}
            continue

        # (last day of the month, temperature) for every row; it only depends on the state
        month_ends = []
        for label in state_temp.index.values:
            parts = label.split('/')
            year, month = int(parts[0]), int(parts[1])
            month_ends.append(datetime.date(year, month, calendar.monthrange(year, month)[1]))
        monthly = list(zip(month_ends, state_temp.iloc[:, 0].to_numpy()))

        for c in CFP:
            result[comp][c] = {}
            row = cfp_data[comp][c]
            columns = list(row.columns)

            # get cfp timeline; remember the column position of every parsed label so that an
            # unparseable label cannot shift the following periods against `columns`
            cfp_time = []
            cfp_position = []
            for position, label in enumerate(columns):
                try:
                    parts = label.split('/')
                    period_end = datetime.date(int('20' + parts[-1]), int(parts[1]), int(parts[0]))
                except (AttributeError, IndexError, TypeError, ValueError):
                    continue
                cfp_time.append(period_end)
                cfp_position.append(position)

            if not cfp_time:
                print(comp, c, cfp_time)

            temp_curr_avg = []
            temp_past_avg = []
            temp_time = []
            cfp_index = []
            # the last element has no preceding period, so it is not checked
            for k in range(len(cfp_time) - 1):
                now = cfp_time[k]
                past = cfp_time[k + 1]
                if now.year >= 2021: # only use the data before 2021
                    continue

                # temp avg. of the current interval
                current = _window_values(monthly, past, now)
                if not current: # can't find corresponding data points
                    continue

                # temp avg. of the same interval over the past n years
                if climate_diff != 0:
                    earlier = []
                    for n in range(1, climate_diff + 1):
                        earlier.extend(_window_values(
                            monthly,
                            _clamped_date(past.year - n, past.month, past.day),
                            _clamped_date(now.year - n, now.month, now.day)))
                    if not earlier: # no baseline available, skip instead of dividing by zero
                        continue
                    baseline = sum(earlier) / len(earlier)
                else:
                    baseline = 0

                temp_curr_avg.append(sum(current) / len(current))
                temp_past_avg.append(baseline)
                cfp_index.append(cfp_position[k])
                temp_time.append(now)

            # get cfp data
            cfp = []
            cfp_timestamp = []
            for i in cfp_index:
                if i + cfp_diff >= len(columns):
                    # i = current cfp data point
                    # i + cfp_diff = the farthest cfp data point
                    # if it is out of range the moving avg. cannot be computed for this
                    # and any later position, so stop here
                    break
                try:
                    cfp_curr = float(row[columns[i]].values[0])
                except (TypeError, ValueError):
                    cfp_curr = 0
                cfp_past = 0
                for n in range(1, cfp_diff + 1):
                    cfp_past += float(row[columns[i + n]].values[0])
                cfp_avg = cfp_past / cfp_diff if cfp_diff != 0 else 0
                # get the cfp difference
                cfp.append(cfp_curr - cfp_avg)
                cfp_timestamp.append(columns[i])

            # get temp difference
            temp = [a - b for a, b in zip(temp_curr_avg, temp_past_avg)]

            result[comp][c]["cfp_data"] = cfp
            result[comp][c]["climate_data"] = temp
            result[comp][c]["cfp_year"] = cfp_timestamp
            result[comp][c]["climate_year"] = temp_time
            result[comp][c]['state'] = state
    return result

def shape_data(pairs, temp_data, bush_data):
    """
    Shape the temperature and bushfire data into the fpca/ica friendly style (numpy arrays).

    :pairs = output of generate_temp_pairs,
             {cik: {'state': state, cfp: {'cfp_data': [...], 'climate_year': [dates], ...}}}
    :temp_data = DataFrame with 'year/month' row labels and one column per state
    :bush_data = DataFrame with one column per state; rows are looked up with the year of a
                 period (presumably the index holds years -- verify against the csv).
                 The caller's frame is not modified.
    :results = {cik: {cfp: {cfp_binary, temp_data, bush_data}, state_data: shaped_state_data}}
        cfp_binary = 1 when the cfp value of a period is larger than the one of the next list
                     entry, else 0; only entries 306 to 425 days apart are used
        temp_data  = one row of 12 monthly temperatures per kept period
        bush_data  = the standardised bushfire value of the year of each kept period
        state_data = all 12-month temperature windows of the state, newest first
        results[cik][cfp] is None when no period is kept and {} when the three lists differ in
        length; results[cik] is {} when the state is missing from temp_data.
    """

    results = {}
    # scale bush_data; the labels are title-cased and stripped on the scaled copy only, so the
    # frame passed in by the caller keeps its own column names
    columns = [name.title().strip() for name in bush_data.columns]
    index = bush_data.index
    scaler = StandardScaler().fit(bush_data.to_numpy())
    bush_data = pd.DataFrame(scaler.transform(bush_data.to_numpy()), index = index, columns = columns)

    for cik in pairs.keys():
        if cik == '0001756180': # only 2 data points
            continue
        results[cik] = {}
        state = pairs[cik]['state']
        if state not in temp_data.columns:
            print(pairs[cik], state, 'doesnot in the temp data.')
            continue

        # one-row frames with column = time; they only depend on the state
        state_temp_data = pd.DataFrame(temp_data[state]).transpose()
        state_bush_data = pd.DataFrame(bush_data[state]).transpose()

        for cfp in CFP:
            results[cik][cfp] = {}
            climate_year = pairs[cik][cfp]['climate_year']
            cfp_data = pairs[cik][cfp]['cfp_data']
            temp_period = [] # period ends that have a continuous successor
            cfp_binary = []  # 1 = cfp went up compared with the next entry
            for i in range(0, len(climate_year) - 1):
                gap = climate_year[i] - climate_year[i + 1]
                if datetime.timedelta(days=306) <= gap <= datetime.timedelta(days=425):
                    temp_period.append(climate_year[i])
                    cfp_binary.append(1 if cfp_data[i] - cfp_data[i + 1] > 0 else 0)

            if not temp_period:
                print(cik, cfp, 'does not have enough data points.')
                results[cik][cfp] = None
                continue

            # every period is labelled with the month of the most recent kept period
            month = temp_period[0].month
            temp_timestamp = []
            bush_value = []
            for t in temp_period:
                temp_timestamp.append(str(t.year) + '/' + str(month)) # 'year/month' format
                if t.year in state_bush_data.columns:
                    bush_value.append(state_bush_data[t.year])

            # latest column of that month; the pattern is anchored with '$' because a bare '/1'
            # would also match the months 10, 11 and 12
            month_columns = state_temp_data.filter(regex = '/' + str(month) + '$').columns
            start_point = state_temp_data.columns.get_loc(month_columns[-1])

            # the first row of both arrays is an uninitialised placeholder, removed below
            shaped_state_temp_data = np.empty([1, 12])
            temp_value = np.empty([1, 12])
            cnt = start_point
            while cnt >= 12:
                # 12 consecutive months ending at cnt, to avoid 2020.6 - 2019.6 situation
                period = state_temp_data.columns[cnt - 11: cnt + 1]
                period_data = state_temp_data[period].loc[state,].to_numpy()
                shaped_state_temp_data = np.vstack((shaped_state_temp_data, period_data))
                for tp in temp_timestamp:
                    if tp in period: # this window belongs to a kept period
                        temp_value = np.vstack((temp_value, period_data))
                cnt -= 12 # step back one year

            shaped_state_temp_data = shaped_state_temp_data[1:] # remove the placeholder row
            temp_value = temp_value[1:] # remove the placeholder row

            if len(cfp_binary) == len(temp_value) == len(bush_value):
                results[cik][cfp]['cfp_binary'] = cfp_binary
                results[cik][cfp]['temp_data'] = temp_value
                results[cik][cfp]['bush_data'] = bush_value
                results[cik]['state_data'] = shaped_state_temp_data
            else:
                print('problem')
                print(cik, cfp, len(temp_value), len(cfp_binary))

    return results
          
def dimession_reducation_model(data, mode: str, n_comps: int, model: str, title: bool):
    """
    Reduce the temperature curves, add the bushfire value and score a classifier with
    leave-one-out for every company and cfp.

    :data = output of shape_data
    :mode = 'fpca' or 'ica'
    :n_comps = the components
    :model = 'svm' or 'rf' or 'lr'
    :title = boolean, whether a title in the output
    output = DataFrame (a captioned Styler when title is True) with one row per company.
             'rf' / 'lr': one column per cfp holding the share of correctly predicted left-out
             points as a string.
             'svm': per cfp the columns 'avg_score', 'avg_nSV(avg_data_len)' and 'proportion'
             (average number of support vectors divided by the average training size).
             A cell is None when no fold could be fitted or shape_data provided no data.
    raises ValueError for an unknown mode or model.
    """
    if mode not in ('ica', 'fpca'):
        raise ValueError("only support 'fpca' and 'ica' methods.")
    if model not in ('svm', 'rf', 'lr'):
        raise ValueError("only support 'svm', 'rf' and 'lr' models.")

    if model == 'svm':
        columns = pd.MultiIndex.from_product([CFP, ['avg_score', 'avg_nSV(avg_data_len)', 'proportion']],
                                            names = ["CFP", "SVM"])
    else:
        columns = CFP
    index = pd.Index(data.keys(), name = "Company:")
    df = pd.DataFrame(columns = columns, index = index)
    grid_points = list(range(1, 13)) # the 12 months of a temperature curve (fpca only)

    for cik in data.keys():
        company = data[cik]
        if 'state_data' not in company:
            # shape_data kept nothing for this company (e.g. unknown state)
            for cfp in CFP:
                df.loc[cik, cfp] = None
            continue

        if mode == "ica":
            transformer = ICA(n_components = n_comps, random_state = 0, max_iter = 10000000)
        else:
            transformer = FPCA(n_components = n_comps)
        # the scaler and the components are fitted on all yearly curves of the state
        scaler = StandardScaler().fit(company['state_data'])
        X_temp_scaled = scaler.transform(company['state_data'])
        if mode == "ica":
            fitted = transformer.fit(X_temp_scaled)
        else:
            fitted = transformer.fit(FDataGrid(X_temp_scaled, grid_points))

        for cfp in CFP:
            entry = company.get(cfp)
            if not entry: # None or {}: shape_data could not align this cfp
                df.loc[cik, cfp] = None
                continue

            # transform the temp_data of the cfp in the same manner
            temp_scaled = scaler.transform(entry['temp_data'])
            if mode == 'ica':
                reduced = fitted.transform(temp_scaled)
            else:
                reduced = fitted.transform(FDataGrid(temp_scaled, grid_points))
            X = np.hstack((reduced, entry['bush_data'])) # cbind the temp + bushfire
            Y = entry['cfp_binary']

            score = 0
            cnt = 0
            nSV = 0
            data_len = 0
            for i in range(len(Y)):
                # np.delete returns new arrays, X and Y themselves stay complete
                X_LOO = np.delete(X, i, axis = 0)
                Y_LOO = np.delete(Y, i)
                if model == 'svm':
                    clf = SVC(gamma='auto')
                elif model == 'rf':
                    clf = RFC(bootstrap = False, random_state = 0) # use all data
                else:
                    clf = LogisticRegression(random_state=0, max_iter = 10000)
                try:
                    clf.fit(X_LOO, Y_LOO)
                    pred = clf.predict([X[i]])[0] # get the prediction of the LOO
                except Exception:
                    # best effort: a fold that cannot be fitted is left out of the average
                    continue
                if pred == Y[i]:
                    score += 1
                if model == 'svm':
                    nSV += sum(clf.n_support_)
                    data_len += len(Y_LOO)
                cnt += 1

            if not cnt:
                df.loc[cik, cfp] = None
            elif model == 'svm':
                avg_nSV = round(nSV/cnt, 3)
                avg_data_len = round(data_len/cnt, 3)
                df.loc[cik, cfp] = [
                    str(round(score/cnt, 3)), # avg_score
                    str(avg_nSV) + '(' + str(avg_data_len) + ')', # avg_nSV(avg_data_len)
                    str(round(avg_nSV/avg_data_len, 3)), # proportion
                ]
            else:
                df.loc[cik, cfp] = str(round(score/cnt, 3))

    df_title = 'The avg score result of '
    df_title += mode + ' with ' + str(n_comps) + ' components and ' + model
    # df.to_csv(df_title + '.csv')

    if title:
        return df.style.set_caption(df_title)
    else:
        return df

In [None]:
# raw inputs: company ratios with their state, monthly temperatures, bushfire table
cfp_data, state_data = read_cfp()
temp_data = read_climate(climate='temp')
bush_data = read_climate(climate='bushfire')
# cfp_diff=0 keeps the cfp values without a baseline,
# climate_diff=1 compares each interval with the same interval one year earlier
pairs = generate_temp_pairs(cfp_data, state_data, temp_data, cfp_diff=0, climate_diff=1)
# model-ready arrays per company and cfp
data = shape_data(pairs=pairs, temp_data=temp_data, bush_data=bush_data)

In [24]:
# leave-one-out score per company and cfp: 4 fPCs + logistic regression
dimession_reducation_model(data, mode='fpca', n_comps=4, model='lr', title=True)

Unnamed: 0_level_0,CurrentRatio,NetMargin,OperatingMargin,ROA1,ROE1
Company:,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1159275,0.545,0.0,0.0,0.7,0.6
835011,0.3,0.4,0.1,0.9,0.6
1575858,1.0,0.0,0.0,0.5,1.0
37785,,,,,
1441693,0.571,0.75,0.375,0.571,0.75
1469443,0.2,0.2,0.2,0.8,0.8
1482541,0.667,0.333,0.333,0.444,0.444
3545,0.583,0.308,0.462,0.667,0.5
16160,0.643,0.357,0.429,0.357,0.357
1477246,0.333,0.111,0.667,0.111,0.111


In [25]:
# leave-one-out score per company and cfp: 4 ICs + logistic regression
dimession_reducation_model(data, mode='ica', n_comps=4, model='lr', title=True)

Unnamed: 0_level_0,CurrentRatio,NetMargin,OperatingMargin,ROA1,ROE1
Company:,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1159275,0.545,0.0,0.0,0.5,0.6
835011,0.3,0.6,0.0,0.9,0.7
1575858,1.0,0.0,0.0,0.5,1.0
37785,,,,,
1441693,0.571,0.75,0.5,0.571,0.75
1469443,0.4,0.4,0.4,0.8,0.8
1482541,0.556,0.444,0.444,0.556,0.556
3545,0.5,0.077,0.538,0.5,0.0
16160,0.714,0.214,0.429,0.214,0.214
1477246,0.556,0.0,0.556,0.0,0.0


In [26]:
# leave-one-out score and support-vector share per company and cfp: 4 ICs + SVM
dimession_reducation_model(data, mode='ica', n_comps=4, model='svm', title=True)

CFP,CurrentRatio,CurrentRatio,CurrentRatio,NetMargin,NetMargin,NetMargin,OperatingMargin,OperatingMargin,OperatingMargin,ROA1,ROA1,ROA1,ROE1,ROE1,ROE1
SVM,avg_score,avg_nSV(avg_data_len),proportion,avg_score,avg_nSV(avg_data_len),proportion,avg_score,avg_nSV(avg_data_len),proportion,avg_score,avg_nSV(avg_data_len),proportion,avg_score,avg_nSV(avg_data_len),proportion
Company:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
1159275,0.636,8.727(10.0),0.873,0.0,2.5(3.0),0.833,0.0,2.5(3.0),0.833,0.5,8.1(9.0),0.9,0.7,6.4(9.0),0.711
835011,0.2,8.0(9.0),0.889,0.7,8.4(9.0),0.933,0.0,8.9(9.0),0.989,0.9,6.3(9.0),0.7,0.7,4.6(9.0),0.511
1575858,1.0,2.667(3.0),0.889,0.0,2.5(3.0),0.833,0.0,2.5(3.0),0.833,0.0,2.5(3.0),0.833,1.0,2.667(3.0),0.889
37785,,,,,,,,,,,,,,,
1441693,0.571,5.429(6.0),0.905,0.75,4.375(7.0),0.625,0.5,6.125(7.0),0.875,0.571,5.429(6.0),0.905,0.75,4.75(7.0),0.679
1469443,0.2,3.6(4.0),0.9,0.4,3.6(4.0),0.9,0.4,3.6(4.0),0.9,0.6,3.4(4.0),0.85,0.6,3.4(4.0),0.85
1482541,0.222,7.111(8.0),0.889,0.444,7.444(8.0),0.93,0.444,7.444(8.0),0.93,0.222,7.111(8.0),0.889,0.667,6.556(8.0),0.82
3545,0.5,9.583(11.0),0.871,0.0,11.923(12.0),0.994,0.615,10.538(12.0),0.878,0.583,10.25(11.0),0.932,0.0,11.0(11.0),1.0
16160,0.714,8.786(13.0),0.676,0.0,13.0(13.0),1.0,0.571,12.5(13.0),0.962,0.0,13.0(13.0),1.0,0.0,13.0(13.0),1.0
1477246,0.667,7.333(8.0),0.917,0.0,7.778(8.0),0.972,0.222,7.444(8.0),0.93,0.0,7.778(8.0),0.972,0.0,7.778(8.0),0.972


In [5]:
# leave-one-out score and support-vector share per company and cfp: 4 fPCs + SVM
dimession_reducation_model(data, mode='fpca', n_comps=4, model='svm', title=True)

CFP,CurrentRatio,CurrentRatio,CurrentRatio,NetMargin,NetMargin,NetMargin,OperatingMargin,OperatingMargin,OperatingMargin,ROA1,ROA1,ROA1,ROE1,ROE1,ROE1
SVM,avg_score,avg_nSV(avg_data_len),proportion,avg_score,avg_nSV(avg_data_len),proportion,avg_score,avg_nSV(avg_data_len),proportion,avg_score,avg_nSV(avg_data_len),proportion,avg_score,avg_nSV(avg_data_len),proportion
Company:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
1159275,0.364,9.818(10.0),0.982,0.0,3.0(3.0),1.0,0.0,3.0(3.0),1.0,0.3,9.0(9.0),1.0,0.5,8.9(9.0),0.989
835011,0.4,9.0(9.0),1.0,0.1,9.0(9.0),1.0,0.0,9.0(9.0),1.0,0.7,9.0(9.0),1.0,0.8,8.9(9.0),0.989
1575858,1.0,3.0(3.0),1.0,0.0,3.0(3.0),1.0,0.0,3.0(3.0),1.0,0.0,3.0(3.0),1.0,1.0,3.0(3.0),1.0
37785,,,,,,,,,,,,,,,
1441693,0.429,6.0(6.0),1.0,0.75,6.625(7.0),0.946,0.625,7.0(7.0),1.0,0.429,6.0(6.0),1.0,0.75,6.25(7.0),0.893
1469443,0.2,3.8(4.0),0.95,0.2,3.8(4.0),0.95,0.2,3.8(4.0),0.95,0.4,3.8(4.0),0.95,0.4,3.8(4.0),0.95
1482541,0.667,8.0(8.0),1.0,0.222,8.0(8.0),1.0,0.222,8.0(8.0),1.0,0.333,8.0(8.0),1.0,0.556,8.0(8.0),1.0
3545,0.5,11.0(11.0),1.0,0.231,12.0(12.0),1.0,0.538,11.923(12.0),0.994,0.333,11.0(11.0),1.0,0.0,11.0(11.0),1.0
16160,0.714,12.071(13.0),0.929,0.071,13.0(13.0),1.0,0.429,13.0(13.0),1.0,0.071,13.0(13.0),1.0,0.071,13.0(13.0),1.0
1477246,0.667,7.778(8.0),0.972,0.111,8.0(8.0),1.0,0.333,8.0(8.0),1.0,0.111,8.0(8.0),1.0,0.111,8.0(8.0),1.0


In [6]:
# leave-one-out score per company and cfp: 4 ICs + random forest
dimession_reducation_model(data, mode='ica', n_comps=4, model='rf', title=True)

Unnamed: 0_level_0,CurrentRatio,NetMargin,OperatingMargin,ROA1,ROE1
Company:,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1159275,0.364,0.0,0.0,0.5,0.6
835011,0.1,0.4,0.2,0.7,0.6
1575858,0.25,0.5,0.5,0.5,0.25
37785,,,,,
1441693,0.286,0.625,0.625,0.286,0.75
1469443,0.4,0.2,0.2,0.6,0.6
1482541,0.778,0.333,0.333,0.778,0.222
3545,0.833,0.385,0.308,0.583,0.25
16160,0.429,0.429,0.357,0.429,0.429
1477246,0.111,0.444,0.667,0.444,0.444


In [7]:
# leave-one-out score per company and cfp: 4 fPCs + random forest
dimession_reducation_model(data, mode='fpca', n_comps=4, model='rf', title=True)

Unnamed: 0_level_0,CurrentRatio,NetMargin,OperatingMargin,ROA1,ROE1
Company:,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1159275,0.364,0.0,0.0,0.6,0.8
835011,0.3,0.4,0.3,0.8,0.7
1575858,0.75,0.25,0.25,0.5,0.75
37785,,,,,
1441693,0.429,0.625,0.375,0.429,0.625
1469443,0.0,0.4,0.4,0.8,0.8
1482541,0.667,0.444,0.444,0.556,0.222
3545,0.833,0.615,0.385,0.333,0.333
16160,0.571,0.571,0.5,0.571,0.571
1477246,0.333,0.222,0.667,0.222,0.222


# Summary Results

## 4 fPCs and LR

In [28]:
# Min / quartiles / max of the per-company scores for 4 fPCs + logistic regression
fpca_4_lr = dimession_reducation_model(data, mode='fpca', n_comps=4, model='lr', title=False)
quartile_labels = ['Min', 'Quartile 1', 'Median', 'Quartile 3', 'Max']
columns = pd.MultiIndex.from_product([['avg_score'], quartile_labels],
                                     names=["Evaluation", "Quartile"])
index = pd.Index(CFP, name="CFP")
df = pd.DataFrame(index=index, columns=columns)
ind = 'avg_score'
for cfp in CFP:
    # falsy cells (None = nothing could be scored) are left out; the rest are numeric strings
    scores = [float(cell) for cell in fpca_4_lr[cfp].tolist() if cell]
    df.loc[cfp, ind] = [round(q, 3) for q in np.percentile(scores, [0, 25, 50, 75, 100])]
df

Evaluation,avg_score,avg_score,avg_score,avg_score,avg_score
Quartile,Min,Quartile 1,Median,Quartile 3,Max
CFP,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
CurrentRatio,0.0,0.306,0.545,0.621,1.0
NetMargin,0.0,0.146,0.308,0.458,0.75
OperatingMargin,0.0,0.114,0.333,0.42,0.9
ROA1,0.0,0.355,0.444,0.634,0.9
ROE1,0.0,0.326,0.444,0.6,1.0


## 4 ICs and LR

In [35]:
# Min / quartiles / max of the per-company scores for 4 ICs + logistic regression
ica_4_lr = dimession_reducation_model(data, mode='ica', n_comps=4, model='lr', title=False)
quartile_labels = ['Min', 'Quartile 1', 'Median', 'Quartile 3', 'Max']
columns = pd.MultiIndex.from_product([['avg_score'], quartile_labels],
                                     names=["Evaluation", "Quartile"])
index = pd.Index(CFP, name="CFP")
df = pd.DataFrame(index=index, columns=columns)
ind = 'avg_score'
for cfp in CFP:
    # falsy cells (None = nothing could be scored) are left out; the rest are numeric strings
    scores = [float(cell) for cell in ica_4_lr[cfp].tolist() if cell]
    df.loc[cfp, ind] = [round(q, 3) for q in np.percentile(scores, [0, 25, 50, 75, 100])]
df

Evaluation,avg_score,avg_score,avg_score,avg_score,avg_score
Quartile,Min,Quartile 1,Median,Quartile 3,Max
CFP,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
CurrentRatio,0.0,0.35,0.556,0.657,1.0
NetMargin,0.0,0.094,0.4,0.634,0.8
OperatingMargin,0.0,0.091,0.4,0.514,0.6
ROA1,0.0,0.274,0.5,0.619,1.0
ROE1,0.0,0.0,0.438,0.634,1.0


## 4 fPCs and SVM

In [65]:
# Min / quartiles / max of the per-company score and support-vector share for 4 fPCs + SVM
fpca_4_svm = dimession_reducation_model(data, mode='fpca', n_comps=4, model='svm', title=False)
quartile_labels = ['Min', 'Quartile 1', 'Median', 'Quartile 3', 'Max']
columns = pd.MultiIndex.from_product([['avg_score', 'proportion'], quartile_labels],
                                     names=["Evaluation", "Quartile"])
index = pd.Index(CFP, name="CFP")
df = pd.DataFrame(index=index, columns=columns)
for ind in ('avg_score', 'proportion'):
    for cfp in CFP:
        # falsy cells (None = nothing could be scored) are left out; the rest are numeric strings
        cells = [float(cell) for cell in fpca_4_svm[cfp][ind].tolist() if cell]
        df.loc[cfp, ind] = [round(q, 3) for q in np.percentile(cells, [0, 25, 50, 75, 100])]
df

Evaluation,avg_score,avg_score,avg_score,avg_score,avg_score,proportion,proportion,proportion,proportion,proportion
Quartile,Min,Quartile 1,Median,Quartile 3,Max,Min,Quartile 1,Median,Quartile 3,Max
CFP,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
CurrentRatio,0.0,0.267,0.429,0.686,1.0,0.929,0.968,1.0,1.0,1.0
NetMargin,0.0,0.106,0.222,0.472,0.75,0.946,0.998,1.0,1.0,1.0
OperatingMargin,0.0,0.059,0.222,0.402,0.8,0.95,1.0,1.0,1.0,1.0
ROA1,0.0,0.166,0.333,0.45,0.889,0.95,1.0,1.0,1.0,1.0
ROE1,0.0,0.111,0.4,0.528,1.0,0.893,0.996,1.0,1.0,1.0


## 4 ICs and SVM

In [38]:
# Min / quartiles / max of the per-company score and support-vector share for 4 ICs + SVM
ica_4_svm = dimession_reducation_model(data, mode='ica', n_comps=4, model='svm', title=False)
quartile_labels = ['Min', 'Quartile 1', 'Median', 'Quartile 3', 'Max']
columns = pd.MultiIndex.from_product([['avg_score', 'proportion'], quartile_labels],
                                     names=["Evaluation", "Quartile"])
index = pd.Index(CFP, name="CFP")
df = pd.DataFrame(index=index, columns=columns)
for ind in ('avg_score', 'proportion'):
    for cfp in CFP:
        # falsy cells (None = nothing could be scored) are left out; the rest are numeric strings
        cells = [float(cell) for cell in ica_4_svm[cfp][ind].tolist() if cell]
        df.loc[cfp, ind] = [round(q, 3) for q in np.percentile(cells, [0, 25, 50, 75, 100])]
df

Evaluation,avg_score,avg_score,avg_score,avg_score,avg_score,proportion,proportion,proportion,proportion,proportion
Quartile,Min,Quartile 1,Median,Quartile 3,Max,Min,Quartile 1,Median,Quartile 3,Max
CFP,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
CurrentRatio,0.0,0.211,0.562,0.686,1.0,0.411,0.872,0.9,0.944,1.0
NetMargin,0.0,0.0,0.444,0.684,0.75,0.625,0.838,0.933,0.973,1.0
OperatingMargin,0.0,0.03,0.444,0.579,0.625,0.833,0.895,0.93,0.959,1.0
ROA1,0.0,0.0,0.5,0.6,1.0,0.7,0.869,0.911,0.97,1.0
ROE1,0.0,0.0,0.455,0.667,1.0,0.511,0.835,0.954,0.976,1.0


## 4 ICs and RF

In [39]:
# Min / quartiles / max of the per-company scores for 4 ICs + random forest
ica_4_rf = dimession_reducation_model(data, mode='ica', n_comps=4, model='rf', title=False)
quartile_labels = ['Min', 'Quartile 1', 'Median', 'Quartile 3', 'Max']
columns = pd.MultiIndex.from_product([['avg_score'], quartile_labels],
                                     names=["Evaluation", "Quartile"])
index = pd.Index(CFP, name="CFP")
df = pd.DataFrame(index=index, columns=columns)
ind = 'avg_score'
for cfp in CFP:
    # falsy cells (None = nothing could be scored) are left out; the rest are numeric strings
    scores = [float(cell) for cell in ica_4_rf[cfp].tolist() if cell]
    df.loc[cfp, ind] = [round(q, 3) for q in np.percentile(scores, [0, 25, 50, 75, 100])]
df

Evaluation,avg_score,avg_score,avg_score,avg_score,avg_score
Quartile,Min,Quartile 1,Median,Quartile 3,Max
CFP,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
CurrentRatio,0.1,0.268,0.429,0.684,0.889
NetMargin,0.0,0.292,0.4,0.472,0.647
OperatingMargin,0.0,0.225,0.412,0.646,0.889
ROA1,0.222,0.4,0.444,0.591,0.778
ROE1,0.0,0.25,0.444,0.6,0.778


## 4 fPCs and RF

In [40]:
# Min / quartiles / max of the per-company scores for 4 fPCs + random forest
fpca_4_rf = dimession_reducation_model(data, mode='fpca', n_comps=4, model='rf', title=False)
quartile_labels = ['Min', 'Quartile 1', 'Median', 'Quartile 3', 'Max']
columns = pd.MultiIndex.from_product([['avg_score'], quartile_labels],
                                     names=["Evaluation", "Quartile"])
index = pd.Index(CFP, name="CFP")
df = pd.DataFrame(index=index, columns=columns)
ind = 'avg_score'
for cfp in CFP:
    # falsy cells (None = nothing could be scored) are left out; the rest are numeric strings
    scores = [float(cell) for cell in fpca_4_rf[cfp].tolist() if cell]
    df.loc[cfp, ind] = [round(q, 3) for q in np.percentile(scores, [0, 25, 50, 75, 100])]
df

Evaluation,avg_score,avg_score,avg_score,avg_score,avg_score
Quartile,Min,Quartile 1,Median,Quartile 3,Max
CFP,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
CurrentRatio,0.0,0.316,0.429,0.667,0.833
NetMargin,0.0,0.292,0.412,0.522,0.625
OperatingMargin,0.0,0.3,0.385,0.528,0.889
ROA1,0.111,0.316,0.429,0.585,0.8
ROE1,0.0,0.222,0.438,0.598,0.8


## Final Table to Present

In [66]:
# Mean per-company score for every (model, reduction) combination.
# NOTE: the column labelled 'pca' holds the FPCA results.
columns = pd.MultiIndex.from_product([['lr', 'svm', 'rf'], ['ica', 'pca']],
                                            names = ["Model", "Reducation"])
index = pd.Index(CFP, name = "CFP")
df = pd.DataFrame(columns = columns, index = index)

# explicit lookup of the result tables built in the cells above, instead of resolving
# variable names through locals(); the key order matches the column order of df
score_tables = {
    ('lr', 'ica'): ica_4_lr,
    ('lr', 'fpca'): fpca_4_lr,
    ('svm', 'ica'): ica_4_svm,
    ('svm', 'fpca'): fpca_4_svm,
    ('rf', 'ica'): ica_4_rf,
    ('rf', 'fpca'): fpca_4_rf,
}

for cfp in CFP:
    result = []
    for model in ['lr', 'svm', 'rf']:
        for method in ['ica', 'fpca']:
            table = score_tables[(model, method)]
            # the svm tables carry three sub-columns per cfp, only the score is averaged
            cells = table[cfp]['avg_score'] if model == 'svm' else table[cfp]
            # falsy cells (None = nothing could be scored) are left out
            avg = [float(cell) for cell in cells.tolist() if cell]
            result.append(np.average(avg))
    df.loc[cfp,] = result
df

Model,lr,lr,svm,svm,rf,rf
Reducation,ica,pca,ica,pca,ica,pca
CFP,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
CurrentRatio,0.526,0.472684,0.502368,0.468,0.465579,0.428158
NetMargin,0.381895,0.311947,0.377,0.293368,0.373737,0.388105
OperatingMargin,0.331895,0.310316,0.332368,0.275263,0.427842,0.430053
ROA1,0.467421,0.465474,0.402053,0.353526,0.490895,0.448737
ROE1,0.372737,0.470105,0.355368,0.397211,0.423842,0.421211
