# The logic flow

1. select one company
2. find the corresponding state temp data
3. use a dimensionality reduction method on the temperature data
4. add a random term
5. fit the model $\mathrm{CFP} \sim \mathrm{temp} + \text{random term}$

Q1: how to set a random term?

Q2: how to structure the output?

In [456]:
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import kendalltau

import matplotlib.pyplot as plt

import numpy as np
import os
import pandas as pd
import json
import re, datetime
import calendar
from geopy.geocoders import Nominatim

from sklearn.decomposition import FastICA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [551]:
CFP = ['CurrentRatio', 'NetMargin', 'OperatingMargin', 'ROA1', 'ROE1']

def read_cfp():
    """
    Load the per-company CFP ratios and the state of each company.

    Everything is read from ``<parent of the current working directory>/data``:
      - ``us_state.json``: lookup table applied to each company's ``location``
      - ``SEC_FIN_data/agriculture_individual_with_ratios/*.csv``: one file per
        company; the file name without its ``.csv`` extension is the company key

    output = cfp_data of agriculture, state_data
      - cfp_data[cik][cfp]: one-row DataFrame (index = [cfp], columns = the
        ``period.end`` values of that company) with the NaN periods removed
      - state_data[cik]: the ``us_state.json`` entry for the first ``location``
        value found in the company's file

    Raises KeyError when a ``location`` value is missing from ``us_state.json``
    or when a CSV lacks one of the expected columns.
    """
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    data_path = os.path.join(root_path, 'data')

    # load us state data
    with open(os.path.join(data_path, 'us_state.json'), 'r') as f:
        us_state_dict = json.load(f)

    # load cfp data
    cfp_path = os.path.join(data_path, 'SEC_FIN_data/agriculture_individual_with_ratios')
    data = {}
    for file in os.listdir(cfp_path):
        if file.endswith('.csv'):
            # splitext removes only the extension; str.strip('.csv') would also
            # eat any leading/trailing '.', 'c', 's' or 'v' of the name itself
            cik = os.path.splitext(file)[0]
            with open(os.path.join(cfp_path, file), 'r') as f:
                data[cik] = pd.read_csv(f)

    cfp_data = {}
    state_data = {}

    for cik, frame in data.items():
        cfp_data[cik] = {}
        state = frame['location'].values[0]
        state_data[cik] = us_state_dict[state]
        for cfp in CFP:
            # one row named after the ratio, one column per reporting period
            df = pd.DataFrame(columns = frame['period.end'], index = [cfp])
            df.loc[cfp] = list(frame[cfp])
            # drop the periods for which the ratio is NaN
            cfp_data[cik][cfp] = df.dropna(axis = 1)

    return cfp_data, state_data

def read_climate(climate:str):
    """
    Read one of the climate CSV files stored under
    ``<parent of the current working directory>/data/climate``.

    :climate = 'bushfire' or 'temp'
        'bushfire' -> ``Wildfire_data.csv``, read with the default index
        'temp'     -> ``US_temperature_data/Monthly_state_ave_Temp(1980-2020).csv``,
                      read with its first column as the index
    output = the file content as a pandas DataFrame
    """
    assert climate in ('bushfire', 'temp'), "plz type 'bushfire' or 'temp'"

    project_root = os.path.abspath(os.path.dirname(os.getcwd()))
    climate_dir = os.path.join(project_root, 'data', 'climate')

    if climate == 'temp':
        csv_path = os.path.join(
            climate_dir,
            'US_temperature_data',
            'Monthly_state_ave_Temp(1980-2020).csv',
        )
        # the first column holds the row labels
        return pd.read_csv(csv_path, index_col=0)

    return pd.read_csv(os.path.join(climate_dir, 'Wildfire_data.csv'))

def _window_temps(state_temp, month_ends, start, end):
    """Return the monthly temperatures whose month-end date lies in [start, end)."""
    values = []
    for month_end in month_ends:
        if start <= month_end < end:
            # the temperature index is labelled 'year/month' without zero padding
            timestamp = str(month_end.year) + '/' + str(month_end.month)
            values.append(state_temp.loc[timestamp].values[0])
    return values


def generate_temp_pairs(cfp_data, state_data, temp_data, cfp_diff:int, climate_diff:int):
    """
    Pair every CFP series of every company with the temperature of its state.

    :cfp_data = {company: {cfp name: one-row DataFrame}}; the column labels are
                period-end dates parsed as 'day/month/two-digit year'. The
                interval of a column runs from the next column's date up to its
                own date, so columns are expected newest first.
    :state_data = {company: state name}, the state name being a column of temp_data
    :temp_data = DataFrame indexed by 'year/month' labels, one column per state
    :cfp_diff = number of following columns averaged into the baseline that is
                subtracted from the current CFP value; 0 subtracts nothing
    :climate_diff = number of previous years whose same interval is averaged into
                the temperature baseline; 0 subtracts nothing
    output = a nested dict keyed by company. result[company]['state'] is the state;
                result[company][cfp name] is a dict with the five keys
                "cfp_data", "climate_data", "cfp_year", "climate_year", and "state",
                the "cfp_data" and "climate_data" will feed to correlation analysis

    A company whose state is not a column of temp_data is reported and keeps only
    its 'state' entry.
    """
    result = {}

    # shape data as year, cfp, temp
    for comp in cfp_data.keys():
        # init the values
        result[comp] = {}
        state = state_data[comp]
        result[comp]['state'] = state
        try:
            state_temp = temp_data.loc[:, [state]] # get state temp data
        except KeyError:
            # Skip the company: carrying on would reuse the previous company's
            # temperatures (or fail on an undefined name for the first one).
            print("can't find the corresponding state in temperature dataset.")
            continue

        # temp timeline, built once per company because it only depends on the state
        state_temp_time = []
        for i in state_temp.index.values:
            i = i.split('/')
            # get the last day of that month
            state_temp_time.append(datetime.date(int(i[0]), int(i[1]), calendar.monthrange(int(i[0]), int(i[1]))[1]))

        for c in CFP:
            result[comp][c] = {}
            cfp = []
            cfp_time = []
            temp_time = []
            cfp_timestamp = []
            # get cfp timeline
            for i in cfp_data[comp][c].columns:
                try:
                    i = i.split('/')
                    cfp_time.append(datetime.date(int('20' + i[-1]), int(i[1]), int(i[0])))
                except (AttributeError, IndexError, ValueError):
                    # NOTE(review): an unparsable label is skipped, which shifts
                    # cfp_time against the column positions used further down.
                    pass

            if not cfp_time:
                print(comp, c, cfp_time)

            # get the temp avg
            temp_curr_avg = []
            temp_past_avg = []
            cfp_index = []
            # for each interval in the cfp series, find the corresponding temp difference
            # = the current avg. temp - the avg. temp over past n years in the same interval
            for time in range(len(cfp_time) - 1): # the last element has no older neighbour
                now = cfp_time[time]
                past = cfp_time[time + 1]
                now_year, now_month, now_day = now.year, now.month, now.day
                past_year, past_month, past_day = past.year, past.month, past.day

                # get temp avg. of current year
                current = _window_temps(state_temp, state_temp_time, past, now)
                if not current: # can't find corresponding data points
                    continue
                temp_curr_avg.append(sum(current)/len(current))

                # get temp avg. of the past n years
                previous = []
                for n in range(1, climate_diff + 1):
                    now_year -= 1
                    past_year -= 1
                    now = datetime.date(now_year, now_month, now_day)
                    past = datetime.date(past_year, past_month, past_day)
                    previous.extend(_window_temps(state_temp, state_temp_time, past, now))
                if climate_diff != 0:
                    temp_past_avg.append(sum(previous)/len(previous))
                else:
                    temp_past_avg.append(0)
                cfp_index.append(time)
                temp_time.append(cfp_time[time])

            columns = list(cfp_data[comp][c].columns)
            # get cfp data, only for the intervals that received a temperature value
            for i in cfp_index:
                if i + cfp_diff >= len(columns):
                    # i = current cfp data point
                    # i + cfp_diff = the farthest cfp data point
                    # if it is out of range, stop computing the cfp moving avg.,
                    # otherwise the array can provide sufficient data points
                    break
                try:
                    cfp_curr = float(cfp_data[comp][c][columns[i]].values[0])
                except (TypeError, ValueError, IndexError):
                    cfp_curr = 0
                cfp_past = 0
                for n in range(1, cfp_diff + 1):
                    cfp_past += float(cfp_data[comp][c][columns[i+n]].values[0])
                if cfp_diff != 0:
                    cfp_avg = cfp_past/cfp_diff
                else:
                    cfp_avg = 0
                # get the cfp difference
                cfp.append(cfp_curr - cfp_avg)
                cfp_timestamp.append(columns[i])

            # get temp difference
            temp = [a - b for a, b in zip(temp_curr_avg, temp_past_avg)]

            result[comp][c]["cfp_data"] = cfp
            result[comp][c]["climate_data"] = temp
            result[comp][c]["cfp_year"] = cfp_timestamp
            result[comp][c]["climate_year"] = temp_time
            result[comp][c]['state'] = state
    return result

def process_temp_pairs():
    """
    Planning stub: the body holds no statements, so a call returns None.

    Intended steps:
    1. reshape the temperature data into a PCA/ICA-friendly numpy array;
    2. apply PCA/ICA to it and keep the transformed temperature components.

    One company = one state = one state temperature data set = one PCA/ICA result.
    Different CFPs may cover different time ranges, so each CFP gets its own
    transformed PCA/ICA output.

    Logic flow:
    1. pick one company;
    2. find its state and that state's temperature data set;
    3. reshape that data set into the PCA/ICA-friendly numpy array form.
    """

def shape_temp_data(pairs, temp_data):
    """
    Shape the temperature data into PCA/ICA-friendly numpy arrays.

    :pairs = output of generate_temp_pairs: pairs[cik]['state'] plus, per CFP,
             the 'climate_year' dates and the 'cfp_data' values
    :temp_data = DataFrame indexed by 'year/month' labels, one column per state
    :results = {cik: {cfp: {'cfp_binary': list of 0/1,
                            'temp_data': array with one 12-month row per kept year},
                      'state_data': array with one 12-month row per yearly window}}

    Per CFP, the leading run of dates spaced 306 to 425 days apart is kept;
    cfp_binary holds 1 where the value is higher than the following one, else 0.
    The 12-month windows end on the month of the first kept date and step back one
    year at a time from the last matching column of the temperature data.

    results[cik][cfp] is None when no date is kept, and stays an empty dict when
    the number of temperature rows differs from the number of labels. A company
    whose state is not a column of temp_data keeps an empty dict. The company
    '0001756180' is always left out.
    """

    results = {}
    for cik in pairs.keys():
        if cik == '0001756180':
            continue
        results[cik] = {}
        state = pairs[cik]['state']
        if state not in temp_data.columns:
            print(pairs[cik], state, 'doesnot in the temp data.')
            continue

        # shaped state temp with column = time; only depends on the state
        state_temp_data = pd.DataFrame.transpose(pd.DataFrame(temp_data[state]))

        for cfp in CFP:
            results[cik][cfp] = {}
            climate_year = pairs[cik][cfp]['climate_year']
            cfp_data = pairs[cik][cfp]['cfp_data']
            temp_period = [] # temp period of the temp_data
            cfp_binary = [] # build cfp_binary data
            for i in range(0, len(climate_year) - 1):
                gap = climate_year[i] - climate_year[i+1]
                if not (datetime.timedelta(days=306) <= gap <= datetime.timedelta(days=425)):
                    break
                temp_period.append(climate_year[i]) # keep the time with the continuous timestamp
                cfp_binary.append(1 if cfp_data[i] - cfp_data[i+1] > 0 else 0)

            print(len(temp_period), len(cfp_binary))
            print(temp_period)

            if not temp_period:
                print(cik, cfp, 'does not have enough data points.')
                results[cik][cfp] = None
                continue
            month = temp_period[0].month
            start_datetime = str(temp_period[0].year) + '/' + str(month) # 'year/month' format
            end_datetime = str(temp_period[-1].year) + '/' + str(month)
            print('time range', cik, cfp, start_datetime, end_datetime)

            # Anchor the pattern at the end of the label: an unanchored '/1' also
            # matches '/10', '/11' and '/12' and would pick the wrong month.
            month_columns = state_temp_data.filter(regex = '/' + str(month) + '$').columns
            # get the index of the most recent column for that month
            start_point = state_temp_data.columns.get_loc(month_columns[-1])

            state_rows = [] # every yearly window of the state
            temp_rows = [] # the windows between start_datetime and end_datetime
            in_range = False
            i = 0
            cnt = start_point
            while cnt >= 12:
                # get one year of column names, to avoid 2020.6 - 2019.6 situation
                period = state_temp_data.columns[cnt - 11: cnt + 1]
                print(cnt - 11, cnt + 1, cnt, i)
                # get the temp data of that year
                period_data = state_temp_data[period].loc[state].to_numpy()
                state_rows.append(period_data)
                if start_datetime in period: # build the temp_data
                    in_range = True
                if in_range:
                    temp_rows.append(period_data)
                if end_datetime in period:
                    in_range = False
                i = i + 1
                cnt = start_point - 12 * i

            # reshape keeps the (0, 12) shape when no window was collected
            shaped_state_temp_data = np.array(state_rows, dtype=float).reshape(-1, 12)
            temp_value = np.array(temp_rows, dtype=float).reshape(-1, 12)

            if len(cfp_binary) == len(temp_value):
                results[cik][cfp]['cfp_binary'] = cfp_binary
                results[cik][cfp]['temp_data'] = temp_value
                results[cik]['state_data'] = shaped_state_temp_data
            else:
                print('problem')
                print(cik, cfp, len(temp_value), len(cfp_binary))

    return results
          
def ica_svm(data):
    """
    Fit a 2-component FastICA per company and score an SVC on each CFP label series.

    :data = output of shape_temp_data: data[cik]['state_data'] is the array the ICA
            is fitted on; data[cik][cfp] holds 'temp_data' (rows to transform) and
            'cfp_binary' (0/1 labels)
    output = DataFrame indexed by company ("Company:") with one column per CFP.
             Each cell is the average SVC accuracy over the leave-one-out fits, or
             None when no fit succeeded or the CFP entry has no usable data.
             Companies whose ICA fit fails are reported and keep empty cells.
    """
    columns = CFP
    index = pd.Index(data.keys(), name = "Company:")
    df = pd.DataFrame(columns = columns, index = index)
    for cik in data.keys():
        ICA_transformer = FastICA(n_components = 2, random_state = 0)
        try:
            X_ICA_transformed = ICA_transformer.fit(data[cik]['state_data'])
        except Exception as err:
            # best effort per company: report the reason instead of skipping silently
            print(cik, 'ICA fit skipped:', err)
            continue
        for cfp in CFP:
            entry = data[cik].get(cfp)
            # shape_temp_data leaves None or an empty dict when a CFP has no usable data
            if not entry or 'temp_data' not in entry or 'cfp_binary' not in entry:
                df.loc[cik, cfp] = None
                continue
            X_ICA = X_ICA_transformed.transform(entry['temp_data'])
            Y = entry['cfp_binary']
            score = 0
            cnt = 0
            for i in range(0, len(Y)):
                # np.delete returns a copy with sample i left out
                X_LOO = np.delete(X_ICA, i, axis = 0)
                Y_LOO = np.delete(Y, i)
                clf = SVC(gamma='auto')
                try:
                    clf.fit(X_LOO, Y_LOO)
                except ValueError:
                    # presumably an empty fold or a single class left; skip this fold
                    continue
                # NOTE(review): the score is taken on the samples the classifier was
                # trained on, not on the held-out sample i - confirm this is intended.
                score += clf.score(X_LOO, Y_LOO)
                cnt += 1
            if cnt:
                df.loc[cik, cfp] = score/cnt
            else:
                df.loc[cik, cfp] = None

    return df


In [552]:
# Score every company/CFP pair. `data` must already hold the result of
# shape_temp_data(), which is assigned in a separate cell of this notebook.
output = ica_svm(data)
output




Unnamed: 0_level_0,CurrentRatio,NetMargin,OperatingMargin,ROA1,ROE1
Company:,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1159275,0.636364,1.0,1.0,0.6,0.7
835011,0.666667,0.652778,0.611111,0.666667,0.777778
1575858,0.666667,0.666667,0.666667,0.666667,0.666667
37785,0.709091,0.6,0.555556,0.672727,0.709091
1441693,0.690476,0.75,0.625,0.690476,0.75
1469443,0.75,0.75,0.75,0.8,0.8
1482541,0.75,0.611111,0.611111,0.680556,0.666667
3545,0.583333,0.666667,0.615385,0.583333,0.545455
16160,,,,,
1477246,,,,,


In [530]:
# Build the inputs of ica_svm(): CFP series and states -> state temperatures
# -> paired series -> 12-month arrays.
cfp_data, state_data = read_cfp()
temp_data = read_climate('temp')
# cfp_diff = 0 and climate_diff = 0: no baseline is subtracted on either side
pairs = generate_temp_pairs(cfp_data, state_data, temp_data, 0, 0)
data = shape_temp_data(pairs, temp_data)

11 11
[datetime.date(2019, 12, 31), datetime.date(2018, 12, 31), datetime.date(2017, 12, 31), datetime.date(2016, 12, 31), datetime.date(2015, 12, 31), datetime.date(2014, 12, 31), datetime.date(2013, 12, 31), datetime.date(2012, 12, 31), datetime.date(2011, 12, 31), datetime.date(2010, 12, 31), datetime.date(2009, 12, 31)]
time range 0001159275 CurrentRatio 2019/12 2009/12
480 492 491 0
468 480 479 1
456 468 467 2
444 456 455 3
432 444 443 4
420 432 431 5
408 420 419 6
396 408 407 7
384 396 395 8
372 384 383 9
360 372 371 10
348 360 359 11
336 348 347 12
324 336 335 13
312 324 323 14
300 312 311 15
288 300 299 16
276 288 287 17
264 276 275 18
252 264 263 19
240 252 251 20
228 240 239 21
216 228 227 22
204 216 215 23
192 204 203 24
180 192 191 25
168 180 179 26
156 168 167 27
144 156 155 28
132 144 143 29
120 132 131 30
108 120 119 31
96 108 107 32
84 96 95 33
72 84 83 34
60 72 71 35
48 60 59 36
36 48 47 37
24 36 35 38
12 24 23 39
3 3
[datetime.date(2019, 12, 31), datetime.date(2018, 1

In [228]:
# List the company keys held in `pairs`.
pairs.keys()

dict_keys(['0001159275', '0000835011', '0001575858', '0001756180', '0000037785', '0001441693', '0001469443', '0001482541', '0000003545', '0000016160', '0001477246', '0001705843', '0001425292', '0001548240', '0001592016', '0001121702', '0000825542', '0000277638', '0000005981', '0001302946', '0001342423', '0001285785', '0001133470'])

In [65]:
# Keep the rows of `t` whose index label starts with '1980/' and flatten the
# first value of each row into a 1-D array.
mask_1980 = t.index.str.startswith('1980/', na=False)
t1 = np.array([row[0] for row in t.loc[mask_1980].values.tolist()])
t1

array([ 8.452,  6.275, 11.83 , 16.182, 21.644, 25.623, 28.518, 27.945,
       25.961, 16.036, 11.487,  7.489])