In [2]:
#!/home/admin/anaconda3/bin/python3
#   author:martinmhan@yahoo.com date:  17/06/2020
#   Copyright (C) <2020>  <Martin Mohan>
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software Foundation,
#   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
from datetime import datetime
import pandas as pd
import numpy as np
import argparse,sys,re
import matplotlib.pyplot as plt
import seaborn; seaborn.set()
import mmutils
import myfit
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


class Treat2():
    """ 
    This class will modify the input file depending on the user selection.
    After modification name of the output file is modifed to reflect the change.
    e.g. if outlier removal applied ifile.csv -> ifile_out.csv
    Only one method can be called at time.
    The TCE file is modified the same way as the TK file

    """
    def __init__(self,TK):
        """
        The input file is saved

        """
        self.TK=TK
        self.TCE1=TK.replace("TK","TCE1")
        
    def bin(self):
        ''' Merge FP and CANDIDATE - "b" added to start of all files. '''
        bTK=self.TK.replace("data/", "data/b")
        df = pd.read_csv(self.TK,comment= '#')
        df["koi_disposition"] = df["koi_disposition"].replace(to_replace="CANDIDATE",value="FALSE POSITIVE")
        df.to_csv(bTK,index=False)
        
        # Just rename TCE1 it has no koi_disposition
        bTCE1=self.TCE1.replace("data/","data/b")
        df = pd.read_csv(self.TCE1,comment= '#') 
        df.to_csv(bTCE1,index=False)
        return bTK,bTCE1
    
    def get_vif(self,exogs, data):
        ''' 
        This is here for reference (removing vif is done iteratively one at a time.
        From https://stackoverflow.com/questions/42658379/variance-inflation-factor-in-python
        Return VIF (variance inflation factor) DataFrame

        Args:
        exogs (list): list of exogenous/independent variables
        data (DataFrame): the df storing all variables

        Returns:
        VIF and Tolerance DataFrame for each exogenous variable

        Notes:
        Assume we have a list of exogenous variable [X1, X2, X3, X4].
        To calculate the VIF and Tolerance for each variable, we regress
        each of them against other exogenous variables. For instance, the
        regression model for X3 is defined as:
                        X3 ~ X1 + X2 + X4
        And then we extract the R-squared from the model to calculate:
                    VIF = 1 / (1 - R-squared)
                    Tolerance = 1 - R-squared
        The cutoff to detect multicollinearity:
                    VIF > 10 or Tolerance < 0.1
        '''

        # initialize dictionaries
        vif_dict, tolerance_dict = {}, {}
        # create formula for each exogenous variable
        for exog in exogs:
            not_exog = [i for i in exogs if i != exog]
            formula = f"{exog} ~ {' + '.join(not_exog)}"
            # extract r-squared from the fit
            r_squared = smf.ols(formula, data=data).fit().rsquared

            # calculate VIF
            vif = 1/(1 - r_squared)
            vif_dict[exog] = vif

            # calculate tolerance
            tolerance = 1 - r_squared
            tolerance_dict[exog] = tolerance
        # return VIF DataFrame
        df_vif = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict})
        return df_vif 
    
    def cap(self,iv):
        ''' Cap all outliers Interquartile range IQR*1.5 '''        
        outlierConstant=3
        a = np.array(iv)
        upper_quartile = np.percentile(a, 75)
        lower_quartile = np.percentile(a, 25)

        IQR = (upper_quartile - lower_quartile) * outlierConstant
        quartileSet = (lower_quartile - IQR, upper_quartile + IQR)
        resultList = []
        for y in a.tolist():
#            if y >= quartileSet[0] and y <= quartileSet[1]:
#                resultList.append(y)
# Cap outliers
            if y <= quartileSet[0]:
                resultList.append(quartileSet[0])
            elif y >= quartileSet[1]:
                resultList.append(quartileSet[1])
            else: 
                resultList.append(y)
        noCap=a
        Cap=resultList
        return Cap
        fig,axs = plt.subplots(1, 2)
        for ax in axs.flat:
            ax.set_ylim(0,8500)
        # ax.set_xlim(0.3,1)
        bins=20
#        fig.suptitle(f"{iv.name} qtiles {round(upper_quartile,1)} to {round(lower_quartile,1)} ")
#        fig.suptitle(f"qtiles {round(upper_quartile,1)} to {round(lower_quartile,1)} ")
        fig.suptitle(f"{iv.name} IQR Cap: ({round(upper_quartile,1)}) - ({round(lower_quartile,1)}) * {outlierConstant} ") 
        axs[0].hist(noCap, bins=bins,color='orange')
        axs[0].set_title(f'Before CAP')
        axs[0].set_ylabel(f'Number')
        axs[0].set_xlabel(f'{iv.name}')
    
             
        axs[1].hist(Cap, bins=bins,color='green')
        axs[1].set_title(f'After CAP')
        axs[1].set_xlabel(f'{iv.name}')
        return resultList
    
    def pca(self):
        df=pd.read_csv(argv.TK,comment= '#')
        #.to_numpy()
        dfnew = df[['kepid','koi_disposition','kepoi_name']].copy()     
        X=df.drop(['kepid','koi_disposition','kepoi_name'], axis=1)
        pca=PCA(n_components=0.95)
        dfnew['pca']=pca.fit_transform(X)

        ofile=self.TK.replace(".csv", "_pca.csv")       
        dfnew.to_csv(ofile,index=False)
        print(f"{self.TK} to {ofile} with col 'pca' var ={pca.explained_variance_ratio_}")
        
#    def pca_TCE1(self):
#        df=pd.read_csv(argv.TCE1,comment= '#')
#        #.to_numpy()
#        dfnew = df[['kepid']].copy()     
#        X=df.drop(['kepid'], axis=1)
#        pca=PCA(n_components=0.95)
#        dfnew['pca']=pca.fit_transform(X)

#        ofile=self.TK.replace(".csv", "_pca.csv")       
#        dfnew.to_csv(ofile,index=False)
#        print(f"{self.TK} to {ofile} with col 'pca' var ={pca.explained_variance_ratio_}")

#    def pca(self):
#        self.pca_TK()
#        self.pca_TCE1()
        
    def vif(self):
        """ A list of highly correlate IV's (VIF>10) removed iteratively
        usign get_vif. """

        cols=['tce_period','tce_eqt_sn','tcet_time0bk','boot_messtd','tcet_time0',\
                   'tce_ldm_coeff3','tce_dof1','tce_time0bk_sn','tce_max_mult_ev',\
                   'tce_time0','tce_smet_prov','tce_ldm_coeff2',\
                   'tce_rb_tcount0','tce_maxmes','tce_sma','tce_fwm_sra_sn',\
                   'tcet_duration','tce_chisqgofdof','tce_robstat','wst_robstat',\
                   'tce_period_sn','tce_smet','tce_depth','tce_dicco_mdec_sn']
        
        df=pd.read_csv(argv.TK,comment= '#') 
        df.drop(cols, axis=1,inplace=True)
        fname=argv.TK.replace(".csv","_vif.csv")   
        df.to_csv(fname)
        print(f"Created {fname}")
        
#        df=pd.read_csv(argv.TCE1,comment= '#') 
#        df.drop(cols, axis=1,inplace=True)
#        fname=argv.TCE1.replace(".csv","_vif.csv")   
#        df.to_csv(fname)
#        print(f"Created {fname}")
    
if __name__ == '__main__':
    """ The _main__ models is used for testing"""

    parser = argparse.ArgumentParser(description=" Read files data/TK.csv and data/TCE1.csv created by Treat1.py. Create new file depending on options e.g --b bTK.csv, bTCE.csv --out create TK_out.csv,TCE1_out.csv ",formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    group = parser.add_mutually_exclusive_group()
    
    group.add_argument("--bin", action="store_true",
            help="koi_disposition (DV): Create CONFIRMED vs REST file binary(bin) add TK=>bTK (i.e merge CANDIDATE with FALSE POSITIVE)")

    group.add_argument("--cap", action="store_true",
            help="Cap Outliers 5qtile and 95qtile. NB TCE1 is also be modified to TCE1_cap:")
    #argv=parser.parse_args()

    class Args:
        cap = True
        bin = False
        pca = False
        npca= True
        vif = False
        ifile="data/TK.csv"
        TK="data/TK.csv"
        TCE1="data/TCE1.csv"
        model="GB"
    argv=Args()

    df = pd.read_csv(argv.TK,comment= '#')

    if(argv.bin): # No effect on TCE1 which does not have koi_disposition
        print(f"Mulitclass in TK={TK}")
        bTK,bTCE1=mytreat.bin(TK)
        print(f"Binary class out bTK={bTK} bTK={bTCE1}")
    elif(argv.vif):
        mytreat=Treat2(argv.TK)
        mytreat.vif()
    elif(argv.npca):
        #mytreat=Treat2(argv.TK)
        #mytreat.pca()
        """ Normalized pca """
        cols=['kepid','tce_plnt_num','koi_disposition','kepoi_name']
        df=pd.read_csv(argv.ifile,comment= '#')
        dfsave=df[df.columns & cols].copy() # Save index cols for later
        
        X=df.drop(cols, axis=1,errors='ignore') # Get X values for pca
        S = StandardScaler().fit_transform(X)
        Xstd = pd.DataFrame.from_records(S)
       
        pca=PCA(n_components=0.95)
#        dfx=X.apply(pipe)
        dfsave['pca']=pca.fit_transform(X) #  Add pca'd cols

        ofile=argv.ifile.replace(".csv", "_npca.csv")
        dfsave.to_csv(ofile,index=False)
        print(f"{argv.ifile} to {ofile} with col 'pca' var ={pca.explained_variance_ratio_} {pca.singular_values_}")

        
    elif(argv.cap): # Problem tce_plnt_num returns 1 for all 
        pass
    else: 
        print("Supply one argument e.g --vif")


#unique = df.apply(pd.Series.nunique,dropna=False)
#Xnew=X.apply(mytreat.cap)   

#df_c = pd.concat([df.reset_index(drop=True), Xnew], axis=1)
    
# Function to iteratively find VIF. Removed if > 10        
#df_a=df[['kepid', 'kepoi_name','koi_disposition']].copy()
#df_c = pd.concat([df_a.reset_index(drop=True), X], axis=1)
        
#fname=argv.TK.replace(".csv","_vif.csv")               
#df_c.to_csv(fname,index=False)
        
#exogs=list(X.columns)
#pd.options.display.float_format = '{:.2f}'.format
#vif_smf=get_vif(exogs=exogs, data=df)
#vifs=vif_smf.sort_values(by=['VIF'])
        

data/TK.csv to data/TK_npca.csv with col 'pca' var =[0.99994871] [5.05165153e+12]
