# Data Preparation

### Settings/Functions
Read in settings and functions.

In [38]:
# Packages this notebook depends on.
libraries <- c('here', 'missForest', 'stringr', 'imputeMissings', 'regclass',
               'purrr', 'DescTools')

# require() only returns FALSE when a package cannot be attached, so collect
# the results and stop immediately rather than failing later with an
# unrelated "could not find function" error.
loaded <- vapply(
  libraries,
  function(pkg) suppressWarnings(require(pkg, character.only = TRUE)),
  logical(1)
)
if (!all(loaded)) {
  stop('Could not load required package(s): ',
       paste(libraries[!loaded], collapse = ', '), call. = FALSE)
}

# Read in the project settings and functions.
suppressWarnings(source(here::here('Stock Estimation', 'settings.R')))

### Data
Read in the final data set from the data preparation notebook.

In [39]:
# Build the input path from dir$final_data (defined outside this cell) and
# read the combined financial data set.
input_path <- paste0(dir$final_data, 'combined_financial.csv')
data <- fread(input_path)

## More Cleaning

### Duplicates

In [40]:
# Identify columns whose values exactly duplicate an earlier column.
# duplicated() compares the rows of a matrix, so the table is transposed first.
# Indexing names(data) directly always yields a character vector, regardless of
# how many columns match or which data-frame class `data` has. Taking names()
# of data[, idx] returns NULL when exactly one column of a plain data.frame
# matches, because the single column is dropped to an unnamed vector.
dups <- names(data)[duplicated(t(data))]
dups

In [41]:
# Drop the columns listed in `dups` (after verifying them), then confirm the
# resulting dimensions.
data <- dplyr::select(data, -c(dups))
dim(data)

In [42]:
# Count the missing values in every column; the names of the result double as
# a list of the variable names.
na <- vapply(data, function(column) sum(is.na(column)), integer(1))
max(na)
# NOTE: Printing the full vector is commented out because the output is long.
# print(na)
# Show only the 25 columns with the most missing values.
head(sort(na, decreasing = TRUE), n = 25)

In [43]:
# Merge and drop variables that appear twice under different names.
# Each pair is passed to Name_Changer() (defined outside this cell) as
# x = first name, y = second name, in the order listed below.
# To inspect a pair beforehand, use e.g. view(data[, c("Payout Ratio", "payoutRatio")]).
name_pairs <- list(
  c('Payout Ratio', 'payoutRatio'),
  c('Interest Coverage', 'interestCoverage'),
  c('Net Profit Margin', 'netProfitMargin'),
  c('PE ratio', 'priceEarningsRatio'),
  c('PFCF ratio', 'priceToFreeCashFlowsRatio'),
  c('POCF ratio', 'priceToOperatingCashFlowsRatio'),
  c('Price to Sales Ratio', 'priceToSalesRatio'),
  c('Days Payables Outstanding', 'daysOfPayablesOutstanding'),
  c('Free Cash Flow per Share', 'freeCashFlowPerShare'),
  c('ROE', 'returnOnEquity'),
  c('PTB ratio', 'priceToBookRatio'),
  c('PB ratio', 'priceBookValueRatio'),
  c('Operating Cash Flow per Share', 'operatingCashFlowPerShare'),
  c('Cash per Share', 'cashPerShare')
)

for (pair in name_pairs) {
  data <- Name_Changer(dat = data, x = pair[1], y = pair[2])
}

dim(data)

### Variable Names

In [44]:
# List the current variable names before they are cleaned up.
names(data)
# Make sure `data` is a data.table; setDT() converts by reference, so the
# result does not need to be reassigned.
setDT(data)

In [45]:
# Clean the column names: trim whitespace, lower-case everything, replace
# spaces with "_" and strip or replace punctuation so the names work better in
# models. The steps run in this exact order because later patterns act on the
# output of earlier ones.
# NOTE(review): stringr patterns are regular expressions, so "_._" matches any
# single character between two underscores (not only a literal dot), and
# "////" only matches four consecutive slashes. Confirm both are intended
# before changing them, since that would alter the resulting column names.
cleaned_names <- names(data) %>%
  str_trim(side = "both") %>%
  str_to_lower(locale = "en") %>%
  str_replace_all(" ", "_") %>%
  str_replace_all("-", "") %>%
  str_replace_all("&", ".") %>%
  str_replace_all("\\(", "") %>%
  str_replace_all("\\)", "") %>%
  str_replace_all("3y", "three_yr") %>%
  str_replace_all("5y", "five_yr") %>%
  str_replace_all("10y", "ten_yr") %>%
  str_replace_all("\\\\", "") %>%
  str_replace_all("////", "_") %>%
  str_replace_all(",", "") %>%
  str_replace_all("_._", "_") %>%
  str_replace_all("/", "_")
names(data) <- cleaned_names

# Give the abbreviated "eps" column a descriptive name.
setnames(data, 'eps', 'earnings_per_share')
names(data)

### Categorical Encoding

In [46]:
# Convert `sector` to a factor, then store its level codes as a numeric
# column `sector_num`.
data[, `:=`(sector = as.factor(sector))]
data[, `:=`(sector_num = as.numeric(sector))]

# Move the listed columns to the front so that `sector` sits next to
# `sector_num`; all remaining columns keep their order.
data <- setDT(
  dplyr::select(data, 'stock', 'nextyr_price_var', 'class', 'year', 'sector',
                'sector_num', everything())
)

### Missing Data

In [47]:
# Missing-value count for every column.
na <- vapply(data, function(column) sum(is.na(column)), integer(1))
# print(na)
max(na)
# sort(na, decreasing = TRUE)
# Show the 25 columns with the most missing values, then the distribution of
# the counts.
head(sort(na, decreasing = TRUE), n = 25)
summary(na)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0    1822    2342    3087    3307   10744 

In [48]:
# Number of rows that have no missing values at all.
sum(complete.cases(data))

# Count the missing values in each row and keep the count in a helper
# column `na` for the row filter that follows.
row_na <- rowSums(is.na(data))
data$na <- row_na
max(row_na)
head(sort(row_na, decreasing = TRUE), n = 20)
summary(row_na)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    2.00   10.00   27.41   23.00  190.00 

In [49]:
# 50 missing values per row was found to be a good cut-off for dropping rows.
# `drop` previews the rows that are about to be removed. It is the strict
# complement (na > 50) of the keep rule below; a `na >= 50` preview would also
# count rows with exactly 50 missing values, which are kept.
drop <- data %>%
  filter(na > 50)
dim(drop)

# Keep the rows with at most 50 missing values.
data <- data %>%
  filter(na <= 50)

# The helper count column is no longer needed.
data <- dplyr::select(data, -c(na))

# Re-check the missing-value counts across columns.
na <- apply(is.na(data), 2, sum)
max(na)
# NOTE: Printing the full vector is commented out because the output is long.
# print(na)
# sort(na, decreasing = TRUE)
head(sort(na, decreasing = TRUE), n = 25)

In [50]:
# Percentage of missing values per column, computed with Perc_Missing()
# (defined outside this cell).
perc <- apply(data, 2, Perc_Missing)
max(perc)
# NOTE: Printing the full vector is commented out because the output is long.
# print(perc)
# sort(perc, decreasing = TRUE)
head(sort(perc, decreasing = TRUE), n = 25)

# Keep only the variables with less than 15% missing data.
# `perc` is reused instead of recomputing it. The subset is taken on a plain
# data.frame because data[, idx] only selects columns for a data.frame; a
# data.table evaluates `j` as an expression and would return the index vector.
# drop = FALSE keeps a data.frame even if a single column survives.
keep <- which(perc < 15.0)
data <- as.data.frame(data)[, keep, drop = FALSE]

### Multicollinearity/Linear Dependence/Winsorization

In [51]:
# Set the columns that must not enter the regression aside in `data2`, and
# keep everything else in `data`.
setDT(data)
data2 <- select(data, stock, nextyr_price_var, sector)
data <- select(data, -stock, -nextyr_price_var, -sector)

# The response must be a factor for the binomial model.
data <- data[, `:=`(class = as.factor(class))]

# Fit a logistic regression on all remaining columns; the fit is used to
# identify linearly dependent variables.
set.seed(123)
glm <- suppressWarnings(
  glm(formula = class ~ ., family = binomial, data = data)
)

In [52]:
# Names of the linearly dependent variables: the row names of the `Complete`
# matrix returned by alias() for the fitted model.
vars <- rownames(alias(glm)$Complete)
vars

# Column positions of those variables in `data`.
remove <- match(vars, names(data))
remove

# Drop them, showing the dimensions before and after.
dim(data)
data <- select(data, -c(remove))
dim(data)

In [53]:
# Fit the logistic regression again now that the linearly dependent variables
# have been removed.
set.seed(123)
glm <- suppressWarnings(
  glm(formula = class ~ ., family = binomial, data = data)
)

In [54]:
# NOTE: This section of the code will take some time to run.
# VIF_Check() is defined outside this cell. As described by the author, it
# runs a regression and removes the variable with the maximum VIF, repeating
# this process until all VIFs are below the threshold.
# Here the threshold is 5, and the returned object replaces `data`.
data <- VIF_Check(dat=data, threshold=5)

[1] 42501422
[1] 42501422
[1] 14857839
[1] 7233613
[1] 589255.9
[1] 224275.1
[1] 136088.1
[1] 90188.32
[1] 85687.64
[1] 64580.32
[1] 47223.74
[1] 36937.24
[1] 34305.54
[1] 15651.42
[1] 8154.741
[1] 6957.033
[1] 3236.491
[1] 3432.8
[1] 3006.367
[1] 2107.776
[1] 1695.465
[1] 1459.61
[1] 1059
[1] 897.1616
[1] 805.2764
[1] 847.6565
[1] 787.2204
[1] 729.5147
[1] 593.1282
[1] 296.241
[1] 368.0201
[1] 235.8145
[1] 234.6721
[1] 207.8195
[1] 206.0238
[1] 175.2889
[1] 164.1711
[1] 119.1208
[1] 112.1676
[1] 86.7602
[1] 75.9692
[1] 98.85711
[1] 65.27094
[1] 44.12094
[1] 41.82128
[1] 38.25669
[1] 38.15252
[1] 1099.636
[1] 34.42022
[1] 25.95463
[1] 25.16551
[1] 24.20825
[1] 21.9634
[1] 21.5175
[1] 20.7845
[1] 17.37877
[1] 15.94764
[1] 11.32786
[1] 9.723508
[1] 9.203622
[1] 8.782897
[1] 8.662801
[1] 7.623118
[1] 7.273039
[1] 6.057325
[1] 6.021532
[1] 5.181338
[1] 5.031095
[1] 4.370228
[1] "All VIFs are below threshold of 5"


In [55]:
# Put the columns set aside in `data2` back in front of the remaining
# variables and report the combined dimensions.
data <- setDT(cbind(data2, data))
dim(data)

In [56]:
# Re-run the logistic regression on the recombined data, excluding the
# stock, nextyr_price_var and sector columns and allowing up to 100 iterations.
set.seed(123)
glm <- suppressWarnings(
  glm(
    formula = class ~ .,
    family = binomial(link = "logit"),
    data = data[, -c('stock','nextyr_price_var','sector')],
    control = list(maxit = 100)
  )
)

In [57]:
# Set aside the columns that must not be winsorized in `data2`; `data` keeps
# the remaining variables.
data2 <- select(data, class, year, sector_num, stock, nextyr_price_var, sector)
data <- select(data, -class, -year, -sector_num, -stock, -nextyr_price_var,
               -sector)

In [58]:
# Winsorize every remaining column at its 5th and 95th percentiles, ignoring
# missing values when the quantiles are computed.
# NOTE(review): newer DescTools releases may have changed Winsorize()'s
# arguments; confirm `probs` and `na.rm` are accepted by the installed version.
data <- map_df(data, function(column) {
  Winsorize(column, probs = c(0.05, 0.95), na.rm = TRUE)
})

# Put the set-aside columns back in front and report the dimensions.
data <- setDT(cbind(data2, data))
dim(data)

In [59]:
# Fit the logistic regression once more, this time without suppressing
# warnings, to check whether the earlier problem has been corrected.
# No warning message occurs.
set.seed(123)
glm <- glm(
  formula = class ~ .,
  family = binomial(link = "logit"),
  data = data[, -c('stock','nextyr_price_var','sector')],
  control = list(maxit = 100)
)
summary(glm)


Call:
glm(formula = class ~ ., family = binomial(link = "logit"), data = data[, 
    -c("stock", "nextyr_price_var", "sector")], control = list(maxit = 100))

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-2.308  -1.190   0.764   1.030   2.088  

Coefficients:
                                                Estimate Std. Error z value
(Intercept)                                   -1.370e+02  3.063e+01  -4.475
year                                           6.777e-02  1.519e-02   4.461
sector_num                                     6.455e-02  7.979e-03   8.090
cost_of_revenue                                4.328e-12  1.801e-11   0.240
r.d_expenses                                  -3.605e-10  4.957e-10  -0.727
sg.a_expense                                  -3.005e-11  4.774e-11  -0.629
interest_expense                              -2.202e-10  4.897e-10  -0.450
income_tax_expense                            -3.651e-11  2.491e-10  -0.147
net_income__noncontrolling_int        

### Imputing Missing Values

In [60]:
#Imputation will be implemented if necessary in the modeling notebook

### Uniformity

In [61]:
#Implementing scaling in the modeling notebook

### Additional Cleaning

In [62]:
#No additional cleaning was performed in this notebook

## Save the Modeling Dataset

In [63]:
# Write the cleaned modeling data set next to the input file in
# dir$final_data (defined outside this cell).
output_path <- paste0(dir$final_data, 'clean_financial.csv')
fwrite(data, output_path)

## Outcome

##### The following changes were made to the data set:
 - Duplicates: Duplicate columns were removed from the dataset. This was done by first using the duplicated function to identify identical columns. Then, I looked at the column sums to identify additional duplicate variables. In the end, about 10 variables were duplicated.
 - Variable Names: Variable names were cleaned up so that they were consistent. Spaces and dashes were removed, all letters were lower-cased, and underscores were added between most words.
 - Categorical Encoding: A new variable "sector_num" was created as the numeric version of the categorical variable "sector" to aid in machine learning models.
 - Missing Data: The missing data was cleaned in the following manner:
      1. Any row with more than 50 missing values was removed
      2. Any column with 15% or more of its values missing was removed
      3. The remaining values will be imputed after relevant variables are identified.
 - Outliers: The outliers in the data set were removed in the previous notebook using Cook's Distance. Only four rows were identified as extreme outliers and were thus removed from the data set.
 - Multicollinearity: A significant portion of the variables in the dataset had high multicollinearity and a few were even found to be perfectly collinear. To solve this issue, the perfectly collinear variables were removed first; then the variable with the highest VIF was removed and the regression re-run, repeating until every remaining VIF was below 5. This resulted in approximately 70 variables being removed from the dataset.
 -  Complete Separation: When running a basic regression, predicted probabilities that were effectively indistinguishable from 1 were occurring. To solve this issue, I first removed collinear variables. After this did not solve the issue, I winsorized the majority of the independent variables in the dataset and the problem was solved.