Import external libraries.

In [4]:
#install.packages("dplyr")
library("readxl")
library("pscl")
library(yardstick)
library(data.table)
library(MLmetrics)
library(ggplot2)

Read in the tile count and Prague length data for BEST2 and BEST3. Change the paths to point at your local copy of the ```barretts-segment-length-predictor``` directory as appropriate.

In [5]:
# BEST2 inputs: tile counts workbook and endoscopy CRF workbook
best2_tff3_tile_data <- read_excel(
    path = "~/Documents/barrets-segment-length-predictor/data/BEST2_TFF3_AND_GASTRIC_TILE_COUNTS_QC_PASSING.xlsx"
)
best2_prague_length_data <- read_excel(
    path = "~/Documents/barrets-segment-length-predictor/data/BEST2_ENDOSCOPY_CRF_CLEANED_SHORTENED.xlsx"
)

# BEST3 inputs: tile counts workbook and endoscopy CRF (CSV rather than Excel)
best3_tff3_tile_data <- read_excel(
    path = '~/Documents/barrets-segment-length-predictor/data/BEST3_TFF3_AND_GASTRIC_TILE_COUNTS_QC_PASSING.xlsx'
)
best3_prague_length_data <- read.csv(
    file = '~/Documents/barrets-segment-length-predictor/data/BEST3_ENDOSCOPY_CRF_CLEANED.csv'
)

New names:
* `` -> ...3


Merge data appropriately and add new columns for training.

In [6]:
# Join tile counts to Prague lengths within each cohort on the shared 'Case' key
best2_tff3_prague_data <- merge(best2_tff3_tile_data, best2_prague_length_data, by = "Case")
best3_tff3_prague_data <- merge(best3_tff3_tile_data, best3_prague_length_data, by = "Case")

# Remember how many rows each cohort contributes so the stacked table can be
# split back into its BEST2 (top) and BEST3 (bottom) parts afterwards
n_best2_rows <- nrow(best2_tff3_prague_data)
n_best3_rows <- nrow(best3_tff3_prague_data)

# Stack both cohorts, keeping only the columns used downstream
model_cols <- c("Case", "Gastric_count", "TFF3_positive_count", "PRAGUE_C", "PRAGUE_M")
tff3_prague_data <- rbind(
    best2_tff3_prague_data[, model_cols],
    best3_tff3_prague_data[, model_cols]
)

# Coerce the measurement columns to numeric
for (numeric_col in c("PRAGUE_C", "PRAGUE_M", "TFF3_positive_count", "Gastric_count")) {
    tff3_prague_data[[numeric_col]] <- as.numeric(tff3_prague_data[[numeric_col]])
}

# Binary indicators: C>=1, M>=3, and (C>=1 OR M>=3)
c_gtet_1 <- tff3_prague_data$PRAGUE_C >= 1
m_gtet_3 <- tff3_prague_data$PRAGUE_M >= 3
tff3_prague_data$PRAGUE_C_gtet_1cm <- ifelse(c_gtet_1, 1, 0)
tff3_prague_data$PRAGUE_M_gtet_3cm <- ifelse(m_gtet_3, 1, 0)
tff3_prague_data$C_gtet_1_or_M_gtet_3 <- ifelse(c_gtet_1 | m_gtet_3, 1, 0)

# Doubled C and M columns so the Poisson regression models see only integer values
tff3_prague_data$PRAGUE_M_doubled <- 2 * tff3_prague_data$PRAGUE_M
tff3_prague_data$PRAGUE_C_doubled <- 2 * tff3_prague_data$PRAGUE_C

# Re-split the stacked table into its BEST2 and BEST3 pieces
best2_tff3_prague_data <- head(tff3_prague_data, n_best2_rows)
best3_tff3_prague_data <- tail(tff3_prague_data, n_best3_rows)

# Convert all three tables to data.table (setDT converts by reference)
setDT(best2_tff3_prague_data)
setDT(best3_tff3_prague_data)
setDT(tff3_prague_data)

# Versions of the above tables without patients whose TFF3 positive tile count is 0
best2_tff3_prague_data_nozeros <- best2_tff3_prague_data[TFF3_positive_count != 0]
best3_tff3_prague_data_nozeros <- best3_tff3_prague_data[TFF3_positive_count != 0]
tff3_prague_data_nozeros <- tff3_prague_data[TFF3_positive_count != 0]

Generate Figure 2 plots. These plots aren't used in the paper. See ```create_figure_2.ipynb``` for the code to generate the paper figure 2 plots.

In [7]:
# Plot 2a
#plot1 <- ggplot(data=tff3_prague_data, aes(x=TFF3_positive_count)) + geom_histogram() + 
#    xlab('TFF3 positive tile count') + ylab('Count') + theme(axis.text=element_text(size=18), axis.title=element_text(size=22, face="bold"))
#plot1
#ggsave(file="~/Downloads/plot2a.svg", plot=plot1, width=10, height=10)

# Plot 2b
#plot2 <- ggplot(data=tff3_prague_data, aes(x=PRAGUE_C, y=TFF3_positive_count, group=PRAGUE_C)) + 
#    geom_boxplot() + xlab('Prague C length (cm)') + ylab('TFF3 positive tile count') + coord_flip() +
#     theme(axis.text=element_text(size=18), axis.title=element_text(size=22, face="bold"))
#plot2
#ggsave(file="~/Downloads/plot2b.svg", plot=plot2, width=10, height=10)

# Plot 2c
#plot3 <- ggplot(data=tff3_prague_data, aes(x=PRAGUE_M, y=TFF3_positive_count, group=PRAGUE_M)) + 
#    geom_boxplot() + xlab('Prague M length (cm)') + ylab('TFF3 positive tile count') + coord_flip() +
#     theme(axis.text=element_text(size=18), axis.title=element_text(size=22, face="bold"))
#plot3
#ggsave(file="~/Downloads/plot2c.svg", plot=plot3, width=10, height=10)

Compute Spearman's correlation coefficients.

In [8]:
# Spearman rank correlations between tile counts and Prague lengths, computed
# on the BEST2 table that excludes patients with a zero TFF3 positive count
best2_nz <- best2_tff3_prague_data_nozeros

print('Prague C ~ TFF3 positive tile count Spearman rho')
cor(best2_nz$TFF3_positive_count, best2_nz$PRAGUE_C, method = "spearman")

print('Prague M ~ TFF3 positive tile count Spearman rho')
cor(best2_nz$TFF3_positive_count, best2_nz$PRAGUE_M, method = "spearman")

print('Prague C ~ Gastric tile count Spearman rho')
cor(best2_nz$Gastric_count, best2_nz$PRAGUE_C, method = "spearman")

print('Prague M ~ Gastric tile count Spearman rho')
cor(best2_nz$Gastric_count, best2_nz$PRAGUE_M, method = "spearman")

[1] "Prague C ~ TFF3 positive tile count Spearman rho"


[1] "Prague M ~ TFF3 positive tile count Spearman rho"


[1] "Prague C ~ Gastric tile count Spearman rho"


[1] "Prague M ~ Gastric tile count Spearman rho"


Train 5-fold cross validation logistic regression models for predicting segment length class: (C≥1 OR M≥3) vs. (C<1 AND M<3)

In [9]:
# Create 5 folds for cross validation over the shuffled patient table.
# The 691-patient figure (138 patients per val fold, 553 per training fold)
# applies to the BEST2+BEST3 option; the BEST2-only option active below
# yields a smaller cohort.
set.seed(111)

# For BEST2 only analyses
tff3_prague_data_shuffled <- best2_tff3_prague_data_nozeros[sample(nrow(best2_tff3_prague_data_nozeros)),]

# For BEST2+BEST3 analyses
#tff3_prague_data_shuffled <- tff3_prague_data_nozeros[sample(nrow(tff3_prague_data_nozeros)),]
nrow(tff3_prague_data_shuffled)

# Assign every row to exactly one of 5 contiguous folds.
# Fold sizes differ by at most one row and the larger folds come first.
# The previous round()/ceiling() arithmetic could create a 6th group when
# n %% 5 was 1 or 2, and those rows were then silently left out of every
# training and validation set.
n_folds <- 5L
n <- nrow(tff3_prague_data_shuffled)
stopifnot(n >= n_folds)
fold_size <- ceiling(n / n_folds)  # size of the largest fold
r <- sort(rep_len(seq_len(n_folds), n))
d <- split(tff3_prague_data_shuffled, r)

# Validation component of each fold
val1 <- d[["1"]]
val2 <- d[["2"]]
val3 <- d[["3"]]
val4 <- d[["4"]]
val5 <- d[["5"]]

# Training component of each fold: the other four validation components
train1 <- rbind(val2, val3, val4, val5)
train2 <- rbind(val1, val3, val4, val5)
train3 <- rbind(val1, val2, val4, val5)
train4 <- rbind(val1, val2, val3, val5)
train5 <- rbind(val1, val2, val3, val4)

# 5-fold classification models trained on training components
# One logistic regression per fold: (C>=1 OR M>=3) indicator explained by the
# TFF3 positive tile count, each fitted on that fold's training component
logreg_c_gtet_1cm_or_m_gtet_3cm_1 <- glm(
    formula = C_gtet_1_or_M_gtet_3 ~ TFF3_positive_count,
    family = binomial(link = "logit"),
    data = train1
)
logreg_c_gtet_1cm_or_m_gtet_3cm_2 <- glm(
    formula = C_gtet_1_or_M_gtet_3 ~ TFF3_positive_count,
    family = binomial(link = "logit"),
    data = train2
)
logreg_c_gtet_1cm_or_m_gtet_3cm_3 <- glm(
    formula = C_gtet_1_or_M_gtet_3 ~ TFF3_positive_count,
    family = binomial(link = "logit"),
    data = train3
)
logreg_c_gtet_1cm_or_m_gtet_3cm_4 <- glm(
    formula = C_gtet_1_or_M_gtet_3 ~ TFF3_positive_count,
    family = binomial(link = "logit"),
    data = train4
)
logreg_c_gtet_1cm_or_m_gtet_3cm_5 <- glm(
    formula = C_gtet_1_or_M_gtet_3 ~ TFF3_positive_count,
    family = binomial(link = "logit"),
    data = train5
)

“glm.fit: fitted probabilities numerically 0 or 1 occurred”

Evaluate model performance on the validation component of each fold.

In [14]:
#bal_accuracy_vec

# define performance metric functions
# Squared correlation between observed and predicted values.
#   y_actual:  numeric vector of observed values
#   y_predict: numeric vector of predictions, same length as y_actual
# Returns cor(y_actual, y_predict)^2 as a single number.
RSQUARE <- function(y_actual, y_predict) {
  correlation <- cor(y_actual, y_predict)
  correlation^2
}
# Root mean squared error between observed and predicted values.
#   y_actual:  numeric vector of observed values
#   y_predict: numeric vector of predictions, same length as y_actual
# Returns the square root of the mean squared difference.
RMSE <- function(y_actual, y_predict) {
    squared_error <- (y_predict - y_actual)^2
    sqrt(mean(squared_error))
}
# Threshold predicted probabilities at 0.5 and summarise classification quality.
#   y_actual:  vector of true class labels coded 0/1
#   y_predict: numeric vector of predicted probabilities, same length as y_actual
# Returns a named list: Accuracy, Balanced_acc, and Precision/Recall/F1 computed
# twice, with class 1 as the positive class (`*_long`) and with class 0 as the
# positive class (`*_short`).
METRIFY <- function(y_actual, y_predict){
    # Hard class predictions, computed once and shared by every metric
    y_class <- ifelse(y_predict >= 0.5, 1, 0)

    # Fix the level set explicitly so truth and estimate always share the same
    # levels, even when a fold's predictions (or truths) contain a single class;
    # otherwise the two factors would have different levels.
    class_levels <- c(0, 1)
    truth_factor <- factor(y_actual, levels=class_levels)
    estimate_factor <- factor(y_class, levels=class_levels)

    acc <- Accuracy(y_class, y_actual)
    acc_balanced <- bal_accuracy_vec(truth_factor, estimate_factor)
    prec_long <- Precision(y_actual, y_class, positive=1)
    prec_short <- Precision(y_actual, y_class, positive=0)
    rec_long <- Recall(y_actual, y_class, positive=1)
    rec_short <- Recall(y_actual, y_class, positive=0)
    f1_long <- F1_Score(y_actual, y_class, positive=1)
    f1_short <- F1_Score(y_actual, y_class, positive=0)

    list(Accuracy=acc, Balanced_acc=acc_balanced, Precision_long=prec_long, Precision_short=prec_short,
         Recall_long=rec_long, Recall_short=rec_short, F1_long=f1_long, F1_short=f1_short)
}

# get classification predictions for each fold on validation components
# Build the single-column predictor frame that predict() receives for a fold
tff3_newdata <- function(fold) {
    data.frame(TFF3_positive_count = fold$TFF3_positive_count)
}

# Predicted probabilities (type = 'response') from each fold's model on its own
# validation component
logreg_c_gtet_1cm_or_m_gtet_3cm_1_pred <- predict(
    logreg_c_gtet_1cm_or_m_gtet_3cm_1, newdata = tff3_newdata(val1), type = 'response'
)
logreg_c_gtet_1cm_or_m_gtet_3cm_2_pred <- predict(
    logreg_c_gtet_1cm_or_m_gtet_3cm_2, newdata = tff3_newdata(val2), type = 'response'
)
logreg_c_gtet_1cm_or_m_gtet_3cm_3_pred <- predict(
    logreg_c_gtet_1cm_or_m_gtet_3cm_3, newdata = tff3_newdata(val3), type = 'response'
)
logreg_c_gtet_1cm_or_m_gtet_3cm_4_pred <- predict(
    logreg_c_gtet_1cm_or_m_gtet_3cm_4, newdata = tff3_newdata(val4), type = 'response'
)
logreg_c_gtet_1cm_or_m_gtet_3cm_5_pred <- predict(
    logreg_c_gtet_1cm_or_m_gtet_3cm_5, newdata = tff3_newdata(val5), type = 'response'
)

# Get performance metric results
# Score each fold's validation predictions against the true
# (C>=1 OR M>=3) indicator
fold1 <- METRIFY(val1$C_gtet_1_or_M_gtet_3, logreg_c_gtet_1cm_or_m_gtet_3cm_1_pred)
fold2 <- METRIFY(val2$C_gtet_1_or_M_gtet_3, logreg_c_gtet_1cm_or_m_gtet_3cm_2_pred)
fold3 <- METRIFY(val3$C_gtet_1_or_M_gtet_3, logreg_c_gtet_1cm_or_m_gtet_3cm_3_pred)
fold4 <- METRIFY(val4$C_gtet_1_or_M_gtet_3, logreg_c_gtet_1cm_or_m_gtet_3cm_4_pred)
#print(val5$C_gtet_1_or_M_gtet_3)
#print(logreg_c_gtet_1cm_or_m_gtet_3cm_5_pred)
fold5 <- METRIFY(val5$C_gtet_1_or_M_gtet_3, logreg_c_gtet_1cm_or_m_gtet_3cm_5_pred)

# One row per fold, one column per metric
results <- rbindlist(list(fold1, fold2, fold3, fold4, fold5))
row.names(results) <- c("Fold1", "Fold2", "Fold3", "Fold4", "Fold5")

# The table shows all five folds, so the heading says "per-fold" rather than "fold 1"
print('Logistic regression C>=1cm or M>=3cm per-fold performance:')
results
print("Means:")
colMeans(results)
print("Standard deviations:")
# vapply guarantees one numeric value per metric column
vapply(results, sd, numeric(1))

[1] "Logistic regression C>=1cm or M>=3cm fold 1 performance:"


Unnamed: 0,Accuracy,Balanced_acc,Precision_long,Precision_short,Recall_long,Recall_short,F1_long,F1_short
Fold1,0.8275862,0.7887768,0.8709677,0.72,0.8852459,0.6923077,0.8780488,0.7058824
Fold2,0.8390805,0.7814815,0.8484848,0.8095238,0.9333333,0.6296296,0.8888889,0.7083333
Fold3,0.8275862,0.8517802,0.9649123,0.5666667,0.8088235,0.8947368,0.88,0.6938776
Fold4,0.816092,0.772807,0.8253968,0.7916667,0.9122807,0.6333333,0.8666667,0.7037037
Fold5,0.8372093,0.8179487,0.8965517,0.7142857,0.8666667,0.7692308,0.8813559,0.7407407


[1] "Means:"


[1] "Standard deviations:"


Train 5-fold Poisson regression and zero-inflated Poisson (ZIP) regression models for predicting both C and M length.

In [15]:
# Poisson regressions of doubled Prague C length on TFF3 positive tile count,
# one per training fold
psn_c_1_doubled <- glm(formula = PRAGUE_C_doubled ~ TFF3_positive_count, family = poisson(link = "log"), data = train1)
psn_c_2_doubled <- glm(formula = PRAGUE_C_doubled ~ TFF3_positive_count, family = poisson(link = "log"), data = train2)
psn_c_3_doubled <- glm(formula = PRAGUE_C_doubled ~ TFF3_positive_count, family = poisson(link = "log"), data = train3)
psn_c_4_doubled <- glm(formula = PRAGUE_C_doubled ~ TFF3_positive_count, family = poisson(link = "log"), data = train4)
psn_c_5_doubled <- glm(formula = PRAGUE_C_doubled ~ TFF3_positive_count, family = poisson(link = "log"), data = train5)

# Zero-inflated Poisson counterparts for doubled Prague C length
zip_c_1_doubled <- zeroinfl(formula = PRAGUE_C_doubled ~ TFF3_positive_count, data = train1, dist = "poisson")
zip_c_2_doubled <- zeroinfl(formula = PRAGUE_C_doubled ~ TFF3_positive_count, data = train2, dist = "poisson")
zip_c_3_doubled <- zeroinfl(formula = PRAGUE_C_doubled ~ TFF3_positive_count, data = train3, dist = "poisson")
zip_c_4_doubled <- zeroinfl(formula = PRAGUE_C_doubled ~ TFF3_positive_count, data = train4, dist = "poisson")
zip_c_5_doubled <- zeroinfl(formula = PRAGUE_C_doubled ~ TFF3_positive_count, data = train5, dist = "poisson")

# Poisson regressions of doubled Prague M length, one per training fold
psn_m_1_doubled <- glm(formula = PRAGUE_M_doubled ~ TFF3_positive_count, family = poisson(link = "log"), data = train1)
psn_m_2_doubled <- glm(formula = PRAGUE_M_doubled ~ TFF3_positive_count, family = poisson(link = "log"), data = train2)
psn_m_3_doubled <- glm(formula = PRAGUE_M_doubled ~ TFF3_positive_count, family = poisson(link = "log"), data = train3)
psn_m_4_doubled <- glm(formula = PRAGUE_M_doubled ~ TFF3_positive_count, family = poisson(link = "log"), data = train4)
psn_m_5_doubled <- glm(formula = PRAGUE_M_doubled ~ TFF3_positive_count, family = poisson(link = "log"), data = train5)

# Zero-inflated Poisson counterparts for doubled Prague M length
zip_m_1_doubled <- zeroinfl(formula = PRAGUE_M_doubled ~ TFF3_positive_count, data = train1, dist = "poisson")
zip_m_2_doubled <- zeroinfl(formula = PRAGUE_M_doubled ~ TFF3_positive_count, data = train2, dist = "poisson")
zip_m_3_doubled <- zeroinfl(formula = PRAGUE_M_doubled ~ TFF3_positive_count, data = train3, dist = "poisson")
zip_m_4_doubled <- zeroinfl(formula = PRAGUE_M_doubled ~ TFF3_positive_count, data = train4, dist = "poisson")
zip_m_5_doubled <- zeroinfl(formula = PRAGUE_M_doubled ~ TFF3_positive_count, data = train5, dist = "poisson")

“glm.fit: fitted probabilities numerically 0 or 1 occurred”

Evaluate regression models' performances on validation components of each fold.

In [16]:
# Predict on validation components
# Build the single-column predictor frame that predict() receives for a fold
tff3_newdata <- function(fold) {
    data.frame(TFF3_positive_count = fold$TFF3_positive_count)
}

# Poisson, Prague C: response-scale predictions (still on the doubled scale)
psn_c_1_pred_doubled <- predict(psn_c_1_doubled, newdata = tff3_newdata(val1), type = 'response')
psn_c_2_pred_doubled <- predict(psn_c_2_doubled, newdata = tff3_newdata(val2), type = 'response')
psn_c_3_pred_doubled <- predict(psn_c_3_doubled, newdata = tff3_newdata(val3), type = 'response')
psn_c_4_pred_doubled <- predict(psn_c_4_doubled, newdata = tff3_newdata(val4), type = 'response')
psn_c_5_pred_doubled <- predict(psn_c_5_doubled, newdata = tff3_newdata(val5), type = 'response')

# ZIP, Prague C
zip_c_1_pred_doubled <- predict(zip_c_1_doubled, newdata = tff3_newdata(val1), type = 'response')
zip_c_2_pred_doubled <- predict(zip_c_2_doubled, newdata = tff3_newdata(val2), type = 'response')
zip_c_3_pred_doubled <- predict(zip_c_3_doubled, newdata = tff3_newdata(val3), type = 'response')
zip_c_4_pred_doubled <- predict(zip_c_4_doubled, newdata = tff3_newdata(val4), type = 'response')
zip_c_5_pred_doubled <- predict(zip_c_5_doubled, newdata = tff3_newdata(val5), type = 'response')

# Poisson, Prague M
psn_m_1_pred_doubled <- predict(psn_m_1_doubled, newdata = tff3_newdata(val1), type = 'response')
psn_m_2_pred_doubled <- predict(psn_m_2_doubled, newdata = tff3_newdata(val2), type = 'response')
psn_m_3_pred_doubled <- predict(psn_m_3_doubled, newdata = tff3_newdata(val3), type = 'response')
psn_m_4_pred_doubled <- predict(psn_m_4_doubled, newdata = tff3_newdata(val4), type = 'response')
psn_m_5_pred_doubled <- predict(psn_m_5_doubled, newdata = tff3_newdata(val5), type = 'response')

# ZIP, Prague M
zip_m_1_pred_doubled <- predict(zip_m_1_doubled, newdata = tff3_newdata(val1), type = 'response')
zip_m_2_pred_doubled <- predict(zip_m_2_doubled, newdata = tff3_newdata(val2), type = 'response')
zip_m_3_pred_doubled <- predict(zip_m_3_doubled, newdata = tff3_newdata(val3), type = 'response')
zip_m_4_pred_doubled <- predict(zip_m_4_doubled, newdata = tff3_newdata(val4), type = 'response')
zip_m_5_pred_doubled <- predict(zip_m_5_doubled, newdata = tff3_newdata(val5), type = 'response')

# Print results
# Print the per-fold R^2 and the mean R^2 for one model family / target pair.
#   label:                prefix for every printed line, e.g.
#                         "Poisson regression C length"
#   actual_by_fold:       list of observed length vectors, one per validation fold
#   pred_doubled_by_fold: list of predictions on the doubled scale, in the same
#                         fold order; halved here to return to the original scale
# Returns the per-fold R^2 values invisibly.
report_fold_r2 <- function(label, actual_by_fold, pred_doubled_by_fold) {
    stopifnot(length(actual_by_fold) == length(pred_doubled_by_fold))
    # Compute each fold's R^2 once and reuse it for both the fold line and the mean
    fold_r2 <- vapply(seq_along(actual_by_fold), function(i) {
        RSQUARE(actual_by_fold[[i]], pred_doubled_by_fold[[i]] / 2)
    }, numeric(1))
    for (i in seq_along(fold_r2)) {
        print(paste0(label, " fold ", i, " R^2 error: ", fold_r2[i]))
    }
    # Reduce() adds left to right, like a chained `+` over the folds
    print(paste0(label, " Mean R^2: ", Reduce(`+`, fold_r2) / length(fold_r2)))
    invisible(fold_r2)
}

# Observed lengths per validation fold. The M-length means must be scored
# against PRAGUE_M; they were previously accumulated against PRAGUE_C.
actual_c_by_fold <- list(val1$PRAGUE_C, val2$PRAGUE_C, val3$PRAGUE_C, val4$PRAGUE_C, val5$PRAGUE_C)
actual_m_by_fold <- list(val1$PRAGUE_M, val2$PRAGUE_M, val3$PRAGUE_M, val4$PRAGUE_M, val5$PRAGUE_M)

report_fold_r2("Poisson regression C length", actual_c_by_fold,
               list(psn_c_1_pred_doubled, psn_c_2_pred_doubled, psn_c_3_pred_doubled,
                    psn_c_4_pred_doubled, psn_c_5_pred_doubled))
print('')
report_fold_r2("ZIP regression C length", actual_c_by_fold,
               list(zip_c_1_pred_doubled, zip_c_2_pred_doubled, zip_c_3_pred_doubled,
                    zip_c_4_pred_doubled, zip_c_5_pred_doubled))
print('')
report_fold_r2("Poisson regression M length", actual_m_by_fold,
               list(psn_m_1_pred_doubled, psn_m_2_pred_doubled, psn_m_3_pred_doubled,
                    psn_m_4_pred_doubled, psn_m_5_pred_doubled))
print('')
report_fold_r2("ZIP regression M length", actual_m_by_fold,
               list(zip_m_1_pred_doubled, zip_m_2_pred_doubled, zip_m_3_pred_doubled,
                    zip_m_4_pred_doubled, zip_m_5_pred_doubled))

[1] "Poisson regression C length fold 1 R^2 error: 0.299086481764143"
[1] "Poisson regression C length fold 2 R^2 error: 0.237490360097952"
[1] "Poisson regression C length fold 3 R^2 error: 0.257959079249613"
[1] "Poisson regression C length fold 4 R^2 error: 0.336969241357576"
[1] "Poisson regression C length fold 5 R^2 error: 0.0198825913480809"
[1] "Poisson regression C length Mean R^2: 0.230277550763473"
[1] ""
[1] "ZIP regression C length fold 1 R^2 error: 0.405952809975251"
[1] "ZIP regression C length fold 2 R^2 error: 0.467705070105087"
[1] "ZIP regression C length fold 3 R^2 error: 0.304139296317477"
[1] "ZIP regression C length fold 4 R^2 error: 0.514497011559017"
[1] "ZIP regression C length fold 5 R^2 error: 0.0905175029814184"
[1] "ZIP regression C length Mean R^2: 0.35656233818765"
[1] ""
[1] "Poisson regression M length fold 1 R^2 error: 0.213027472708732"
[1] "Poisson regression M length fold 2 R^2 error: 0.19778547128652"
[1] "Poisson regression M length fold 3 R^2 er