# Statistical Testing for LSTM Predictions

## Import data

In [1]:
# Per-observation records; the printed preview shows race and sentiment columns.
observations <- read.csv(file = "./data/observations.csv", header = TRUE)

# One measurement file per group; the printed preview of `black` shows
# preds and labels columns.
black <- read.csv(file = "./data/black_measurements.csv", header = TRUE)
white <- read.csv(file = "./data/white_measurements.csv", header = TRUE)
asian <- read.csv(file = "./data/asian_measurements.csv", header = TRUE)
hisp <- read.csv(file = "./data/hisp_measurements.csv", header = TRUE)

# Quick sanity check of the column names and the first rows.
names(observations)
names(black)
head(observations)
head(black)

race,sentiment
white,neg
white,neg
asian,pos
hisp,pos
hisp,neg
asian,pos


preds,labels
1,1
1,1
0,0
0,0
0,0
0,0


## T-tests to compare predictions with labels

In [2]:
# Compare the mean prediction with the mean label within each group using
# t.test() at a 95% confidence level. Wrapping each assignment in parentheses
# stores the result and prints it in the same step.
# Each group gets its own result object (test1..test4) so that no result is
# overwritten by a later one.
# NOTE(review): preds and labels look row-aligned (one prediction per label);
# if so, a paired test may be more appropriate than the default two-sample
# Welch test -- confirm before changing the analysis.
(test1 <- t.test(black$preds, black$labels, conf.level = .95))
(test2 <- t.test(white$preds, white$labels, conf.level = .95))
(test3 <- t.test(asian$preds, asian$labels, conf.level = .95))
(test4 <- t.test(hisp$preds, hisp$labels, conf.level = .95))


	Welch Two Sample t-test

data:  black$preds and black$labels
t = 0.56341, df = 198, p-value = 0.5738
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.100005  0.180005
sample estimates:
mean of x mean of y 
     0.53      0.49 



	Welch Two Sample t-test

data:  white$preds and white$labels
t = 0.7239, df = 197.88, p-value = 0.47
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.08620835  0.18620835
sample estimates:
mean of x mean of y 
     0.41      0.36 



	Welch Two Sample t-test

data:  asian$preds and asian$labels
t = 1.1376, df = 197.92, p-value = 0.2567
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.05868034  0.21868034
sample estimates:
mean of x mean of y 
     0.48      0.40 



	Welch Two Sample t-test

data:  hisp$preds and hisp$labels
t = 1.135, df = 197.95, p-value = 0.2577
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.05899146  0.21899146
sample estimates:
mean of x mean of y 
     0.49      0.41 


## Create contingency table

In [3]:
# Cross-tabulate race (rows) against sentiment (columns); the outer
# parentheses print the resulting contingency table as well as storing it.
(tbl <- table(observations$race, observations$sentiment))

       
        neg pos
  asian  52  48
  black  47  53
  hisp   51  49
  white  59  41

## Perform Chi-squared test of independence

In [4]:
# Pearson's chi-squared test on the race x sentiment contingency table:
# tests whether sentiment is independent of race.
chisq.test(tbl)


	Pearson's Chi-squared test

data:  tbl
X-squared = 2.9961, df = 3, p-value = 0.3922


## Perform one-way ANOVA and pairwise t-tests to compare sentiment polarity across races

### Convert sentiment strings into integer polarities

In [5]:
# Encode sentiment as a numeric polarity: "neg" -> 0, any other value -> 1.
# The comparison is vectorised over the whole column, so it needs no index
# loop, does not grow a vector element by element, and also works on a
# zero-row data frame (where 1:nrow(observations) would iterate over c(1, 0)).
# A missing sentiment yields NA rather than stopping with an error.
vec <- as.numeric(observations$sentiment != "neg")

observations$polarities <- vec

head(observations)

race,sentiment,polarities
white,neg,0
white,neg,0
asian,pos,1
hisp,pos,1
hisp,neg,0
asian,pos,1


### One-way ANOVA

In [6]:
# Fit a one-way ANOVA of polarity on race; the outer parentheses print the
# fitted object (sums of squares and degrees of freedom) as well as storing it.
(aov1 <- aov(polarities ~ race, data = observations))
# ANOVA table with the F statistic and p-value for the race effect.
summary(aov1)

Call:
   aov(formula = polarities ~ race, data = observations)

Terms:
                   race Residuals
Sum of Squares   0.7475   99.0500
Deg. of Freedom       3       396

Residual standard error: 0.5001262
Estimated effects may be unbalanced

             Df Sum Sq Mean Sq F value Pr(>F)
race          3   0.75  0.2492   0.996  0.395
Residuals   396  99.05  0.2501               

### Pairwise t-test

In [8]:
# Pairwise t-tests of polarity between every pair of races.
# p.adjust.method = "none" means the reported p-values are NOT corrected for
# multiple comparisons.
# NOTE(review): consider "holm" or "bonferroni" if family-wise error control
# is wanted -- confirm with the analysis plan.
pairwise.t.test(observations$polarities, observations$race, p.adjust.method = "none")


	Pairwise comparisons using t tests with pooled SD 

data:  observations$polarities and observations$race 

      asian black hisp 
black 0.480 -     -    
hisp  0.888 0.572 -    
white 0.323 0.091 0.259

P value adjustment method: none 