# Calculate logistic regression for tag use before/after an event
Dependent variable (y): whether an individual fic has a certain type of tag (such as trans)  
Independent variable (X): whether the fic was published before or after the event  
Covariates (X): 
* numeric date, 
* whether the fic comes from a hi-LGBTQ or a lo-LGBTQ fandom, 
* the specific fandom

## Multiple tag outcomes (stacked data)

In [1]:
# Load the stacked data set (the preview shows one row per fic and tag label).
# `after_event` and `Data` are read as factors so that their 'True'/'False'
# strings become factor levels.
data <- read.csv(
  '/data/fanfiction_ao3/tmp/stacked1.csv',
  stringsAsFactors = TRUE,
  colClasses = c(after_event = 'factor', Data = 'factor')
)

# Choose the reference (baseline) level of every factor used in the models.
data$Label <- relevel(data$Label, ref = 'has_tag_1')
data$dataset <- relevel(data$dataset, ref = 'lo_lgbtq')
data$fandom <- relevel(data$fandom, ref = 'tolkien')
data$after_event <- relevel(data$after_event, ref = 'False')
data$Data <- relevel(data$Data, ref = 'False')
data

fic_id,after_event,monthly_avg_tags,fandom,dataset,has_tag_10,has_tag_25,has_tag_50,has_tag_100,Label,Data
<int>,<fct>,<dbl>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_trans,False
4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_gay,False
4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_queerphobia,False
4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_wedding-marriage,False
4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_M/M,False
4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_F/F,False
4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_F/M,True
4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_Gen,False
4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_Multi,False
4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_Other,False


In [2]:
# Plain binomial GLM with a logit link, fitted on the stacked data.
# The commented-out calls below are alternative specifications kept for
# reference; exactly one model is fitted per run of this cell.

# logit = glm('Data ~ Label * dataset * after_event + dataset:fandom + monthly_avg_tags', data=data, family=binomial(link='logit'))
# logit = glm('choice ~ Label + 
#                     after_event + 
#                     monthly_avg_tags + 
#                     dataset:fandom + 
#                     dataset + 
#                     dataset:Label + 
#                     after_event:Label + 
#                     after_event:dataset +
#                     after_event:dataset:Label', data=data, family=binomial(link='logit'))
# logit = glm('choice ~ Label', data=data, family=binomial(link='logit'))
# logit = glm('Data ~ monthly_avg_tags', data=data, family=binomial(link='logit'))
logit <- glm('Data ~ dataset', data = data, family = binomial(link = 'logit'))
# logit = glm('Data ~ fandom', data=data, family=binomial(link='logit'))
# logit = glm('Data ~ fandom', data=filtered, family=binomial(link='logit'))

# Keep the summary in `s`, then report the AIC and the coefficient table.
s <- summary(logit)
print(s[['aic']])
coef(s)

[1] 115650.4


Unnamed: 0,Estimate,Std. Error,z value,Pr(>|z|)
(Intercept),-2.21943179,0.01156931,-191.837862,0.0
datasethi_lgbtq,-0.04670249,0.01590562,-2.936225,0.00332233


In [3]:
# Preview the first rows of the design matrix built for the fitted model.
head(model.matrix(logit))

Unnamed: 0,(Intercept),datasethi_lgbtq
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1


In [4]:
# Display the full design matrix of the fitted model (one row per observation).
model.matrix(logit)

Unnamed: 0,(Intercept),datasethi_lgbtq
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1
10,1,1


In [35]:
# Optionally restrict the data to a handful of fandoms.
# filtered <- data[data$fandom %in% c('buffy','dcu','teenwolf'),]
filtered <- subset(data, fandom %in% c('buffy', 'teenwolf'))
filtered

Unnamed: 0_level_0,fic_id,after_event,monthly_avg_tags,fandom,dataset,has_tag_10,has_tag_25,has_tag_50,has_tag_100,Label,Data
Unnamed: 0_level_1,<int>,<fct>,<dbl>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_trans,False
2,4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_gay,False
3,4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_queerphobia,False
4,4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_wedding-marriage,False
5,4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_M/M,False
6,4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_F/F,False
7,4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_F/M,True
8,4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_Gen,False
9,4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_Multi,False
10,4436162,True,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_Other,False


In [36]:
# Save the filtered rows to disk; row names are written as the first column.
write.csv(filtered, '/data/fanfiction_ao3/tmp/stacked1_filtered2.csv', row.names=TRUE)

In [31]:
# List the factor levels of every column in `filtered`
# (non-factor columns yield NULL).
sapply(filtered, levels)

In [46]:
# Print the model summary currently stored in `s`.
s


Call:
glm(formula = "Data ~ dataset", family = binomial(link = "logit"), 
    data = data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.4542  -0.4542  -0.4442  -0.4442   2.1748  

Coefficients:
                Estimate Std. Error  z value Pr(>|z|)    
(Intercept)     -2.21943    0.01157 -191.838  < 2e-16 ***
datasethi_lgbtq -0.04670    0.01591   -2.936  0.00332 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 115655  on 183091  degrees of freedom
Residual deviance: 115646  on 183090  degrees of freedom
AIC: 115650

Number of Fisher Scoring iterations: 5


In [39]:
# Derive logical versions of the outcome and of the event indicator
# (for multinomial logistic models). as.logical() on a factor works from the
# level labels, so 'True'/'False' map to TRUE/FALSE.

# data$choice <- as.logical(as.integer(data$Data))
data[['choice']] <- as.logical(data[['Data']])
data[['after_event']] <- as.logical(data[['after_event']])
data

fic_id,after_event,monthly_avg_tags,fandom,dataset,has_tag_10,has_tag_25,has_tag_50,has_tag_100,Label,Data,choice
<int>,<lgl>,<dbl>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<lgl>
4436162,TRUE,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_trans,False,FALSE
4436162,TRUE,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_gay,False,FALSE
4436162,TRUE,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_queerphobia,False,FALSE
4436162,TRUE,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_wedding-marriage,False,FALSE
4436162,TRUE,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_M/M,False,FALSE
4436162,TRUE,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_F/F,False,FALSE
4436162,TRUE,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_F/M,True,TRUE
4436162,TRUE,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_Gen,False,FALSE
4436162,TRUE,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_Multi,False,FALSE
4436162,TRUE,4.606287,buffy,hi_lgbtq,False,False,False,False,contains_Other,False,FALSE


In [4]:
library('mlogit')

In [49]:
library('nnet')

In [16]:
library('lme4')

Loading required package: Matrix



In [25]:
# Mixed-effects logistic model fitted with lme4::glmer. The commented-out
# calls are earlier or alternative specifications kept for reference.

# logit = glm('Data ~ Label * dataset * after_event + dataset:fandom + monthly_avg_tags', data=data, family=binomial(link='logit'))
# logit = glm('Data ~ Label + 
#                     after_event + 
#                     monthly_avg_tags + 
#                     dataset:fandom + 
#                     dataset + 
#                     dataset:Label + 
#                     after_event:Label + 
#                     after_event:dataset +
#                     after_event:dataset:Label', data=data, family=binomial(link='logit'))

# NOTE(review): the bar term is not parenthesised; lme4 formulas usually write
# random-effects terms as (Label | fandom) -- confirm this parses as intended.
# NOTE(review): the recorded run emitted a "maxfun < 10 * length(par)^2"
# warning -- presumably the model has many parameters; verify convergence.
logit <- glmer('Data ~ Label | fandom', data = data, family = binomial(link = 'logit'))
# logit = multinom('choice ~ Label', data=data, family=binomial(link='logit'))
# logit = mlogit(choice ~ Label, data=data, choice='choice', shape='long', id.var='fic_id')

# Keep the summary in `s` and display it.
s <- summary(logit)
s
# print(s$aic)
# s$coefficients

“maxfun < 10 * length(par)^2 is not recommended.”


## Just for contains_trans (not stacked)

In [14]:
# Load the unstacked tag table and drop the `contains_alternate_universe`
# and `has_tag` columns. This replaces the stacked `data` loaded earlier.
data <- subset(
  read.csv('/data/fanfiction_ao3/tmp/tags_marriage_equality.csv',
           stringsAsFactors = TRUE),
  select = -c(contains_alternate_universe, has_tag)
)
data

Unnamed: 0_level_0,fic_id,after_event,monthly_avg_tags,fandom,dataset,contains_trans,contains_gay,contains_queerphobia,contains_wedding.marriage,contains_M.M,contains_F.F,contains_F.M,contains_Gen,contains_Multi,contains_Other,contains_wedding.marriage_M.M.F.F,contains_wedding.marriage_F.M
Unnamed: 0_level_1,<int>,<fct>,<dbl>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,4436162,True,4.681707,buffy,hi_lgbtq,False,False,False,False,False,False,True,False,False,False,False,False
2,4452977,True,4.681707,buffy,hi_lgbtq,False,False,False,False,False,True,False,False,False,False,False,False
3,3763297,False,5.721332,buffy,hi_lgbtq,False,False,False,False,False,False,True,False,False,False,False,False
4,5130194,True,5.127832,buffy,hi_lgbtq,False,False,False,False,False,False,True,False,False,False,False,False
5,4933939,True,5.420290,buffy,hi_lgbtq,False,False,False,False,False,False,False,False,False,False,False,False
6,5066692,True,5.420290,buffy,hi_lgbtq,False,False,False,False,False,False,False,False,False,False,False,False
7,3433670,False,4.961538,buffy,hi_lgbtq,False,False,False,False,False,False,True,False,False,False,False,False
8,5268290,True,5.127832,buffy,hi_lgbtq,False,False,False,False,False,False,True,False,False,False,False,False
9,2554079,False,5.060870,buffy,hi_lgbtq,False,False,False,False,False,False,False,False,False,False,False,False
10,2489366,False,5.017978,buffy,hi_lgbtq,False,False,False,False,True,False,True,False,False,False,False,False


In [15]:
# Per-column summary: quantiles for numeric columns, level counts for factors.
summary(data)

     fic_id         after_event  monthly_avg_tags         fandom    
 Min.   :  540434   False:7412   Min.   :4.583    buffy      :1330  
 1st Qu.: 3243393   True :8548   1st Qu.:4.962    dcu        :1330  
 Median : 4393356                Median :5.305    dragonage  :1330  
 Mean   : 4503542                Mean   :5.357    harrypotter:1330  
 3rd Qu.: 5642832                3rd Qu.:5.801    homestuck  :1330  
 Max.   :25511563                Max.   :6.332    jojo       :1330  
                                                  (Other)    :7980  
     dataset     contains_trans contains_gay  contains_queerphobia
 hi_lgbtq:7980   False:15744    False:15826   False:15886         
 lo_lgbtq:7980   True :  216    True :  134   True :   74         
                                                                  
                                                                  
                                                                  
                                              

In [21]:
# Binomial GLM for the `contains_trans` tag: before/after-event effect plus
# fandom nested within dataset (`dataset/fandom` = dataset + dataset:fandom).
# NOTE(review): the recorded summary reports coefficients "not defined because
# of singularities" -- presumably each fandom occurs in only one dataset, so
# many dataset:fandom columns are empty or aliased; confirm the nesting is
# needed.
logit <- glm('contains_trans ~ after_event + dataset/fandom', data = data, family = 'binomial')
summary(logit)


Call:
glm(formula = "contains_trans ~ after_event + dataset/fandom", 
    family = "binomial", data = data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.3361  -0.1930  -0.1178  -0.0849   3.7556  

Coefficients: (12 not defined because of singularities)
                                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)                         -7.43313    1.00467  -7.399 1.38e-13 ***
after_eventTrue                      0.38161    0.14505   2.631 0.008518 ** 
datasetlo_lgbtq                      2.15986    1.06130   2.035 0.041840 *  
datasethi_lgbtq:fandomdcu                 NA         NA      NA       NA    
datasetlo_lgbtq:fandomdcu            0.01943    0.48889   0.040 0.968298    
datasethi_lgbtq:fandomdragonage      3.07709    1.02383   3.005 0.002652 ** 
datasetlo_lgbtq:fandomdragonage           NA         NA      NA       NA    
datasethi_lgbtq:fandomharrypotter         NA         NA      NA       NA    
datasetlo_lgbtq:fandomharry

# Other

In [6]:
# List the factor levels of every column in `data`
# (non-factor columns yield NULL).
sapply(data, levels)