# Preamble

This code and these analyses were produced by Jason Anastasopoulos (j.andronici@gmail.com)
and Anthony Bertelli for the paper "Understanding Delegation Through Machine Learning: A Method and Application to the EU", published in the *American Political Science Review*: 


https://www.cambridge.org/core/journals/american-political-science-review/article/understanding-delegation-through-machine-learning-a-method-and-application-to-the-european-union/1724F3ECFA1F0AABE3C7F8DA5C5D521B

Here we generate predictions for parsed EU Directives and Regulations using the trained Member State (MS) classifiers.

- Citation:
*Anastasopoulos, L. Jason, and Anthony M. Bertelli. "Understanding delegation through machine learning: A method and application to the European Union." American Political Science Review 114, no. 1 (2020): 291-301.*

## Pre-processing directive data for prediction

The parsed directives and regulations are read from CSV files stored on the local machine; update the file paths below to match your own setup before running.

In [None]:
library(pacman)


# This loads and installs the packages you need at once
pacman::p_load(tm,SnowballC,foreign,plyr,twitteR,slam,foreign,wordcloud,LiblineaR,e1071, topicmodels,readr,
              caret,dplyr,xgboost,quanteda)

# ---- Load and pre-process the provision text ----
# Reads the parsed regulation and directive articles, stacks them into one
# data frame, keeps the identifying metadata, and turns the article text into
# a document-feature matrix (`dtm.predict`) used for prediction below.

# Load the regulation data
directive_provisions_regs <- "/Users/jason/Desktop/Provisions/regulation-articles.csv"
newprovisiondata_regs <- read.csv(directive_provisions_regs)

# Load the directive data
directive_provisions_dirs <- "/Users/jason/Desktop/Provisions/directive-articles.csv"
newprovisiondata_dirs <- read.csv(directive_provisions_dirs)

# Keep only the first six columns of the directive data:
# [1] "celex"    "fileyear" "day"      "month"    "year"     "article"
newprovisiondata_dirs <- newprovisiondata_dirs[, 1:6]

# Keep directives filed strictly after 1960. `which()` drops rows whose
# fileyear cannot be parsed as a number; a bare logical index containing NA
# would instead insert all-NA rows into the data frame.
dirs.fileyear <- as.numeric(as.character(newprovisiondata_dirs$fileyear))
newprovisiondata_dirs <- newprovisiondata_dirs[which(dirs.fileyear > 1960), ]

# Columns 2-5 (fileyear, day, month, year) may have been read as
# factor/character; going through as.character() converts the labels rather
# than the internal factor codes.
newprovisiondata_dirs[2:5] <- lapply(
  newprovisiondata_dirs[2:5],
  function(column) as.numeric(as.character(column))
)

# Combine regulations and directives into one data frame
newprovisiondata <- bind_rows(newprovisiondata_regs, newprovisiondata_dirs)

# Save the year, month and celex separately from the text
newprovision.year <- newprovisiondata$fileyear
newprovision.celex <- newprovisiondata$celex
newprovisiondata.month <- newprovisiondata$month

# corpus() needs a character vector; coerce in case the column is a factor.
rawtext <- as.character(newprovisiondata$article)

## Process the text data
# Build unigram + bigram tokens, strip URL/symbol tokens, then create the
# stemmed, stop-word-free document-feature matrix.
# NOTE(review): `ngrams =` in tokens() and `remove =` / `stem =` in dfm() are
# deprecated in newer quanteda releases. The calls are left unchanged so the
# features keep matching the trained classifiers - confirm against the
# installed quanteda version.
cleancorpus.predict <- corpus(rawtext)

token.dirty <- tokens(cleancorpus.predict, ngrams = 1:2)
token.clean <- tokens_select(
  token.dirty,
  c("/", "@", "\\|", "#", "http", "https", ".com", "$", " g "),
  selection = "remove"
)

dtm.predict <- dfm(
  token.clean,
  remove = stopwords("english"),
  remove_punct = TRUE,
  stem = TRUE
)

# Remove the big objects for memory concerns (only objects that exist here).
rm(newprovisiondata_regs, newprovisiondata_dirs, cleancorpus.predict, newprovisiondata)


## Predictions for constraint classifiers

In [None]:
# ---- Predictions for every constraint classifier ----
# Starts the output table with the provision metadata, then appends one
# probability column and one 0/1 prediction column per trained classifier.
final.delegation.data <- data.frame(newprovision.celex, newprovisiondata.month, newprovision.year)
n.base.cols <- ncol(final.delegation.data)

classifier.dir <- "~/Dropbox/Research/Papers/Delegation-ML-Project/Dataverse Files copy/Pipeline/Trained_Classifiers_MS"
classifier.list <- list.files(classifier.dir)

# Full paths to the saved classifiers. file.path() returns an empty vector
# when the directory is empty, so the loop below is simply skipped.
# NOTE(review): the delegation classifier loaded later in this script lives in
# this same directory, so it is presumably picked up here as well - confirm
# that is intended.
science <- file.path(classifier.dir, classifier.list)

colnames.classifiers <- character(0)

for (i in seq_along(science)) {
  # Each .RData file is expected to provide `trainX` (training matrix),
  # `xgb1` (fitted model) and `constrainttype` (label used for column names).
  load(science[i])
  print(science[i])

  # Align the prediction features with the training features. dfm_match()
  # keeps the training column order and pads features that never occur in the
  # new text with zeros; subsetting by matched names would silently drop them
  # and shift the remaining columns.
  dtm_mat_class <- dfm_match(dtm.predict, features = colnames(trainX))

  # Make predictions
  constraint.probs <- as.vector(predict(xgb1, dtm_mat_class))
  constraint.predictions <- ifelse(constraint.probs > 0.5, 1, 0)

  final.delegation.data <- data.frame(final.delegation.data, constraint.probs, constraint.predictions)
  colnames.classifiers <- c(
    colnames.classifiers,
    paste0(constrainttype, ".probs"),
    paste0(constrainttype, ".preds")
  )
}

# Rename the appended columns after their classifier; nothing to rename when
# no classifier file was found.
if (length(colnames.classifiers) > 0) {
  new.cols <- seq(n.base.cols + 1, ncol(final.delegation.data))
  names(final.delegation.data)[new.cols] <- colnames.classifiers
}


## Predictions for delegation classifier

In [None]:
##########################################################################
################################ Delegation Classifier  ##################
##########################################################################
# Loads the trained delegation classifier, scores every provision, appends
# the results and the raw article text to the output table, and writes the
# table to disk.

# The .RData file is expected to provide `trainX` (training matrix) and
# `xgb1` (fitted model).
load("~/Dropbox/Research/Papers/Delegation-ML-Project/Dataverse Files copy/Pipeline/Trained_Classifiers_MS/delegation-ms.RData")

# Align the prediction features with the training features. dfm_match() keeps
# the training column order and pads features absent from the new text with
# zeros; subsetting by matched names would silently drop them and shift the
# remaining columns.
dtm_mat_class <- dfm_match(dtm.predict, features = colnames(trainX))

# Make predictions
delegation.probs <- as.vector(predict(xgb1, dtm_mat_class))
delegation.predictions <- ifelse(delegation.probs > 0.5, 1, 0)

final.delegation.data <- data.frame(
  final.delegation.data,
  delegation.probs,
  delegation.predictions,
  rawtext
)

write.csv(
  final.delegation.data,
  "~/Dropbox/Research/Papers/Delegation-ML-Project/Dataverse Files copy/Pipeline/Disaggregated-Predictions/disaggregated-ms-predictions.csv"
)
