#### Train LSTM Classifier
This notebook trains the LSTM classifier on the main DataFrame (KKS.csv), and then on each per-justice questions DataFrame, using K-fold cross-validation. It expects an already-trained language model, created with `Create Language Model.ipynb` in this directory. Training happens in a separate .py file to conserve GPU memory.

#### Libraries, Functions

In [4]:
import pandas as pd
from sklearn.utils import shuffle
from neuralegal_kfold import *
from fastai import *

def shuffle_keep_pairs(df, random_state=None):
    """Shuffle a DataFrame while keeping the two rows of each docket together.

    Consecutive rows that share a docket are treated as one unit (the first row
    is the petitioner row, the second the respondent row). The units are
    shuffled, then written back out as two adjacent rows each, so the pair can
    never be separated by a later positional split.

    Parameters
    ----------
    df : pandas.DataFrame
        Labels in column position 0, text in column position 1 and docket in
        column position 2. Columns are read by position, so the column names
        and the row index labels do not matter. The frame is not modified.
    random_state : int, numpy RandomState/Generator or None, optional
        Forwarded to ``pandas.Series.sample``. ``None`` (the default) draws
        from numpy's global random state; pass an int for a reproducible order.

    Returns
    -------
    pandas.DataFrame
        New frame with columns 0 (int label), 1 (text), 2 (docket) and a fresh
        0..n-1 index. Rows 2j and 2j+1 always belong to the same docket.

    Raises
    ------
    ValueError
        If a row has no adjacent row with the same docket, or a label cannot be
        converted to int.
    """
    # Read the three columns by position once; avoids per-row .loc lookups and
    # the integer-key-on-labelled-Series fallback.
    labels = df.iloc[:, 0].tolist()
    texts = df.iloc[:, 1].tolist()
    dockets = df.iloc[:, 2].tolist()
    total = len(dockets)

    #Collects (petitioner position, respondent position) for every docket
    pairs = []
    pos = 0
    while pos < total:
        run_end = pos + 1
        while run_end < total and dockets[run_end] == dockets[run_end - 1]:
            run_end += 1

        if run_end - pos < 2:
            raise ValueError(
                f"docket {dockets[pos]!r} at row position {pos} has no adjacent "
                "partner row; petitioner and respondent rows must be consecutive"
            )

        # NOTE: when more than two consecutive rows share a docket, only the
        # first two are kept and the rest are dropped.
        pairs.append((pos, pos + 1))
        pos = run_end

    #Shuffles the pairs (not the individual rows)
    order = pd.Series(range(len(pairs))).sample(frac=1, random_state=random_state).tolist()

    #Re-places pet and resp text/labels on separate, adjacent lines
    data = []
    for pair_no in order:
        pet, resp = pairs[pair_no]
        docket = dockets[pet]
        data.append([int(labels[pet]), texts[pet], docket])
        data.append([int(labels[resp]), texts[resp], docket])

    return pd.DataFrame(data)

def make_kfold_sets(df):
    """Build the K testing frames and their matching training frames.

    The frame is first shuffled with ``shuffle_keep_pairs``, whose result has
    the two rows of each docket at positions 2j and 2j+1. The shuffled frame
    is then cut into ``kfolds`` consecutive test slices (``kfolds`` is the
    module-level setting), and each training frame is the shuffled frame with
    that test slice removed.

    Every test slice holds a whole number of pairs, so a docket never has one
    row in a test set and its partner row in the matching training set. Rows
    left over after the last slice are in no test set and therefore appear in
    every training set.

    Returns
    -------
    (testingsets, trainingsets) : tuple of two lists of pandas.DataFrame
        ``trainingsets[k]`` is the complement of ``testingsets[k]``.
    """
    df = shuffle_keep_pairs(df)

    # Size the folds in pairs, then convert back to rows: a plain
    # len(df)/kfolds can be odd, which would place a fold boundary between a
    # petitioner row and its respondent row.
    pairs_per_fold = (len(df) // 2) // kfolds
    chunk = 2 * pairs_per_fold #length of each test set, always even

    testingsets = [df[chunk * fold:chunk * (fold + 1)] for fold in range(kfolds)]
    trainingsets = [df.drop(testingset.index.values) for testingset in testingsets]

    return testingsets, trainingsets

#### Model Parameters

In [None]:
#Settings shared by every training run in this notebook
kfolds = 10 #Number of cross-validation folds
bs = 30 #Batch size handed to kfold()
learn_rate = 1e-2 #Starting learn rate
dropout = 0.4 #Dropout value handed to kfold()
ident = dropout #Identifier put on saved filenames; currently the dropout value

#### Load Language Model

In [None]:
#Folder holding the language model files. They are very large, so they are not
#produced here and need to be generated separately beforehand.
path = '../models'
encoder = f'{path}/lm_enc' #Location of the saved encoder, passed on to kfold()
data_lm = load_data(
    path,
    fname='lm.pkl',
)

#### Train Main Model

In [None]:
#Reads the main dataframe, splits it into folds and calls kfold() once per fold

data = '../Dataframes'
dataframe = "KKS.csv"
csv_path = f"{data}/{dataframe}"

df = pd.read_csv(csv_path, encoding = "utf-8")
#One testing set and one matching training set per fold
testingsets, trainingsets = make_kfold_sets(df)

for k, (test_fold, train_fold) in enumerate(zip(testingsets, trainingsets)):
    kfold(ident, k, dataframe, path, learn_rate, data_lm, encoder, test_fold, train_fold, dropout, bs)



#### Train Justice Models

In [None]:
#Repeats the k-fold training once per justice, each on that justice's own questions file
data = '../Dataframes'
justices = [
    "Antonin Scalia",
    "John Paul Stevens",
    "Anthony M. Kennedy",
    "David H. Souter",
    "Ruth Bader Ginsburg",
    "Stephen G. Breyer",
    "John G. Roberts, Jr.",
    "Samuel A. Alito, Jr.",
    "Sonia Sotomayor",
    "Elena Kagan",
    "Neil Gorsuch",
    "Sandra Day O'Connor",
    "William H. Rehnquist",
]

for justice in justices:
    dataframe = f"{justice}_questions.csv"
    csv_path = f"{data}/{dataframe}"

    df = pd.read_csv(csv_path, encoding = "utf-8")
    testingsets, trainingsets = make_kfold_sets(df)

    #One kfold() call per (testing set, training set) pair
    for k, (test_fold, train_fold) in enumerate(zip(testingsets, trainingsets)):
        kfold(ident, k, dataframe, path, learn_rate, data_lm, encoder, test_fold, train_fold, dropout, bs)

