# Using the DocuSco-Bert Tagger

Analyze a corpus of student essays (sentences or paragraphs) using David Brown's DocuSco-BERT tagger. 
See documentation here: https://huggingface.co/browndw/docusco-bert?text=My+name+is+Clara+and+I+live+in+Berkeley%2C+California. 

Files needed: 
* CSV containing student essays and score data (one essay and score per row)

Limits: 
* Accepts documents with a maximum length of 512 tokens (info on max length and truncation [here](https://towardsdatascience.com/how-to-apply-transformers-to-any-length-of-text-a5601410af7f) and [here](https://stackoverflow.com/questions/65246703/how-does-max-length-padding-and-truncation-arguments-work-in-huggingface-bertt)).
* Performs WordPiece tokenization, which splits some words into pieces and labels each piece with a LAT, so the tags for a split word are redundant. The current workaround is to remove the repeated pieces manually. The issue is discussed [here](https://stackoverflow.com/questions/62082938/how-to-stop-bert-from-breaking-apart-specific-words-into-word-piece).


## Setup

In [None]:
#Load packages
import glob
import os
import pandas as pd
import re

In [None]:
# Set working directory
# os.chdir() returns None, so store the directory string in `path` first and
# then change into it; this keeps `path` usable as an actual path afterwards.
path = "/Users/megankane/Desktop/"
os.chdir(path)

In [None]:
# Load the DocuScope tokenizer and token-classification model, then wrap them
# in a tagging pipeline.
# Model card: https://huggingface.co/browndw/docusco-bert
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

MODEL_NAME = "browndw/docusco-bert"

# NOTE(review): truncation/max_length are supplied at load time here; confirm
# that the pipeline really truncates inputs longer than 512 with these settings.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, truncation=True, max_length=512)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
nlp = pipeline(task="ner", model=model, tokenizer=tokenizer)

## Run DocuSco-BERT tagger on example sentence

In [None]:
# Example sentence to tag
example = (
    "Globalization is the process of interaction and integration among people, companies, and governments worldwide."
)

# Tag the example with the DocuSco-BERT pipeline and show the raw output
ds_results = nlp(example)
print(ds_results)

In [None]:
# Turn the tagger output for the example sentence into a dataframe and display it
lat_df = pd.DataFrame(data=ds_results)
lat_df

In [None]:
# Drop word-piece continuation tokens (any "word" containing "##"): each piece
# gets its own LAT, so they only repeat the tag of the word they belong to.
# regex=False matches "##" literally; .copy() gives an independent frame so the
# column assignment below does not raise SettingWithCopyWarning.
lat_df = lat_df[~lat_df["word"].str.contains("##", regex=False)].copy()

# Remove the B-/I- marker from each label.
# (Original note: data were split into chunks that don't split B + I sequences
# and end with sentence-final punctuation marks, i.e., period, question mark or
# exclamation point.)
# The pattern is anchored to the start of the label so that only the prefix is
# removed, never a "B-" or "I-" that happens to occur inside a label.
lat_df["entity"] = lat_df["entity"].str.replace(r"^[BI]-", "", regex=True)

# Now we have a workable dataframe of LATs
lat_df

In [None]:
# Save the example dataframe of LATs as a CSV file in the working directory
lat_df.to_csv(path_or_buf='lat_df.csv')

## Run DocuSco-BERT on corpus of student essays

In [None]:
# Read the student essay paragraphs, using the first CSV column as the index,
# then replace that index with a fresh 0..n-1 range (the old index is dropped).
df = pd.read_csv(r'rhetorical_sentences.csv', index_col=0).reset_index(drop=True)
df

In [None]:
# One tagged dataframe per document, in the same order as the rows of df
list_of_dfs = []

# Run DocuSco-BERT on each text in the dataframe
for doc in df['Text']:
    result = nlp(doc)
    if result:
        # Dataframe holding the tags of this document
        lat_df = pd.DataFrame(result)
        # Remove redundant LATs from split words (word pieces containing "##")
        lat_df = lat_df[~lat_df["word"].str.contains("##", regex=False)]
    else:
        # The tagger returned nothing: pd.DataFrame([]) has no "word" column,
        # so the filter above would raise KeyError. Keep an empty placeholder
        # instead so list_of_dfs stays aligned with the rows of df.
        lat_df = pd.DataFrame(columns=["entity", "word"])
    list_of_dfs.append(lat_df)
    # Print the number of documents processed so far (to track progress)
    print(len(list_of_dfs))

In [None]:
# Spot-check one of the tagged dataframes (index 1, i.e. the second document)
list_of_dfs[1]

In [None]:
# Concatenate the per-document dataframes into one dataframe.
# Each document keeps its own row labels, so index values repeat across documents.
lat_dfs = pd.concat(list_of_dfs)

# Remove the B-/I- marker from each label.
# (Original note: data were split into chunks that don't split B + I sequences
# and end with sentence-final punctuation marks, i.e., period, question mark or
# exclamation point.)
# The labels are read from lat_dfs itself. Reading them from lat_df (the frame
# of the last document only) would overwrite the column with index-aligned
# values from that one document.
lat_dfs['entity'] = lat_dfs['entity'].str.replace(r'^[BI]-', '', regex=True)
lat_dfs

In [None]:
# Save the concatenated dataframe of LATs as a CSV file in the working directory
lat_dfs.to_csv(path_or_buf='filename.csv')

In [None]:
# Build one space-separated string of LAT labels per document

# One string per tagged dataframe, in the same order as list_of_dfs
results_list = []

# The loop variable is deliberately NOT called `df`: that name holds the essay
# dataframe, and rebinding it here would leave `df` pointing at the last tagged
# dataframe once the loop finishes.
for tagged_df in list_of_dfs:
    labels = tagged_df['entity'].to_list()
    # Missing labels become empty strings. Testing each value with pd.isna
    # avoids deleting the letters "nan" from inside a real label, which a
    # plain str.replace('nan', '') on the joined text would do.
    result = ' '.join('' if pd.isna(item) else str(item) for item in labels)
    results_list.append(result)
    

In [None]:
# Filenames/scores column.
# NOTE: `df` must still be the essay dataframe read from the CSV at this point.
scores = df['Score_ID_Sentence']

# One row per LAT string (column 0), with the scores/filenames attached as a
# 'Scores' column; the values are matched to rows by index label.
result_df = pd.DataFrame(results_list).assign(Scores=scores)

# Display the result
result_df

In [None]:
# Reorder the columns and give the LAT column a readable name.
# rename() returns a new dataframe with a properly rebuilt column index;
# writing into columns.values in place is not a supported way to rename and may
# leave 'LAT Strings' unresolvable in the lookup below.
final_lat_df = result_df[['Scores', 0]].rename(columns={0: "LAT Strings"})

# Remove the B-/I- marker from each label in the strings.
# (Original note: data were split into chunks that don't split B + I sequences
# and end with sentence-final punctuation marks, i.e., period, question mark or
# exclamation point.)
# Labels are separated by spaces, so a marker is only removed at the start of
# the string or directly after whitespace, never from inside a label.
final_lat_df['LAT Strings'] = final_lat_df['LAT Strings'].str.replace(
    r'(?<!\S)[BI]-', '', regex=True
)

# Check DF
final_lat_df

In [None]:
# Save the score / LAT-string table as a CSV file in the working directory
final_lat_df.to_csv(path_or_buf='citation_LATs_strings.csv')

In [None]:
# Write each row's LAT string to its own text file, named with the score and ID number

# File contents and file names, taken row by row from the final dataframe
texts = [str(value) for value in final_lat_df['LAT Strings']]
filenames = [str(value) for value in final_lat_df['Scores']]

# Make the output directory. os.makedirs works outside IPython too and, with
# exist_ok=True, does not fail when the directory is already there (re-runs).
os.makedirs("lat_strings", exist_ok=True)

# Write the texts to files; `with` closes each file even if a write fails
for filename, text in zip(filenames, texts):
    with open(os.path.join("lat_strings", filename + '.txt'), 'w', encoding="utf-8") as f:
        f.write(text)
  