In [9]:
import pandas as pd
from datetime import datetime
import sysconfig
from ipywidgets import Image, HTML, Button, IntProgress, \
    Box, HBox, VBox, GridBox, Layout, ButtonStyle, Output
from IPython.display import display, clear_output


# Location of the logo image inside the installed ``tortus`` package
# (looked up under the interpreter's ``purelib`` site-packages directory).
package_dir = sysconfig.get_paths()['purelib']
logo_path = package_dir + '/tortus/Images/tortus_logo.png'

try:
    with open(logo_path, 'rb') as image_file:
        image = image_file.read()

except OSError:
    # The logo file is missing or unreadable: fall back to a text banner.
    # Only I/O failures are handled here so that unrelated errors stay visible.
    logo = HTML("<h1 style='text-align:center'>t &nbsp; <span style=\
        'color:#36a849'>o</span> &nbsp; r &nbsp; t &nbsp; u &nbsp; s</h1>")
    welcome = HTML("<h3 style='text-align:center'>\
        easy text annotation in a Jupyter Notebook</h3>")

else:
    logo = Image(value=image, format='png', width='100%')
    welcome = HTML("<h2 style='text-align:center'>\
        easy text annotation in a Jupyter Notebook</h2>")

# Show the banner as soon as this cell runs.
display(logo, welcome)

class Tortus:
    '''Text annotation within a Jupyter Notebook

    :attr annotation_index: Position within ``subset_df`` of the record currently being annotated
    :param df: A dataframe with texts that need to be annotated
    :type df: pandas.core.frame.DataFrame
    :param text: The name of the column containing the text to be annotated
    :type text: str
    :param num_records: Number of records to annotate, defaults to 10. If fewer un-annotated
        records are available, all of them are used.
    :type num_records: int, optional

    :param id_column: The name of the column containing ID of the text - if None, ``id_column``
        is the index of ``df``, default is None
    :type id_column: str, optional
    :param annotations: The dataframe with annotations previously created in this tool.
        If None, ``annotations`` is created with columns ``id_column``, ``text``, ``label``,
        ``annotated_at``, default is None
    :type annotations: pandas.core.frame.DataFrame, optional
    :param random: Determines if records are loaded randomly or sequentially, default is True
    :type random: bool, optional
    :param labels: Annotation labels, default is ['Positive', 'Negative', 'Neutral']
    :type labels: list, optional
    '''
    annotation_index = 0


    def __init__(self, df, text, num_records=10, id_column=None, annotations=None, random=True,
                labels=None):
        '''Initializes the Tortus class.'''
        self.df = df
        self.text = text
        self.num_records = num_records
        self.id_column = id_column
        if annotations is None:
            # Without an explicit id column the ids come from the index of ``df`` and are
            # stored under the literal header 'id_column'.
            id_header = 'id_column' if id_column is None else id_column
            self.annotations = pd.DataFrame(columns=[id_header, text, 'label', 'annotated_at'])
        else:
            self.annotations = annotations.copy()
        self.random = random
        # A fresh list per instance: a list literal as the default argument would be shared
        # between every instance created without ``labels``.
        if labels is None:
            self.labels = ['Positive', 'Negative', 'Neutral']
        else:
            self.labels = list(labels)
        self.subset_df = self.create_subset_df()


    def create_subset_df(self):
        '''
        Subsets ``df`` to include only records queued for annotation.
        If ``annotations`` already exists, those records will be excluded from the annotation tool.
        :returns: A dataframe that will be used in the annotation tool.
        :rtype: pandas.core.frame.DataFrame
        :raises KeyError: If ``text`` or a non-None ``id_column`` is not a column of ``df``.
        '''
        if self.annotations.empty:
            candidates = self.df.copy()
        else:
            # Texts that already carry an annotation are not offered again.
            leave_out = self.annotations[self.text].to_list()
            candidates = self.df[~self.df[self.text].isin(leave_out)]

        if self.id_column is None:
            columns = [self.text]
        else:
            columns = [self.id_column, self.text]

        # ``DataFrame.sample`` raises when asked for more rows than exist, so never request
        # more records than remain.
        n_records = min(self.num_records, len(candidates))

        if self.random:
            return candidates.sample(n=n_records)[columns]
        return candidates[columns][:n_records]


    def create_record_id(self):
        '''Provides a record id for ``annotations``.
        :returns: A list of record ids that refer to each text in subset df created by
            :meth:`create_subset_df` method.
        :rtype: list
        '''
        if self.id_column is None:
            return self.subset_df.index.to_list()
        return self.subset_df[self.id_column].to_list()

    def make_html(self, text):
        '''Changes text to html for annotation widget user interface.

        The text is inserted as-is (no escaping), so markup contained in ``text`` is rendered.

        :param text: Text for conversion to html.
        :type text: str
        :returns: HTML snippet
        :rtype: str
        '''
        # f-string formatting also tolerates non-string cell values (e.g. NaN).
        return f'<h4>{text}</h4>'

    def _load_logo(self, width):
        '''Builds the logo widget, or a text banner when the logo file cannot be read.
        :param width: CSS width passed to the image widget.
        :type width: str
        :returns: An ``Image`` widget with the logo, or an ``HTML`` widget as fallback.
        '''
        try:
            with open(logo_path, 'rb') as image_file:
                image = image_file.read()
        except OSError:
            return HTML('<h1>t &nbsp; <span style="color:#36a849">o</span> \
            &nbsp; r &nbsp; t &nbsp; u &nbsp; s</h1>')
        return Image(value=image, format='png', width=width)

    def _record_annotation(self, label):
        '''Appends one row for the current record to ``annotations``.
        :param label: Label to store; None when the record was skipped.
        '''
        record_id = self.create_record_id()
        self.annotations.loc[len(self.annotations)] = [
            record_id[self.annotation_index],
            self.subset_df[self.text].iloc[self.annotation_index],
            label,
            datetime.now().replace(microsecond=0)
        ]

    def annotate(self):
        '''Displays texts to be annotated in a UI. Loads user inputted labels and timestamps into
            ``annotations`` dataframe.

        If ``subset_df`` holds no record at ``annotation_index`` (for example when every text
        has already been annotated), only the logo and a short notice are displayed.
        '''
        logo = self._load_logo(width='40%')

        total = len(self.subset_df)
        if self.annotation_index >= total:
            display(logo, HTML('No records left to annotate.'))
            return

        rules = HTML(
            'Click on the label corresponding with the text below. Each selection requires \
                confirmation before proceeding to the next item.')
        annotation_text = self.subset_df[self.text].iloc[self.annotation_index]
        text = HTML(self.make_html(annotation_text))

        def make_button(description, **layout_extra):
            '''Creates a button with the shared look used by every button in the tool.'''
            return Button(
                description=description,
                layout=Layout(border='solid', flex='1 1 auto', width='auto', **layout_extra),
                style=ButtonStyle(button_color='#eeeeee', font_weight='bold'))

        label_buttons = [make_button(label) for label in self.labels]
        skip_button = make_button('Skip')
        confirm_button = make_button('Confirm selection', grid_area='confirm')
        redo_button = make_button('Try again', grid_area='redo')

        # Progress is measured against the records actually queued, which can be fewer than
        # ``num_records`` when the dataframe runs out of un-annotated texts.
        progress_bar = IntProgress(
                value=self.annotation_index,
                min=0,
                max=total,
                step=1,
                description=f'{self.annotation_index + 1}/{total}',
                bar_style='',
                orientation='horizontal',
                layout=Layout(width='50%', align_self='flex-end'))
        progress_bar.style.bar_color = '#36a849'

        header = HBox([logo, progress_bar])

        box_layout = Layout(
            display='flex',
            flex_flow='wrap',
            align_items='stretch',
            width='100%'
        )

        box_sentiment = Box(label_buttons + [skip_button], layout=box_layout)
        box_confirm = Box([confirm_button, redo_button], layout=box_layout)

        all_buttons = VBox(
            [box_sentiment, box_confirm],
            layout=Layout(width='auto', grid_area='all_buttons')
        )

        ui = GridBox(
            children=[all_buttons],
            layout=Layout(
                width='100%',
                grid_template_rows='auto auto',
                grid_template_columns='15% 70% 15%',
                grid_template_areas='''
                ". all_buttons ."
                ''')
        )

        output = Output()

        display(header, rules, text, ui, output)
        # Confirm / redo only become visible once a choice has been made.
        confirm_button.layout.visibility = 'hidden'
        redo_button.layout.visibility = 'hidden'


        def set_confirmation_visibility(visibility):
            '''Clears the output area and shows or hides the confirm / redo buttons.'''
            with output:
                clear_output(True)
                confirm_button.layout.visibility = visibility
                redo_button.layout.visibility = visibility


        def label_buttons_clicked(button):
            '''Response to button click of any sentiment buttons.

            Appends ``annotations`` with label selection.
            :param button: Label buttons click.
            '''
            button.style.button_color = '#36a849'
            self._record_annotation(str(button.description).lower())

            # Lock every choice; only the clicked button keeps its border.
            for label_button in label_buttons:
                label_button.disabled = True
                if button != label_button:
                    label_button.layout.border = 'None'

            skip_button.disabled = True
            skip_button.layout.border = 'None'

            set_confirmation_visibility('visible')

        for label_button in label_buttons:
            label_button.on_click(label_buttons_clicked)


        def skip_button_clicked(button):
            '''Response to button click of the skip button.
            Appends ``annotations``. Label value is ``Null``.

            :param button: Skip button click.
            '''
            button.style.button_color = '#36a849'
            self._record_annotation(None)

            for label_button in label_buttons:
                label_button.disabled = True
                label_button.layout.border = 'None'

            skip_button.disabled = True

            set_confirmation_visibility('visible')

        skip_button.on_click(skip_button_clicked)


        def confirm_button_clicked(button):
            '''Response to click of the confirm button.
            Advances the ``annotation_index`` to view the next item in the annotation tool.
                Indicates the tool is done if ``annotation_index`` does not advance further.

            :param button: Confirmation button click.
            '''
            if self.annotation_index < total - 1:
                self.annotation_index += 1
                clear_output(True)
                # Rebuild the whole UI for the next record.
                self.annotate()
            else:
                clear_output(True)
                progress_bar.value = total
                progress_bar.description = 'Complete'
                display(header, output)

        confirm_button.on_click(confirm_button_clicked)


        def redo_button_clicked(button):
            '''Response to click of the redo button.
            Deletes the most recent input to ``annotations``.

            :param button: Redo button click.
            '''
            self.annotations = self.annotations.head(-1)

            # Restore every choice button to its initial, clickable state.
            for choice in label_buttons + [skip_button]:
                choice.style.button_color = '#eeeeee'
                choice.disabled = False
                choice.layout.border = 'solid'

            set_confirmation_visibility('hidden')

        redo_button.on_click(redo_button_clicked)

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00#0\x00\x00\x06\xc4\x08\x06\x00\x00\x00\xa5\xaf~d\x00\…

HTML(value="<h2 style='text-align:center'>        easy text annotation in a Jupyter Notebook</h2>")

In [10]:
import pandas as pd
import os

In [17]:
def annotate_tweets(path, text_column = "cleaned_content", num_records=100, prev_annotations = None, additional_labels = None):
    """Start a Tortus annotation session for the tweets stored in a CSV file.

    The CSV must contain a ``rawContent`` column and the column named by ``text_column``.
    Both are combined into an ``annotate_text`` column so the annotator sees the raw and the
    cleaned version of each tweet side by side.

    Annotations are stored under a mirror of the input path whose first directory component
    is replaced by ``annotations``. If a file already exists at that location it is loaded
    and used as the previous annotations, replacing the ``prev_annotations`` argument.

    :param path: Path of the CSV file with the tweets, using ``/`` as separator.
    :param text_column: Name of the column holding the cleaned tweet text.
    :param num_records: Number of records to annotate in this session.
    :param prev_annotations: Annotations from an earlier session, or None.
    :param additional_labels: Extra labels appended to the built-in label set, or None.
    :returns: Tuple ``(tortus, output_path)`` with the running Tortus instance and the path
        the annotations should be saved to.
    """
    df = pd.read_csv(path)
    # A missing value in either column would turn the whole combined string into NaN,
    # which cannot be rendered as HTML; treat missing text as empty instead.
    raw_text = df["rawContent"].fillna("").astype(str)
    clean_text = df[text_column].fillna("").astype(str)
    df["annotate_text"] = "<b>Raw:</b> " + raw_text + "<br><b>Clean</b>: " + clean_text

    # Mirror the input directory tree under 'annotations/'.
    dir_parts = os.path.dirname(path).split('/')
    dir_parts[0] = 'annotations'
    outdirs = '/'.join(dir_parts)
    os.makedirs(outdirs, exist_ok = True)
    output_path = os.path.join(outdirs, os.path.basename(path))

    if os.path.exists(output_path):
        print(f"Annotations already exist for: {output_path}, adding to these annotations")
        # The first CSV column is the saved dataframe index; selecting it by position works
        # whether or not that index had a name when it was written.
        prev_annotations = pd.read_csv(output_path, index_col = 0)

    labels = [
        "full_standard_english",
        "not-syntactic_standard_english",
        "non_standard_english",
        "code-switched",
        "some_english",
        "not_english",
    ]
    if additional_labels:
        labels = labels + list(additional_labels)

    tortus = Tortus(df, "annotate_text", num_records=num_records, annotations=prev_annotations, labels=labels)
    tortus.annotate()
    return tortus, output_path

def save_annotations(tortus, output_path):
    """Write the annotations collected by ``tortus`` to ``output_path`` as CSV.

    The dataframe index is written as the first column.

    :param tortus: Object exposing an ``annotations`` dataframe.
    :param output_path: Destination path of the CSV file.
    """
    collected = tortus.annotations
    collected.to_csv(output_path)


In [18]:
# Start an annotation session of 10 records for `0.7_to_0.8_english_words.csv`.
# Keeps the Tortus instance and the CSV path that a later save_annotations() call writes to.
tortus, output_path = annotate_tweets("data/Singapore/GenerateMostRecentTweets/3000_most_recent_tweets/0.7_to_0.8_english_words.csv", num_records = 10)

HBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00#0\x00\x00\x06\xc4\x08\x06\x00\x00\x00…

HTML(value='Click on the label corresponding with the text below. Each selection requires                 conf…

HTML(value='<h4><b>Raw:</b> his next kdrama when 😩<br><b>Clean</b>: his next kdrama when</h4>')

GridBox(children=(VBox(children=(Box(children=(Button(description='full_standard_english', layout=Layout(borde…

Output()

In [15]:
# Write the annotations collected so far to `output_path`, then show them as the cell output.
save_annotations(tortus, output_path)
tortus.annotations

Unnamed: 0,id_column,cleaned_content,label,annotated_at
0,183,Raw: @GeorgeInTheMeta @GuildOfGuardian Get tha...,not_standard_english,2022-10-27 15:13:31
1,73,Raw: #Splatoon3 #NintendoSwitch\nim going to g...,standard_english,2022-10-27 15:13:40
2,43,Raw: Completed reading the 2 books by Dr Pornt...,standard_english,2022-10-27 15:13:48
3,101,Raw: Past and future mistake\nLol\nClean: past...,not_standard_english,2022-10-27 15:14:01
4,88,Raw: ZIM Integrated Shipping Services Ltd. Ran...,standard_english,2022-10-27 15:15:02
5,114,Raw: An uplifting day both to be back visiting...,standard_english,2022-10-27 15:15:10
6,205,Raw: Idky suddenly theres so many ulcers on my...,not_standard_english,2022-10-27 15:15:17
7,27,Raw: @rkirubi Thank you so much. Enjoyed that ...,not_standard_english,2022-10-27 15:15:36
8,18,Raw: @alanrslee ada specific set of reasoning ...,code-switched,2022-10-27 15:16:10
9,90,Raw: @LukeGromen Commodities &gt; fiat currenc...,not_standard_english,2022-10-27 15:17:19


In [76]:
# Inspect the annotations dataframe currently held by the Tortus instance.
tortus.annotations

Unnamed: 0_level_0,Unnamed: 0,cleaned_content,label,annotated_at
id_column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
68,0,alles autos credit ad 2 colour copyrighted amp...,standard_english,2022-10-03 17:02:22
1,1,paling best texture dia cair macam toner tapi ...,not_english,2022-10-03 17:02:25
26,2,tak gak amik yang kiwi passionfruit guava tu s...,not_english,2022-10-03 17:02:26
13,40,poling jebakan yang paling gak pantas katanya,not_english,2022-10-03 17:03:08
61,4,hmm nike tak nk supply boot kat aliksan dgn sp...,not_english,2022-10-03 17:02:30
48,5,arghh nak nangis nak top 5 dapat top 4 jomm ki...,not_english,2022-10-03 17:02:32
29,6,sb se pehly tmhain doob k mar jana chaiye bc,not_english,2022-10-03 17:02:34
36,7,hyderabadi humour uno cycle chalate chalate pa...,not_english,2022-10-03 17:02:36
60,8,dipimpin nagita slavina inilah profil es teh i...,not_english,2022-10-03 17:02:38
46,9,apa yg salah ku pun tak tau,not_english,2022-10-03 17:02:40


In [3]:
import os


'data/Singapore/GenerateMostRecentTweets/3000_most_recent_tweets/0.7_to_0.8_english_words_annotations.csv'

In [29]:
# Scratch work for the output-path logic: split the input file's directory into its
# components and keep the file name separately.
temp_path_list = os.path.dirname('data/Singapore/GenerateMostRecentTweets/3000_most_recent_tweets/0.7_to_0.8_english_words.csv').split('/')
basename = os.path.basename('data/Singapore/GenerateMostRecentTweets/3000_most_recent_tweets/0.7_to_0.8_english_words.csv')

In [26]:
# Replace the first directory component with 'annotations' (mutates the list in place,
# so this cell needs `temp_path_list` to have been created first), then display it.
temp_path_list[0] = 'annotations'
temp_path_list


['annotations',
 'Singapore',
 'GenerateMostRecentTweets',
 '3000_most_recent_tweets']

In [28]:
# Re-assemble the mirrored directory path and create it; exist_ok avoids an error
# when the directories are already present.
outdirs = '/'.join(temp_path_list)
os.makedirs(outdirs, exist_ok = True)

In [30]:
# Full path of the annotations file: mirrored directory plus the original file name.
os.path.join(outdirs, basename)

'annotations/Singapore/GenerateMostRecentTweets/3000_most_recent_tweets/0.7_to_0.8_english_words.csv'

In [7]:
# Scratch trial of an alternative labelling widget.
# NOTE(review): `ipyannotations` is not imported in any cell visible here - confirm it is
# imported elsewhere, otherwise this cell raises NameError on a fresh kernel.
# NOTE(review): `tests` and `labels` are not used by the rest of this cell.
tests = ['try', 'not_try', 'etc']

labels = []
widget = ipyannotations.text.ClassLabeller(options=['spam', 'not spam'], allow_freetext=False)
widget.display(
    "Greetings! Your esteemed research would be suitable \nfor publication in our scientific journal.")
