# Drop Junk Phrases

Last updated: 18 June 2024

This Notebook drops phrases from the results output in phase 02 that the matcher detected as quotations but that are not in fact quotations. Examples of such phrases include:

- coincidental repetition of common words (e.g. "the question is whether")
- multi-word idioms (e.g. "at the end of the day")
- multi-word names, e.g.
    - place names: "Place de la Concorde"
    - names of people: "José Ortega y Gasset"
    - names of books: "Discipline and Punish: The Birth of the Prison"
    - names of publishers: "Johns Hopkins University"

This Notebook guides the user through identifying what counts as junk phrases and removing them from the full results JSONL file.

The Notebook saves a list of dropped junk phrases, in case the user needs to repeat the process, check what was dropped, or add additional junk phrases to remove later.

The Notebook prioritizes the most frequently repeating junk phrases, since these are likely to skew the results most intensely. Low-frequency junk phrases are probably not worth removing one by one.

In [1]:
# import libraries needed

import sys

import pandas as pd
import numpy as np

try:
    import re
except:
    !{sys.executable} -m pip install re
    import re


#try:
#    import json
#except:
#    !{sys.executable} -m pip install json
#    import json

try:
    import ipywidgets as widgets
except:
    !{sys.executable} -m pip install ipywidgets
    import ipywidgets as widgets


try:
    from ipywidgets import Label
except:
    !{sys.executable} -m pip ipywidgets
    from ipywidgets import Label
    from iwidgets import widgets


try:
    from pathlib import Path
except:
    !{sys.executable} -m pip install pathlib
    from pathlib import Path

try:
    from IPython.display import display
except:
    !{sys.executable} -m pip install IPython.display
    from IPython.display import display


import os

try:
    import copy
except:
    !{sys.executable} -m pip copy
    import copy


try:
    import ast
except:
    !{sys.executable} -m pip ast
    import ast


try:
    import csv
except:
    !{sys.executable} -m pip csv
    import csv

# 🚨 Adapt these library imports to make clear the distinction between tkinter and ttk.
# 🚨 Maybe rewrite all notebook code to remove "tk" abbreviation?

try:
    import tkinter as tk
except:
    !{sys.executable} -m pip tkinter
    import tkinter as tk
    from tkinter import ttk

from tkinter import ttk

In [2]:
# Defines a function that returns the path to a folder selected using the folder picker, a system folder navigation dialog

from tkinter import filedialog

def open_folder_dialog():
    """Show the system folder picker and return the folder the user selected.

    Returns:
        Whatever ``filedialog.askdirectory()`` returned.
        NOTE(review): on cancel this is presumably an empty string rather
        than None — confirm against the tkinter documentation.
    """
    root = tk.Tk()
    root.withdraw()  # Hide the main window
    try:
        folder_selected = filedialog.askdirectory()  # Open the folder dialog
    finally:
        # Without this every call would leave another hidden Tk root alive.
        root.destroy()
    print(f'Folder Selected: {folder_selected}')
    return folder_selected

In [4]:
# Defines a function that opens (or creates) a CSV file containing user data in the current working directory

def write_user_data_to_csv_file(data_dir, author_name, project_name):
    """Save the three user settings as one row of ``user_data.csv``.

    The file is written in the current working directory and replaces any
    earlier version.
    """
    record = [data_dir, author_name, project_name]
    with open('user_data.csv', 'w', newline='') as handle:
        csv.writer(handle).writerow(record)

# 18 June 2024: Paul to resume cell-by-cell tidying and explanation from here

In [6]:
def read_user_data_from_csv_file():
    """Return the first row of ``user_data.csv`` from the working directory.

    Returns:
        The first row as a list of strings, or None when the file does not
        exist or contains no rows.
    """
    if not os.path.exists('user_data.csv'):
        return None
    with open('user_data.csv', 'r') as handle:
        # Only the first row is of interest; None when there is none.
        return next(csv.reader(handle), None)

# Use the function
#write_user_data_to_csv_file('data_dir', 'author_name', 'project_name')
#    with open('output.csv', 'r') as file:
#        reader = csv.reader(file)

# Use the function
# Load the saved user settings, if any, and unpack them into notebook globals
row = read_user_data_from_csv_file()
if row is not None:
    # the row holds data_dir, author_name and project_name, in that order
    data_dir=row[0]
    author_name=row[1]
    project_name=row[2]

    print(row)
else:
    # user_data.csv is missing or has no rows
    print('No matching data found.')

No matching data found.


In [None]:

# ACTION: copy path to data directory 


# Re-read the saved user settings; row is None when nothing could be read
row = read_user_data_from_csv_file()


#🚨  default in developer stage:  path_data_dir=  r"C:\Users\bdt\Documents\Data"


In [None]:

# Use the saved settings when there are any, otherwise the developer defaults.
if row is None:
    path_data_dir = r"C:\Users\bdt\Documents\Data"
    author_name = "Joyce"
    project_name = "1922_Ulysses"
else:
    path_data_dir = row[0]
    author_name = row[1]
    project_name = row[2]

# Path object for the same directory, used where a pathlib.Path is needed.
path_data_dir2 = Path(path_data_dir)


In [None]:

# Create a text widget for the path input, pre-filled with the current data directory
path_data_dir_input = widgets.Textarea(
    # 🚨has to be removed 
    value=path_data_dir,
    placeholder="Paste the path here",
    description="Path:",
    rows=6,
    width= 30)
# Add the instruction line above the input field
path_data_dir_instruction_line = widgets.Label("Paste the path below:")

# Read the value straight back from the widget and rebuild the Path object from it
path_data_dir=path_data_dir_input.value
path_data_dir2 =Path(path_data_dir)

# Create the widgets for the folder-dialog route: a label, a text field and a Browse button
folder_path_label = widgets.Label(value="Find data dir folder path by folder dialog:")
folder_path_text = widgets.Text(value= path_data_dir, placeholder="Select a folder...")
folder_path_button = widgets.Button(description="Browse")

# Define the event handler for the button click
def browse_button_clicked(button):
    """Open the folder picker and copy the chosen folder into both path fields.

    Args:
        button: the clicked widget, passed in by ``on_click``; not used.
    """
    folder_path = open_folder_dialog()
    # A cancelled dialog yields an empty value, not None; only a real
    # selection may overwrite what is already in the fields.
    if folder_path:
        folder_path_text.value = folder_path
        path_data_dir_input.value = folder_path

# Attach the event handler to the button click event
folder_path_button.on_click(browse_button_clicked)


# NOTE(review): this panel is built but not displayed anywhere in this cell
folder_path_dialog_panel= widgets.VBox([folder_path_label, widgets.VBox([folder_path_text, folder_path_button])])


# Create the panel layout: the label on top, the text field and Browse button side by side below it
panel_layout = widgets.VBox([

    
    folder_path_label,
    widgets.HBox([folder_path_text, folder_path_button])
])

# Display the panel
display(panel_layout)

# Create a VBox layout with the path_input widget
instruction_data_dir_line= widgets.Label("Or paste the path below:")

panel_data_dir_layout = widgets.VBox([path_data_dir_input])

# Create a button widget for the commit action, with a status label next to it
commit_data_dir_button = widgets.Button(description="Confirm")
text_data_dir_label = widgets.Label(value="")
commit_data_dir_box= widgets.HBox([commit_data_dir_button, text_data_dir_label])

# Final content of the paste panel: instruction line, input field, Confirm button with status label
panel_data_dir_layout.children = (instruction_data_dir_line, path_data_dir_input, commit_data_dir_box)

# Define the event handler for the commit button
# Update the commit_button_clicked function


def input_field_changed(change):
    """Give live feedback on whether the path in the input field exists.

    Args:
        change: the change notification passed by ``observe``; only its
            ``'new'`` entry (the new field value) is read.
    """
    new_path = change['new']

    # Same clean-up as the Confirm handler: double the backslashes and drop
    # quote characters that were pasted along with the path.
    new_path = new_path.replace("\\", "\\\\")
    new_path = new_path.replace("'", "")
    new_path = new_path.replace('"', '')

    if Path(new_path).exists():
        instruction_data_dir_line.value = "Paste the path below:"
        text_data_dir_label.value = 'This path exists'
        commit_data_dir_button.layout.visibility = 'visible'
        commit_data_dir_button.description = 'Confirm'
    else:
        # Hide the Confirm button so a non-existent path cannot be committed.
        commit_data_dir_button.layout.visibility = 'hidden'
        instruction_data_dir_line.value = "Please try again. Paste the path below"
        text_data_dir_label.value = 'This path does not exist'
    
# Attach the event handler to the value change event of the path input field,
# so that input_field_changed runs each time its 'value' changes
path_data_dir_input.observe(input_field_changed, names='value')

def commit_data_dir_button_clicked(button):
    """Adopt the path in the input field as the data directory if it exists.

    On success the globals ``path_data_dir`` (string) and ``path_data_dir2``
    (Path) are replaced and the button reads 'Confirmed'; otherwise only the
    instruction and status labels change.

    Args:
        button: the clicked widget, passed in by ``on_click``; not used.
    """
    global path_data_dir, path_data_dir2

    candidate = str(path_data_dir_input.value).replace("\\", "\\\\")
    for quote_char in ("'", '"'):
        candidate = candidate.replace(quote_char, '')

    if not Path(candidate).exists():
        instruction_data_dir_line.value = "Please try again. Paste the path below"
        text_data_dir_label.value = 'This path does not exist'
        return

    instruction_data_dir_line.value = "Paste the data root path below:"
    path_data_dir = candidate
    path_data_dir2 = Path(candidate)
    commit_data_dir_button.description = 'Confirmed'
    text_data_dir_label.value = 'This path exists'

# Attach the event handler to the commit button
commit_data_dir_button.on_click(commit_data_dir_button_clicked)
# Display the panel that holds the instruction line, the input field and the Confirm button
display(panel_data_dir_layout)


In [None]:
# A quotation is an object with these attributes:
# location: a tuple of begin position and end position in the source text
# string: the actual phrase in the source A text defined by the location
# numMatches: the count of quotations in the source B corpus (the quoting journals)
# junk: a boolean value, set to True when the user considers the phrase to be junk
# index: the index in the list of quotations
# extra: a spare attribute for future use
# The quotation class contains the attributes of one quotation object.
# The class is used to create a list of quotation objects, which is used to display the data in the widgets.

class quotation:
    """A single candidate quotation taken from the source text."""

    def __init__(self, string, loc):
        """Store the phrase and its location; all other fields start neutral.

        Args:
            string: the phrase itself.
            loc: the location of the phrase in the source text.
        """
        # no separate self.text attribute is kept on the quotation
        self.location, self.string = loc, string
        # match count and the user's junk verdict
        self.numMatches, self.junk = 0, False
        # position in the list of quotations, plus a spare flag for future use
        self.index, self.extra = 0, False

    
        
    

In [None]:
class quotations:
    """Builds the list of unique quotations for a book project.

    Every distinct location found in the 'Locations in A' column becomes one
    ``quotation`` object whose ``numMatches`` counts how often that location
    occurs in the column.
    """

    def __init__(self, book_project):
        """Read text and locations from the project and build the list.

        Args:
            book_project: object exposing ``text`` (the source text) and
                ``df`` (a table with a 'Locations in A' column).
        """
        self.text = book_project.text
        self.unique_quotations_list = None
        self.locationsInA = book_project.df['Locations in A']

        print(len(self.locationsInA))
        self.unique_quotations_list = self.make_unique_quotations_list()
        print(f"len unique_quotations_list : {len(self.unique_quotations_list)}")

    def make_unique_quotations_list(self):
        """Return one ``quotation`` per distinct location, in sorted order.

        Also stores the sorted, flattened locations in
        ``self.sorted_locations``. Returns an empty list when there are no
        locations at all.
        """
        from itertools import groupby

        non_empty_locations = [loc for loc in self.locationsInA if loc != []]
        # Flatten: every item is presumably a list of [begin, end] locations
        # — TODO confirm against the phase 02 output.
        flattened_locations = [item for sublist in non_empty_locations for item in sublist]
        sorted_locations = sorted(flattened_locations)
        self.sorted_locations = sorted_locations

        unique_quotations_list = []
        # Equal locations are adjacent after sorting, so each group is one
        # unique quotation and the group size is its number of matches.
        # Iterating the groups also covers the last location, which a
        # compare-with-previous loop would leave out.
        for index, (loc, group) in enumerate(groupby(sorted_locations)):
            # The slice includes the character at the end position.
            string = self.text[loc[0]:loc[1] + 1]
            new_quotation = quotation(string, loc)
            new_quotation.index = index
            new_quotation.numMatches = sum(1 for _ in group)
            unique_quotations_list.append(new_quotation)

        return unique_quotations_list

    def add_unique_quotations_list(self):
        """Placeholder; not implemented."""
        pass

 

In [None]:

# The class 'Book_Project' contains all functionality to create a
# unique quotations list and user-filtered versions of that list,
# to set or get user settings for user sessions while working on a book project,
# and to read and write these settings from and to CSV files.

# The Book_Project class defines data such as project dirs and short filenames, and makes the project data,
# facilitating the use of the project data in phases 2 and 3.

# define project dir, short filename, make project data

class Book_Project:
    """Paths, data and junk-phrase bookkeeping for one book project.

    A project lives in ``<data_dir>/<author_name>/<pub_book_name>`` and has
    the sub-directories ``SourceText``, ``TargetCorpus`` and ``Results``.
    """

    def __init__(self, data_dir, author_name, pub_book_name):
        """Derive all project paths and create the project sub-directories.

        Args:
            data_dir: root directory holding the data of all book projects.
            author_name: name of the author (first directory level).
            pub_book_name: publication year and name of the book (second
                directory level).
        """
        self.pub_book_name = pub_book_name
        self.author_name = author_name
        # Author name plus pub_book_name; prefix of the project's file names.
        self.project_name = f"{author_name}_{pub_book_name}"
        self.data_dir = Path(data_dir)

        # The project directories.
        self.project_dir = self.data_dir / author_name / pub_book_name
        self.source_dir = self.project_dir / 'SourceText'
        self.corpus_dir = self.project_dir / 'TargetCorpus'
        self.results_dir = self.project_dir / 'Results'
        self.make_project_dirs()

        # The hyperparameter suffix is part of the results file names.
        self.hyperparsuffix = self.make_hyperparsuffix()
        self.path_plain_text = self.source_dir / f"{self.project_name}_plaintext.txt"
        self.path_JSONL = self.results_dir / f"{self.project_name}_results_{self.hyperparsuffix}.jsonl"
        self.path_JSONL_new = self.results_dir / f"{self.project_name}_results_{self.hyperparsuffix}_new.jsonl"

        self.text = None                    # source text, see read_sourceA
        self.df = None                      # original results table, see make_df
        self.new_df = None                  # working copy that gets filtered
        self.unique_quotations_list = None  # unique quotations, ordered by location
        self.junk_phrases = []              # strings of the quotations marked as junk

        # Report any project directory that is still missing.
        self.all_project_dirs_exist()

    def make_new_df(self):
        """Set ``new_df`` to an independent copy of ``df`` (None if df is None)."""
        self.new_df = None if self.df is None else self.df.copy()

    def update_unique_quotations_list(self, new_unique_quotations_list):
        """Replace the list of unique quotations."""
        self.unique_quotations_list = new_unique_quotations_list

    def read_sourceA(self):
        """Read the plain-text source file into ``self.text`` and return it."""
        with open(self.path_plain_text, encoding='utf-8') as f:
            self.text = f.read()
        return self.text

    def make_df(self):
        """Load the results JSONL file into ``self.df``.

        Returns:
            The dataframe, or None (after printing a message) when the file
            does not exist; ``self.df`` is left untouched in that case.
        """
        path = self.path_JSONL
        if not path.exists():
            print(f"file {path}  does not exist")
            return None
        self.df = pd.read_json(path, lines=True)
        return self.df

    def read_dfNew_from_file(self):
        """Load the filtered results JSONL file into ``self.new_df``.

        Returns:
            The dataframe, or None (after printing a message) when the file
            does not exist.
        """
        path = self.path_JSONL_new
        if not path.exists():
            print(f"file {path}  does not exist")
            return None
        self.new_df = pd.read_json(path, lines=True)
        return self.new_df

    def write_new_df_to_file(self):
        """Write ``new_df`` to the filtered results JSONL file."""
        self.new_df.to_json(self.path_JSONL_new, orient='records', lines=True)

    def write_unique_quotations_list_to_csv(self):
        """Write the unique quotations to ``quotations.csv``, reporting progress.

        Prints a message instead of writing when there is no list yet.
        """
        if self.unique_quotations_list is None:
            print("self.unique_quotations_list is None")
            return
        print(len(self.unique_quotations_list))
        print(self.results_dir / "quotations.csv")
        self.write_quotations_list_to_CSV()

    def read_uniqueQuotationsList_from_csv(self):
        """Rebuild the unique quotations list from ``quotations.csv``.

        Rows that do not have exactly five columns are reported and skipped.

        Returns:
            The list that was read (also stored on the instance).
        """
        path_quotations_CSV = self.results_dir / "quotations.csv"
        unique_quotations_list = []
        with open(path_quotations_CSV, 'r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader, None)  # Skip the header
            for i, row in enumerate(reader, start=1):
                if len(row) != 5:
                    print(f"{i}, {len(row)} ")
                    continue
                q = quotation(str(row[2]), ast.literal_eval(row[1]))
                # The flag is stored as the text 'True'/'False'; bool() of
                # either text would be True.
                q.junk = row[0] == 'True'
                q.numMatches = int(row[3])
                q.index = int(row[4])
                unique_quotations_list.append(q)
        self.unique_quotations_list = unique_quotations_list
        return unique_quotations_list

    def read_data(self):
        """Load text and results (when not loaded yet) and build the quotations list."""
        if self.text is None:
            self.read_sourceA()
            print(" self.text is made")
        if self.df is None:
            if self.make_df() is None:
                # make_df has already reported the missing results file.
                return
            self.make_new_df()
            print(" book_project.df is made")
        self.unique_quotations_list = quotations(self).unique_quotations_list

    def write_quotations_list_to_CSV(self):
        """Write the unique quotations to ``quotations.csv`` in the results dir."""
        path_quotations_CSV = self.results_dir / "quotations.csv"
        # utf-8 to match the encoding the reader uses.
        with open(path_quotations_CSV, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['junk', 'location', 'string', 'numMatches', 'index'])  # writing headers
            for q in self.unique_quotations_list:
                writer.writerow([q.junk, q.location, q.string, q.numMatches, q.index])

    def make_project_dirs(self):
        """Create the project directories that do not exist yet.

        Nothing is created when the data directory itself is missing, so a
        mistyped root path is reported by ``all_project_dirs_exist`` instead
        of being silently created.
        """
        if not self.data_dir.exists():
            return
        for directory in (self.source_dir, self.corpus_dir, self.results_dir):
            # parents=True: the author and project folders may be missing too.
            directory.mkdir(parents=True, exist_ok=True)

    def make_hyperparsuffix(self):
        """Return the file-name suffix built from the default hyperparameters."""
        thresh = 2
        cut = 3
        ngram = 2
        mindist = 3
        nostops = True
        return f"t{thresh}-c{cut}-n{ngram}-m{mindist}-{'nostops' if nostops else 'stops'}"

    def all_project_dirs_exist(self):
        """Report missing directories; return True only when all of them exist."""
        if not self.data_dir.exists():
            print(f"The data directory {self.data_dir}  does not exist")
            return False
        all_dirs_exist = True
        checks = (
            ('results', self.results_dir),
            ('corpus', self.corpus_dir),
            ('source', self.source_dir),
        )
        for label, directory in checks:
            if not directory.exists():
                print(f"The {label} directory {directory}  does not exist")
                all_dirs_exist = False
        return all_dirs_exist

    def get_junk_phrases(self):
        """Collect, store and return the strings of all quotations marked as junk."""
        junk_phrases = [q.string for q in self.unique_quotations_list if q.junk]
        self.junk_phrases = junk_phrases
        return junk_phrases

    def write_junk_phrases_to_csv(self):
        """Write ``junk_phrases`` to ``junk_phrases.csv`` in the results dir.

        Returns:
            The (already closed) file object that was written.
        """
        path_junk_phrases_CSV = self.results_dir / "junk_phrases.csv"
        with open(path_junk_phrases_CSV, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["string"])  # writing header
            for string in self.junk_phrases:
                writer.writerow([string])
        return file

    def set_junk_phrases(self):
        """Collect the junk phrases, store them and save them to CSV."""
        junk_phrases = self.get_junk_phrases()
        self.write_junk_phrases_to_csv()
        return junk_phrases

    def read_junk_phrases_from_csv(self):
        """Load ``junk_phrases`` from ``junk_phrases.csv``; empty rows are skipped."""
        path_junk_phrases_CSV = self.results_dir / "junk_phrases.csv"
        junk_phrases = []
        with open(path_junk_phrases_CSV, 'r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader, None)  # Skip the header
            for i, row in enumerate(reader, start=1):
                if len(row) != 1:
                    print(f"{i}, {len(row)} ")
                if row:
                    junk_phrases.append(row[0])
        self.junk_phrases = junk_phrases
        return junk_phrases

    def update_all_items_with_accepted_quotations(self):
        """Keep only accepted (non-junk) locations in ``new_df``.

        The columns 'Locations in A' and 'Locations in B' of ``new_df`` are
        rebuilt from ``df``. Prints a message and changes nothing when no
        quotation is accepted.
        """
        accepted = {tuple(q.location) for q in self.unique_quotations_list if not q.junk}
        if not accepted:
            print("no accepted quotations")
            return
        if self.new_df is None:
            self.make_new_df()

        new_locs_in_A = []
        new_locs_in_B = []
        # iterate over all journal items in the dataframe
        for item_A, item_B in zip(self.df['Locations in A'], self.df['Locations in B']):
            new_item_A = []
            new_item_B = []
            if isinstance(item_A, list) and item_A:
                if isinstance(item_A[0], list):
                    # Assumes the k-th location in B belongs to the k-th
                    # location in A — TODO confirm against the matcher output.
                    for k, loc in enumerate(item_A):
                        if tuple(loc) in accepted:
                            new_item_A.append(loc)
                            if isinstance(item_B, list) and k < len(item_B):
                                new_item_B.append(item_B[k])
                elif tuple(item_A) in accepted:
                    # The item is a single location rather than a list of them.
                    new_item_A.append(item_A)
                    new_item_B.append(item_B)
            new_locs_in_A.append(new_item_A)
            new_locs_in_B.append(new_item_B)

        index = self.new_df.index
        self.new_df['Locations in A'] = pd.Series(new_locs_in_A, index=index, dtype=object)
        self.new_df['Locations in B'] = pd.Series(new_locs_in_B, index=index, dtype=object)
        # Rows whose 'Locations in A' became empty are still present in new_df.

  #def read_sourcetextA(self):
  #  self.sourceText_name = str(Path(self.source_dir/f"{self.project_name}_plaintext.txt"))
  #  with open(self.sourceText_name) as f: 
  #    rawText = f.read()
  #  self.plain_sourcetextA = rawtext  # Text(rawText,self.project_name)
  #  return self.plain_sourcetextA

 
  #def read_JSONL(self):
  #  path = self.path_JSONL
  #  if path.exists():
    # Load results as pandas dataframe
  #    self.df = pd.read_json(path, lines=True)  
  #  else: 
  #    print(f"file {path} does not exist" )
  #  return


# Load the corpus you want to find results in
  #def read_corpusA(self):
  #  self.corpusFile_path = Path(self.corpus_dir/f"{self.project_name}_fulltext.jsonl")
  #  with open(self.corpusFile_path) as f:
  #    rawProcessedData = f.readlines()
  #  self.data_fulltext_jsonl = [json.loads(line) for line in rawProcessedData]
  #  return self.data_fulltext_jsonl



In [None]:
import os
#def scan_book_projects(data_dir, author_name):
#    author_dir = Path(data_dir / author_name)
#    book_projects_list = [folder.name for folder in os.scandir(str(author_dir)) if folder.is_dir()]
#    return book_projects_list

#preparatory facilitations for building the book_Project instances

#for making the book_Project projectName string

def make_project_name(pub_year,book_title):
    """Join publication year and book title with an underscore."""
    return "{}_{}".format(pub_year, book_title)

# for making the Book_Project publication year string: pub_year

def make_pub_year(project_name):
    """Return the part of the project name before the first underscore."""
    year_part, _, _ = project_name.partition("_")
    return year_part

# for getting a list of book project names in the author's directory
def scan_book_projects(data_dir, author_name):
    """List the names of the sub-folders in the author's directory.

    Args:
        data_dir: root data directory (string or Path).
        author_name: name of the author's folder inside ``data_dir``.
    """
    author_dir = os.path.join(str(data_dir), author_name)
    with os.scandir(author_dir) as entries:
        return [entry.name for entry in entries if entry.is_dir()]


# for making the Book_Project book title string: book_title

def make_book_title(project_name):
    """Return the book title: everything after the first underscore.

    Splitting only once keeps titles that themselves contain underscores
    intact, so the result round-trips with ``make_project_name``.

    Raises:
        IndexError: when ``project_name`` contains no underscore.
    """
    book_title = project_name.split("_", 1)[1]
    return book_title


In [None]:

class ProjectsData:
    """Holds the names of the author folders found under a data directory."""

    def __init__(self, data_Dir):
        """Remember the data directory and scan it for author folders."""
        self.data_Dir = data_Dir
        self.authors_list = self.scan_Subdirs(self.data_Dir)

    def scan_Subdirs(self, data_Dir):
        """Return (and store) the names of the folders directly under ``data_Dir``.

        Args:
            data_Dir: the directory to scan; converted with ``str`` first.
        """
        folder_names = []
        for entry in os.scandir(str(data_Dir)):
            if entry.is_dir():
                folder_names.append(entry.name)
        self.authors_list = folder_names
        return folder_names
        


In [None]:
#🚨  for developers stage. To be removed 
# Scan the data directory for author folders. The Path object built earlier
# in the notebook is named path_data_dir2.
all_projects = ProjectsData(path_data_dir2)


print(all_projects.authors_list)

In [None]:
# NOTE(review): index 1 assumes at least two author folders exist — confirm this is intended
author_name = all_projects.authors_list[1]

   
# take the first book project found for that author
pub_title_name = scan_book_projects(path_data_dir, author_name)[0]
book_proj = Book_Project(path_data_dir, author_name, pub_title_name)   #
print( book_proj.path_JSONL)
# load the project data and build its unique quotations list
book_proj.read_data()

In [None]:
# Working reference to the unique quotations of the current book project;
# the name myQuotationsList is used again by the cells below.
myQuotationsList = book_proj.unique_quotations_list

# Sort the list by decreasing value of q.numMatches and keep the top 50
sorted_quotations_list = sorted(myQuotationsList, key=lambda q: q.numMatches, reverse=True)[0:50]

def sort_quotations_list_by_frequency(quotations_list,ascending):
    """Return a new list of the quotations ordered by ``numMatches``.

    Args:
        quotations_list: the quotations to order; not modified.
        ascending: True for lowest count first, False for highest first.
    """
    ordered = list(quotations_list)
    ordered.sort(key=lambda item: item.numMatches, reverse=not ascending)
    return ordered




def sort_quotations_list_by_location(quotations_list,ascending):
    """Return a new list of the quotations ordered by begin position.

    Args:
        quotations_list: the quotations to order; not modified.
        ascending: True for earliest location first, False for latest first.
    """
    ordered = list(quotations_list)
    ordered.sort(key=lambda item: item.location[0], reverse=not ascending)
    return ordered


def sort_quotations_list_by_string(quotations_list,ascending):
    """Return a new list of the quotations ordered by their phrase text.

    Args:
        quotations_list: the quotations to order; not modified.
        ascending: True for A-to-Z order, False for the reverse.
    """
    ordered = list(quotations_list)
    ordered.sort(key=lambda item: item.string, reverse=not ascending)
    return ordered


In [None]:

# List all quotations, most frequently matched first
ascending = False
sortedQuotationsList= sort_quotations_list_by_frequency(myQuotationsList,ascending)  

# one line per quotation: rank, begin position, end position, number of matches, phrase
for i, q in enumerate(sortedQuotationsList):
    print(f"{i}, {q.location[0]},{q.location[1]}, {q.numMatches},  {q.string}")

In [None]:

# List all quotations ordered by their phrase text
ascending = True
sortedQuotationsList= sort_quotations_list_by_string(myQuotationsList,ascending)  

# one line per quotation: rank, begin position, end position, number of matches, phrase
for i, q in enumerate(sortedQuotationsList):
    print(f"{i}, {q.location[0]},{q.location[1]}, {q.numMatches},  {q.string}")

#for i, q in enumerate(myQuotationsList):
#    print(f"{i}, {q.location[0]}, {q.numMatches},  {q.string}") 

In [None]:
# Rebuild the unique quotations list from the current project and print its length
print(len(quotations(book_proj).unique_quotations_list))

In [None]:



# ACTION: 
#🚨  


# Instruction shown above the two dropdowns
instruction_line = widgets.Label("Chose your book project, and press Confirm button:")

# Create a dropdown widget listing the author folders, pre-selecting the current author
authors_dropdown = widgets.Dropdown(
    value= author_name,
    options=all_projects.authors_list,
    description='Authors:'
    )

author_name = authors_dropdown.value

# Dropdown listing the book project folders of the selected author
books_dropdown = widgets.Dropdown(
    #value= project_name,
    options=scan_book_projects(path_data_dir, author_name),
    description='Books:'
    )


# Create a VBox layout  with the path_input widget
# panel_layout = widgets.VBox([authors_dropdown, books_dropdown  ])

# Create a button widget for the commit action
commit_button = widgets.Button(description="Confirm")
text_label=widgets.Label(value="")
commit_box= widgets.HBox([commit_button, text_label])
# The panel stacks the instruction line, both dropdowns and the Confirm row
panel_layout=widgets.VBox()
panel_layout.children = (instruction_line,authors_dropdown, books_dropdown, commit_box)

def author_name_changed(change):
    """Refresh the list of books when another author is selected.

    Args:
        change: the change notification passed by ``observe``; only its
            ``'new'`` entry (the newly selected author) is read.
    """
    global author_name, books_dropdown

    author_name = change['new']
    book_projects = scan_book_projects(path_data_dir, author_name)
    books_dropdown.options = book_projects
    # Select the first book by default; an author folder without book
    # projects has nothing to select.
    if book_projects:
        books_dropdown.value = book_projects[0]
    commit_button.description='Confirm'

# Attach the event handler to the value change event of authors_dropdown,
# so the book list follows the selected author
authors_dropdown.observe(author_name_changed, names='value')


def commit_button_clicked(button):
    """Build the book project for the current dropdown selection and load it.

    Replaces the globals ``author_name``, ``pub_title_name`` and
    ``book_proj``, then marks the button as 'Confirmed'.

    Args:
        button: the clicked widget, passed in by ``on_click``; not used.
    """
    global author_name, pub_title_name,book_proj

    author_name, pub_title_name = authors_dropdown.value, books_dropdown.value

    selected_project = Book_Project(path_data_dir, author_name, pub_title_name)
    book_proj = selected_project
    selected_project.read_data()

    commit_button.description = 'Confirmed'
    print('passed')
    
# Attach the event handler to the commit button
commit_button.on_click(commit_button_clicked)
# Display the panel with the instruction line, both dropdowns and the Confirm button
display(panel_layout)


In [None]:
#testing some functionality of the book_Project instance

bookProj.uniqueQuotationsList[0].junk=True


book_proj.set_junk_phrases()
print( book_proj.unique_quotations_list[0].junk )
#book_proj.write_unique_quotations_list_to_csv()
#book_proj.read_unique_quotations_list_from_csv()
print( book_proj.unique_quotations_list[0].junk )


#.             .   set_junk_phrases()

In [None]:
book_proj.make_new_df()


In [None]:
# applying the update_all_items_with_accepted_quotations() method 

if len(bookProj.uniqueQuotationsList)>0:
    bookProj.update_all_items_with_accepted_quotations()
else:
    print("The unique_quotations_list is empty.")

 

In [None]:
# exploring 'Locations in A' entry values

for i, item in enumerate(bookProj.df['Locations in A']):
    if item != []:
        print(type(book_proj.df['Locations in A'][i][0][0]))
        print(i)
        print(book_proj.df['Locations in A'][i][0][0])
        break



In [None]:

#exploring some features of a current book_Project instance
print(len(bookProj.dfNew))


#print(book_proj.new_df.columns)


for q in book_proj.unique_quotations_list:
    if q.location== [168085, 168181]:
        q.junk= True
        print(q.string)
        print(q.location)
        print(q.index)




for i in range(len(book_proj.df)):
    if not book_proj.new_df['Locations in A'][i]==[]:
        print(book_proj.df['Locations in A'][i])

        print(book_proj.df['Locations in B'][i])

        print(book_proj.new_df['Locations in A'][i])

        print(book_proj.new_df['Locations in B'][i])
            #['Locations_in_B']))

In [None]:
#exploring writing and reading stored junkphrases
bookProj.set_junkPhrases()
bookProj.read_junkPhrases_from_csv()

print(len(book_proj.junk_phrases))

In [None]:
# exploring the uniqueQuotationsList attribute of bookProj
print(len(bookProj.uniqueQuotationsList))

In [None]:

# reading the data related to the Book_Project instance bookProj
bookProj.read_data()

# exploring the quotations functionality for a given Book_Project instance

quotations(bookProj)
print(len(bookProj.uniqueQuotationsList))


In [None]:
import csv

# Example of writing the unique quotations list to a CSV file.
# NOTE(review): this cell mixes `bookProj.uniqueQuotationsList` and
# `book_proj.unique_quotations_list`; confirm which spelling is current.

if bookProj.uniqueQuotationsList is not None:
    pathQuotationsCSV = os.path.join(bookProj.resultsDir, "quotations.csv")
    # Open the complete path. Slicing it to its first 50 characters silently
    # redirected the output to a truncated file name for any longer path.
    with open(pathQuotationsCSV, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['junk', 'location', 'string', 'numMatches', 'index'])  # writing headers
        for q in book_proj.unique_quotations_list:
            writer.writerow([q.junk, q.location, q.string, q.numMatches, q.index])
            # Echo each row so the written content can be checked in the notebook.
            print(f"{q.junk} , {q.location},  {q.string}, {q.numMatches}, {q.index}")

else:
    print('book_proj.unique_quotations_list is none')

    #book_proj.write_quotations_list_to_CSV()


In [None]:
# quickly setting the first 15 items in the uniqueQuotationsList to junk = True

for q in bookProj.uniqueQuotationsList[0:15]:
    q.junk= True

In [None]:
bookProj.write_uniqueQuotationsList_to_csv()



In [None]:
bookProj.read_uniqueQuotationsList_from_csv()



In [None]:
#testing the set junk value

book_proj.unique_quotations_list[10].junk





In [None]:
#🗣️🚨

# using pathlib
# dataDir is the main directory of all projects of all authors 
#  


def file_exists(full_filename_string):
    """Report whether the given path points to an existing regular file.

    Args:
        full_filename_string: Path to check, as a string.

    Returns:
        True if the path exists and is a file. Otherwise False, after
        printing which of the two checks failed.
    """
    candidate = Path(full_filename_string)

    if not candidate.exists():
        print(f"{candidate} does not exist.")
        return False

    if not candidate.is_file():
        print(f"{candidate} exists, but it is not a file.")
        return False

    return True



In [None]:
#testing the content of a uniqueQuotationsList 

my_uniqueQuotationsList = bookProj.uniqueQuotationsList

print(len(my_uniqueQuotationsList))
for quot in my_uniqueQuotationsList:
    print(quot.string)




In [None]:
#testing write_uniqueQuotationsList_to_csv()

bookProj.write_uniqueQuotationsList_to_csv()

In [None]:
#testing bookProj.uniqueQuotationsList

my_quotations = bookProj.uniqueQuotationsList


for quot in my_quotations[285:287]:
    print(quot.string)

In [None]:
# testing for the existence of two quotations at different locations with the same phrase value

my_quotations = bookProj.uniqueQuotationsList

quot1= my_quotations[  285]
quot2= my_quotations[  286]

compare_loc1= quot1.location
compare_loc2= quot2.location


compare_len1=compare_loc1[1]- compare_loc1[0]
compare_len2=compare_loc2[1]- compare_loc2[0]
print(quot1.string )
print(quot2.string )

print(len(quot1.string ))
print(len(quot2.string ))

print(f"{compare_loc1[0]},    {compare_loc1[1]},   {compare_len1}   ") 
print(f"{compare_loc2[0]},    {compare_loc2[1]},   {compare_len2}   ") 


In [None]:
# Load the text you want to find quotations from.
#book_proj.read_source()


sourceText = book_proj.path_plain_text
with open(sourceText, encoding='utf-8') as f: 
    rawText = f.read()
print(rawText)


In [None]:
# getting the source A text for the bookProj 
text= bookProj.read_sourceA()

#🚨  find out: Text
#tx = Text(rawText, book_proj.project_name)    
print(text[0:400])


In [None]:
# Building an instance of Book_Project
# create and fill the data frame df by reading the JSONL file
# doing a check of the existence of all project dirs

#self.dataDir=dataDir file-exits
path = book_proj.path_JSONL

print(str(path))
if path.exists():
    # Load results as pandas dataframe
    df = pd.read_json(path, lines=True)  
else: 
    print(f"file {path}  does not exist" )




In [None]:

# testing the uniqueQuotationsList attribute
quotationsList=bookProj.uniqueQuotationsList

print(len(quotationsList) )

In [None]:
# testing the uniqueQuotationsList and text attributes

uniqueQuotationsList = bookProj.uniqueQuotationsList
rawText= bookProj.text
print(len(rawText) )
print(len(unique_quotations_list) )


In [None]:
# Defining a compareString value for later use in finding quotations with an equal string value
# but at different locations

compareString =  "Cashel Boyle O’Connor Fitzmaurice Tisdall Farrell"




In [None]:
# # make a list of quoatuon obejcts
#
# make adaptions of locationsinA list and parallel to that: adaptions locationsinB list

# Make adatations in numMatches

#locations_in_A = make_list_of_locationsinA(df)
#locations_in_B = make_iist_of_locationsinB(df)


#return[new_locsA,new_locsB] 


In [None]:
# Have the "adapt locationsInA" and "adapt locationsInB" steps disappeared??
#non_empty_locations = [loc for loc in self.locationsInA if loc != []]
        # Flatten the list
        # Using list comprehension
#flattened_locations = [item for sublist in non_empty_locations for item in sublist]
#sorted_locations = sorted(flattened_locations)
#self.sorted_locations= sorted_locations

#print(len(sorted_locations))

#loc1 = sorted_locations[0]

        
#unique_quotations_list = []
#index=0
#new_quotation = quotation(self.text, loc1)
#new_quotation.index=index
#index+=1
#new_quotation.numMatches= 1
#unique_quotations_list.append(new_quotation)

In [None]:
def get_no_junk_quotations(quotations_list):
    """Return the quotations whose ``junk`` flag is falsy, in input order.

    Args:
        quotations_list: Iterable of objects exposing a ``junk`` attribute.

    Returns:
        A new list; the input is not modified.
    """
    return [quot for quot in quotations_list if not quot.junk]
                    
def get_junk_quotations(quotations_list):
    """Return the quotations whose ``junk`` flag is truthy, in input order.

    Args:
        quotations_list: Iterable of objects exposing a ``junk`` attribute.

    Returns:
        A new list; the input is not modified.
    """
    return [quot for quot in quotations_list if quot.junk]


In [None]:
# testing get_junk_quotations() method

jpl=get_junk_quotations(bookProj.uniqueQuotationsList)
print(len(jpl))

for jp in jpl:
    print(jp.string) 

In [None]:
# defining make_equal_string_quotations_list(compare_string, quotations_list),
# which returns a list of two lists:
# a list of indices and a list of the quotations whose phrase equals compare_string

def make_equal_string_quotations_list(compareString, quotationsList):
    """Find every quotation whose ``string`` equals ``compareString``.

    Args:
        compareString: Phrase to look for (exact match).
        quotationsList: Iterable of objects exposing a ``string`` attribute.

    Returns:
        A two-element list ``[positions, quotations]``: the positions of the
        matches within ``quotationsList`` and the matching objects themselves.
    """
    matches = [
        (position, quot)
        for position, quot in enumerate(quotationsList)
        if quot.string == compareString
    ]
    positions = [position for position, _ in matches]
    found = [quot for _, quot in matches]
    return [positions, found]

In [None]:

# defining update the accepted quotations in locations in B list

def update_all_mutations_quotations(qlocs, locsInAList, all_locsInBList):
    """Apply ``update_accepted_quotations`` to every pair of parallel entries.

    Args:
        qlocs: Accepted quotation locations, passed through unchanged.
        locsInAList: Sequence of "Locations in A" entries.
        all_locsInBList: Sequence of "Locations in B" entries; entry ``i``
            is paired with ``locsInAList[i]``.

    Returns:
        A list holding, for each entry, the ``[new_locs_in_A, new_locs_in_B]``
        pair returned by ``update_accepted_quotations``. (The results used to
        be discarded, which made the call a no-op.)
    """
    results = []
    for i, locsInA in enumerate(locsInAList):
        # Read from the parameter; the previous body referenced the undefined
        # name ``locsInBList``.
        locsInB = all_locsInBList[i]
        results.append(update_accepted_quotations(qlocs, locsInA, locsInB))
    return results


def check_loc(loc, qlocs):
    """Tell whether ``loc`` occurs in ``qlocs``.

    The scan gives up as soon as it meets an entry that ends after ``loc``
    does (``qloc[1] > loc[1]``).
    # presumably qlocs is sorted by end offset, which makes the early exit
    # safe — verify against the caller.

    Args:
        loc: A location, indexable with ``[1]`` (e.g. ``[start, end]``).
        qlocs: Iterable of locations to search.

    Returns:
        True if an entry equal to ``loc`` is found before the scan stops,
        otherwise False. This includes an empty ``qlocs`` and a scan that
        reaches the end, both of which used to raise UnboundLocalError.
    """
    for qloc in qlocs:
        if qloc[1] > loc[1]:
            return False
        if qloc == loc:
            return True
    return False


def update_accepted_quotations(qlocs, locs_in_A, locs_in_B):
    """Keep only the locations of ``locs_in_A`` that ``check_loc`` accepts.

    ``locs_in_B`` is filtered in parallel: an element of B is kept exactly
    when the element of A at the same position is kept.

    Args:
        qlocs: Accepted quotation locations (see ``check_loc``).
        locs_in_A: Either a list of locations, or a list of lists of
            locations.
        locs_in_B: Structure parallel to ``locs_in_A``.

    Returns:
        ``[new_locs_in_A, new_locs_in_B]``. Every kept item is wrapped in its
        own list, so a flat input ``[[10, 20]]`` yields ``[[[10, 20]]]``;
        items with no accepted location are left out.
    """
    new_locs_in_A = []
    new_locs_in_B = []

    # Rows without matches carry an empty list; nothing to filter then.
    if len(locs_in_A) == 0:
        return [new_locs_in_A, new_locs_in_B]

    if isinstance(locs_in_A[0], list):
        for j, item in enumerate(locs_in_A):
            if item == []:
                continue

            if isinstance(item[0], list):
                # Nested case: item is a list of locations. The B element is
                # addressed as [j][k]; a plain list cannot be indexed [j, k].
                kept = [
                    (loc, locs_in_B[j][k])
                    for k, loc in enumerate(item)
                    if check_loc(loc, qlocs)
                ]
            elif check_loc(item, qlocs):
                # Flat case: item is itself one location, so its partner is
                # simply locs_in_B[j] (there is no inner index k here).
                kept = [(item, locs_in_B[j])]
            else:
                kept = []

            if kept:
                new_locs_in_A.append([loc_a for loc_a, _ in kept])
                new_locs_in_B.append([loc_b for _, loc_b in kept])

    # NOTE(review): this branch is reached when the first element is not a
    # list; check_loc then indexes that element with [1]. Confirm which
    # input shape it is meant for.
    elif check_loc(locs_in_A[0], qlocs):
        new_locs_in_A.append(locs_in_A[0])
        new_locs_in_B.append(locs_in_B[0])

    return [new_locs_in_A, new_locs_in_B]

    








    


In [None]:
#applying make_equal_string_quotations_list() function

quotationsList = bookProj.uniqueQuotationsList
length=len(quotationsList )
print(length)

text= book_proj.text
for i,q in enumerate(quotations_list):
    h_list=quotations_list[i:length]
    result_list= make_equal_string_quotations_list(q.string,h_list)

    h1_list= result_list[1]
    h1_ind_list=  result_list[0]
    if not len(h1_list)==1:
        print(i)
        #rint(f"{i}  ,   {len(h1_list)},  {h1_list[0].string}, {h1_list[1].string}, {h1_ind_list[0]}, {h1_ind_list[1]} ")
        #print(f"{i}  ,   {h1_list[0].location},  {h1_list[1].location} ")
        string1= text[h1_list[0].location[0] : h1_list[0].location[1] ]
        print(string1 )
        string2= text[h1_list[1].location[0] : h1_list[1].location[1] ]
        print(string2 )




In [None]:
text= bookProj.text
quotationsList = bookProj.uniqueQuotationsList

    
def get_q_context(q, text):
    """Return a 200-character window of ``text`` around the start of ``q``.

    The window begins 100 characters before ``q.location[0]`` (or at the
    start of the text if that would be negative) and spans 200 characters.

    Args:
        q: Object whose ``location[0]`` is the start offset of the quotation.
        text: The source text.

    Returns:
        The slice of ``text`` covered by the window.
    """
    window_start = q.location[0] - 100
    if window_start < 0:
        window_start = 0
    return text[window_start:window_start + 200]

    # Create a scrollable text area widget
widget = widgets.Textarea(
    value= get_q_context(quotations_list[0], text),
    placeholder='Enter text',
    description=  'Context:',
    layout= widgets.Layout(height='400px',width ='500px', overflow_y='auto')
)




#display(widget)



In [None]:
# Define filter_settings for current and later use

class filter_settings:
    """Container for the filter and sort choices, created with defaults."""

    def __init__(self):
        # One of 'All', 'Junk', 'Non-Junk' (the options of the type selector).
        self.type = 'Non-Junk'

        # Flag and count belonging to the "most frequently quoted" pre-filter.
        self.most_frequent = True
        self.number = 100

        # Sort choices: alphabetical (vs. by location), ascending (vs. descending).
        self.alphabetical = True
        self.ascending = False

f_s= filter_settings()

print(f_s.most_frequent)
print(f_s.type)


In [None]:
# defining and building GUI for setting user_settings 

from IPython.display import display


# Create a label
pre_filter_label = widgets.Label(value="Pre filter Settings")

# Create a button
most_freq_checkbox = widgets.Checkbox(description="filter by most frequently", value= f_s.most_frequent)

# Create a button
commit_button = widgets.Button(description="Use these settings")

# Create an input field for a number
number_input = widgets.IntText(value= f_s.number, description='Number:', width ="50px")

# Create a box to hold the label, button, and number input
most_freq_quoted_label = widgets.Label(value="Number of most frequently quoted: ")
# Create an input field for a number


pre_filter_box= widgets.VBox([pre_filter_label, most_freq_quoted_label, number_input])

type_radio_buttons = widgets.RadioButtons(
    options=['All', 'Junk', 'Non-Junk'],
    description='Quotation type:',
    disabled= False,
    value= f_s.type
)


first_sorting_radio_buttons = widgets.RadioButtons(
    options=['Alphabetical', 'By location'],
    description='sorting option:',
    disabled= False,
)

if f_s.alphabetical:
    first_sorting_radio_buttons.value='Alphabetical'
else:    
  first_sorting_radio_buttons.value='By location'

first_sorting_radio_buttons_box = widgets.VBox([first_sorting_radio_buttons])

    
second_sorting_radio_buttons = widgets.RadioButtons(
    options=['Ascending', 'Descending'],
    description='sorting option:',
    disabled=False
)

if f_s.ascending:
    second_sorting_radio_buttons.value='Ascending'
else:    
    second_sorting_radio_buttons.value='Descending'



second_sorting_radio_buttons_box = widgets.VBox([second_sorting_radio_buttons])


# Show the quotation-type selector built above. The click handler reads
# type_radio_buttons.value, and `radio_buttons` is not defined in this cell.
junk_box = widgets.VBox([type_radio_buttons])

settings_box = widgets.VBox([pre_filter_box, junk_box, first_sorting_radio_buttons_box, second_sorting_radio_buttons_box, commit_button])

# Display the box
display(settings_box)

# Define a function to run when the button is clicked
def on_button_clicked(button):
    """Copy the current widget values into ``filterSettings``.

    Reads the number input, the quotation-type selector and the two sorting
    selectors, stores them on the global ``filterSettings`` object, relabels
    the commit button and prints the resulting settings.

    Args:
        button: The clicked button widget; not used.

    Returns:
        The updated ``filterSettings`` object.
    """
    settings = filterSettings

    settings.number = number_input.value
    settings.type = type_radio_buttons.value
    # The sort selectors are stored as booleans.
    settings.ascending = (second_sorting_radio_buttons.value == 'Ascending')
    settings.alphabetical = (first_sorting_radio_buttons.value == 'Alphabetical')

    commit_button.description = 'Confirmed'
    print(f"Button clicked. Number entered: {settings.most_frequent},  {settings.number}, {settings.type}, {settings.ascending}, {settings.alphabetical}")
    return settings
# Set the function to run when the button is clicked
commit_button.on_click(on_button_clicked)




In [None]:
# defining and applying a GUI for selecting junk-phrase quotations, using the source text context

from IPython.display import display, HTML

def get_q_color_context(q, text):
    """Return an HTML snippet showing quotation ``q`` in red within its context.

    Up to 200 characters before and up to 200 characters after the quotation
    are included.

    Args:
        q: Object whose ``location`` holds the start and end offsets of the
            quotation in ``text`` (used as slice bounds).
        text: The source text.

    Returns:
        A string made of two leading spaces, the text before the quotation,
        the quotation wrapped in a red ``<span>``, and the text after it.
    """
    quote_start, quote_end = q.location[0], q.location[1]

    start = max(0, quote_start - 200)  # Ensure the start index is not negative
    # Slice ends are exclusive, so clamp to len(text). Clamping to
    # len(text) - 1 dropped the last character of the text from the context.
    end = min(quote_end + 200, len(text))

    context_before = text[start:quote_start]
    context_quotation = text[quote_start:quote_end]
    context_after = text[quote_end:end]

    # NOTE(review): the text is inserted unescaped, so any '<' or '&' in the
    # source will be interpreted as markup by an HTML widget.
    html = f"  {context_before}<span style='color:red;'>{context_quotation}</span>{context_after}"

    return html

def list_of_colored_context(quotations_list, text):
    """Build one HTML string with the coloured context of the first quotations.

    Args:
        quotations_list: List of quotations; at most the first 10 are used.
        text: The source text the quotation locations refer to.

    Returns:
        The per-quotation snippets from ``get_q_color_context``, each prefixed
        with ``"<br>, "`` and joined with ``"<br>"``. An empty string when the
        list is empty.
    """
    # Slice instead of indexing range(10): lists with fewer than 10
    # quotations used to raise IndexError.
    result_list = [
        f"<br>, {get_q_color_context(q, text)}"
        for q in quotations_list[:10]
    ]

    # Convert the list into a single string
    return '<br>'.join(result_list)

lines_of_colored_contexts = list_of_colored_context(sorted_quotations_list, text)

# Create a scrollable HTML widget
widget = widgets.HTML(
    value=lines_of_colored_contexts,
    placeholder='Enter text',
    description='Context:',
    layout=widgets.Layout(height='400px', overflow_y='auto')
)

#display(widget)
text= book_proj.text
list_of_colored_contexts = list_of_colored_context(quotations_list, text)
    
# Create a scrollable HTML widget
widget = widgets.HTML(
    value= lines_of_colored_contexts,
    placeholder='Enter text',
    description='Context:',
    layout=widgets.Layout(height='400px')
)

#display(widget)


from ipywidgets import Checkbox, VBox

def create_checkboxes(quotations_list, text):
    """Create one unchecked checkbox per quotation, labelled with its context.

    Args:
        quotations_list: List of quotations; at most the first 10 are used.
        text: The source text the quotation locations refer to.

    Returns:
        A list of ``Checkbox`` widgets whose description is the HTML produced
        by ``get_q_color_context``.
    """
    # Use the list that was passed in. The previous body ignored the
    # parameter and read the global ``selectedQuotationsList`` instead.
    # The limit of 10 is kept from the original and may need revisiting.
    return [
        Checkbox(description=get_q_color_context(q, text), value=False, indent=False)
        for q in quotations_list[:10]
    ]

#def create_checkboxes(quotations_list, text):
#    checkboxes = []

    
#    for i in range(min(20, len(quotatations_list))): 
#        html_line = get_q_color_context(quotations_list[i], text)
#        checkbox = Checkbox(description=html_line, value=False, indent=False)
#        checkboxes.append(checkbox)
#    return checkboxes    

def create_quotation_checkboxes(quotations_list, text):
    """Create one unchecked checkbox per quotation, labelled with its context.

    Args:
        quotations_list: List of quotations; at most the first 10 are used.
        text: The source text the quotation locations refer to.

    Returns:
        A list of ``Checkbox`` widgets whose description is the HTML produced
        by ``get_q_color_context``.
    """
    # Slice instead of indexing range(10), which raised IndexError for lists
    # with fewer than 10 quotations. Longer lists still need to be handled.
    return [
        Checkbox(description=get_q_color_context(q, text), value=False, indent=False)
        for q in quotations_list[:10]
    ]

def create_quotation_HBox(html_line , q ):
    """Build the row shown for one quotation: junk controls plus context.

    Args:
        html_line: HTML string displayed as the context of the quotation.
        q: The quotation; its ``junk`` flag sets the initial checkbox state,
            and the checkbox reports changes through ``on_checkbox_change``.

    Returns:
        An ``HBox`` holding a ``VBox`` with the two checkboxes on the left
        and the HTML context widget on the right.
    """
    descr = 'junk' if q.junk else 'not junk'

    checkbox = widgets.Checkbox(description=descr, value=q.junk, indent=False)
    checkbox.observe(lambda change: on_checkbox_change(change, checkbox, q), names='value')

    context_widget = widgets.HTML(
        value=html_line,
        placeholder='',
        description='',
        layout=widgets.Layout(height='430px', width='1000px'),
    )

    # No observer is attached to this checkbox in this function.
    checkbox_all = widgets.Checkbox(description="with all equal strings", value=False, indent=False)

    # '300x' is not a valid CSS length and `height0` is not a Layout
    # attribute, so neither height was applied; both are now '300px'.
    quotation_specs_VBox = widgets.VBox(
        [checkbox, checkbox_all],
        layout=widgets.Layout(height='300px', width='350px'),
    )

    quotation_HBox = widgets.HBox(
        [quotation_specs_VBox, context_widget],
        layout=widgets.Layout(height='300px', width='1000px'),
    )

    return quotation_HBox




#html_line = get_q_color_context(q, text)

def on_checkbox_change(change, checkbox, q):
    """Propagate a junk-checkbox toggle to the quotation and the widgets.

    Args:
        change: The change notification from ``observe``.
        checkbox: The checkbox that was toggled; its caption is updated.
        q: The quotation the checkbox belongs to.
    """
    # Only value changes are of interest.
    if not (change['name'] == 'value' and change['type'] == 'change'):
        return

    is_junk = change['new']

    q.junk = is_junk
    save_changes_button.description = 'Save changes'
    # Mirror the flag into the global list entry addressed by q.index.
    quotations_list[q.index].junk = q.junk

    print(f"{q.index}, {unique_quotations_list[q.index].junk},   {q.string}" )
    print(f"{q.index}, {book_proj.unique_quotations_list[q.index].junk},   {q.string}" )

    checkbox.description = 'junk' if is_junk == True else 'not junk'
    print(f"Checkbox changed to: {is_junk}")




# Create a VBox with the checkboxes
#quotation

def make_quotation_Hboxes(quotations_list , text):
    """Build the display row for every quotation in ``quotations_list``.

    Args:
        quotations_list: Iterable of quotations.
        text: The source text the quotation locations refer to.

    Returns:
        A list with one widget per quotation, produced by
        ``create_quotation_HBox`` from the coloured context of the quotation.
    """
    return [
        create_quotation_HBox(get_q_color_context(quot, text), quot)
        for quot in quotations_list
    ]



# Narrow the sorted quotations down to the type chosen in the filter settings.
if filterSettings.type == 'Non-Junk':
    selectedQuotationsList = [q for q in sortedQuotationsList if not q.junk]
elif filterSettings.type == 'Junk':
    selectedQuotationsList = [q for q in sortedQuotationsList if q.junk]
elif filterSettings.type == 'All':
    # This branch used to assign to the misspelled `selectedQuotationList`,
    # leaving `selectedQuotationsList` undefined (or stale) for 'All'.
    selectedQuotationsList = sortedQuotationsList


# Only the first 10 selected quotations are shown.
quotations_boxes = make_quotation_Hboxes(selectedQuotationsList[0:10], text)

# 'scraoll' was a typo for the CSS overflow value 'scroll'.
quotations_Vbox = widgets.VBox(quotations_boxes, layout=widgets.Layout(height='1200px', overflow_y='scroll'))


# Display the VBox
display(quotations_Vbox)


save_changes_button= widgets.Button(description='Save changes', layout=widgets.Layout(width='400px')) 

def save_changes_button_clicked(button):
    """Write the unique quotations list to CSV and relabel the save button.

    Args:
        button: The clicked button widget; not used.
    """
    # The CSV is written before the list on book_proj is replaced below.
    book_proj.write_unique_quotations_list_to_csv()    
    save_changes_button.description = 'Changes saved'
    # NOTE(review): this replaces the project's list with the global
    # `quotations_list` after the write; confirm that this global always
    # holds the complete list and not a subset.
    book_proj.unique_quotations_list = quotations_list
    return   

  
# Attach the event handler to the commit button
save_changes_button.on_click(save_changes_button_clicked)
display(save_changes_button)



In [None]:
# test return value of bookProj.get_junkPhrases()

print(bookProj.get_junkPhrases()[0])

In [None]:
# test return value of bookProj.write_junkPhrases()

bookProj. write_junkPhrases_to_csv()
    
      

In [None]:
# test bookProj.uniqueQuotationsList value of a specified entry 

print(uniqueQuotationsList[1578].junk)


In [None]:
# defines make_equal_string_quotations_list, a function which returns two lists:
# one with the index values of the quotations in the quotations_list
# and one with the list of quotations with an equal phrase string value

def make_equal_string_quotations_list(compare_string, quotations_list):
    """Collect the quotations whose ``string`` equals ``compare_string``.

    Args:
        compare_string: Phrase to look for (exact match).
        quotations_list: Iterable of objects exposing a ``string`` attribute.

    Returns:
        ``[ind_list, equal_quotations_list]``: the positions of the matches
        within ``quotations_list`` and the matching objects, in input order.
    """
    hits = [
        pair for pair in enumerate(quotations_list)
        if pair[1].string == compare_string
    ]
    return [[position for position, _ in hits], [quot for _, quot in hits]]

In [None]:
#defines make_all_equal_string_quotations_list, 
# a function which returns a list of all quotations with equal phrase string value 

def make_all_equal_string_quotations_list(quotations_list):
    """Group the quotations that share the same phrase string.

    The previous body read an undefined ``result_list``, never filled
    ``all_equal_lists`` and returned only the result of the last comparison.
    This version returns every group of repeated phrases.

    Args:
        quotations_list: Sequence of objects exposing a ``string`` attribute.

    Returns:
        A list with one ``[ind_list, equal_quotations_list]`` pair per phrase
        that occurs more than once, ordered by first occurrence. ``ind_list``
        holds positions within ``quotations_list``. Empty when no phrase
        repeats.
    """
    print(len(quotations_list))

    # One pass keyed by phrase instead of comparing every pair of quotations.
    groups = {}
    for index, q in enumerate(quotations_list):
        ind_list, equal_quotations_list = groups.setdefault(q.string, ([], []))
        ind_list.append(index)
        equal_quotations_list.append(q)

    return [
        [ind_list, equal_quotations_list]
        for ind_list, equal_quotations_list in groups.values()
        if len(ind_list) > 1
    ]

      


In [None]:
#testing the copy() method

my_quotations = bookProj.uniqueQuotationsList.copy()
bookProj.uniqueQuotationsList[0].junk= False

buffer_file_path = os.path.join(book_proj.results_dir, 'buffer.txt')

with open(buffer_file_path, 'w') as f:
    for item in quotations_list:
        # Write each item to the file
        f.write("%s\n" % item)

In [None]:
# Open the buffer file in read mode
with open(buffer_file_path, 'r') as f:
    # Read each line from the file, strip the newline character, and add it to the list
    new_unique_quotations_list = [line.rstrip('\n') for line in f]

In [None]:
import os

def write_quotations_to_file(quotations_list, file_path):
    """Write the string form of every ``quotation`` item, one per line.

    Items that are not instances of ``quotation`` are skipped.

    Args:
        quotations_list: Iterable of items to write.
        file_path: Destination file; it is overwritten.
    """
    # UTF-8 is set explicitly, as in the other file operations of this
    # notebook, so the result does not depend on the platform default.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(
            f"{item!s}\n"
            for item in quotations_list
            if isinstance(item, quotation)  # Check if item is of type quotation
        )


def read_quotations_from_file(file_path):
    """Read a file written by ``write_quotations_to_file``.

    Args:
        file_path: File to read (UTF-8).

    Returns:
        A list of strings, one per line with the trailing newline removed.
        These are plain strings, not ``quotation`` objects.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]

# Get the full path to the buffer file
buffer_file_path = os.path.join(book_proj.results_dir, 'buffer.txt')

# Write quotations to file
write_quotations_to_file(book_proj.unique_quotations_list, buffer_file_path)

# Read quotations from file and rebuild unique_quotations_list
book_proj.unique_quotations_list = read_quotations_from_file(buffer_file_path)

In [None]:
write_quotations_to_file(quotations_list, buffer_file_path)
new_quotations_list= read_quotations_from_file(buffer_file_path)
len(new_quotations_list)

In [None]:
type(new_quotations_list[0])

In [None]:
my_quotations = book_proj.unique_quotations_list.copy()
book_proj.unique_quotations_list[0].junk= False

print(book_proj.unique_quotations_list[0].junk)

print(my_quotations[0].junk)

my_quotations[0].junk= True

print(my_quotations[0].junk)


print(book_proj.unique_quotations_list[0].junk)




In [None]:
# testing setting the junk attribute
quotations_list[2].junk = True

# Collect the flags under `junk_list`, the name the loop here and the
# following cells use. The list used to be initialised as `junkList`, so the
# appends went to a different (or undefined) name.
junk_list = [q.junk for q in quotations_list]

# Show the first flags next to the attribute they were read from. Slicing
# keeps this safe for lists with fewer than 10 quotations.
for q, flag in zip(quotations_list[:10], junk_list):
    print(q.junk)
    print(flag)
   






 

In [None]:
# Convert all booleans to strings and join them with commas
buffer_file_path = os.path.join(book_proj.results_dir, 'junk_buffer.csv')
bool_str = ','.join(map(str, junk_list))

# Write the string to the file
with open(buffer_file_path , 'w') as f:
    f.write(bool_str)

In [None]:
# Open the file in read mode

def read_junk_list(buffer_file_path):
    """Read comma-separated flags from the first line of a file.

    Args:
        buffer_file_path: Path of the file to read.

    Returns:
        A list of booleans, one per comma-separated field of the first line;
        a field is True only if it is exactly ``'True'``. The first 10 values
        are also printed.
    """
    with open(buffer_file_path, 'r') as handle:
        first_line = handle.readline()

    flags = []
    for token in first_line.strip().split(','):
        flags.append(token == 'True')

    print(flags[0:10])
    return flags



junk_list = read_junk_list(buffer_file_path)

# Do not use `bool` as the loop variable: at notebook top level that would
# replace the builtin for every cell executed afterwards.
for i, is_junk in enumerate(junk_list):
    quotations_list[i].junk = is_junk

# Show the restored flags of the first quotations.
for q in quotations_list[0:10]:
    print(q.junk)




In [None]:
for i, q in enumerate(new_unique_quotations_list[0:10]):
    print( f"{i},  {q.string}")

In [None]:
#testing make_equal_string_quotations_list given uniqueQuotationsList

quotationsList = uniqueQuotationsList
length=len(quotationsList )
print(length)
#for q in quotations_list[0:10]:
# print(q.string)  

for i,q in enumerate(quotationsList):
    hList=quotationsList[i:length]

    resultList= make_equal_string_quotations_list(q.string,hList)

    h1List= resultList[1]
    h1IndList=  resultList[0]

    length1=len(hList)
    if not len(h1List)==1:

        print(i)
        #print(f"{i}  ,   {len(h1_list)},  {h1_list[0].string}, {h1_list[1].string}, {h1_ind_list[0]}, {h1_ind_list[1]} ")
        #print(f"{i}  ,   {h1_list[0].location},  {h1_list[1].location} ")
        string1= text[h1List[0].location[0] : h1List[0].location[1] ]
        print(string1 )
        string2= text[h1List[1].location[0] : h1List[1].location[1] ]
        print(string2 )
      #my_list = [f" { length - length1 +j}, {q.string} " for j,q in enumerate(h1_list)]
      #my_ind_list=[f" { length - length1 +j}, {index} " for j,index in enumerate(h1_ind_list)]

      #result = ', '.join(my_list)
      #print(f"  {i}, {quotations_list[i].string},  {q.string}" for q in  my_list)
      #print(f"  {i}, {quotations_list[i].string},  { ind} " for ind in my_ind_list)

In [None]:


#defines find_cases_of_a_location and find_all_cases_of_a_location , 
# based on  a list of locations in A

def find_cases_of_a_location(i, compareLoc, locsInA):
    """Find where ``compareLoc`` occurs inside one "Locations in A" entry.

    Args:
        i: Index of the entry; copied into every reported case.
        compareLoc: The location to look for.
        locsInA: The entry to search. Only a list whose elements are
            themselves locations (lists) is searched.

    Returns:
        A list of ``[i, j]`` pairs, one for each position ``j`` where
        ``locsInA[j] == compareLoc``. Empty when nothing matches, when the
        entry is empty, or when it is not a list (a message is printed in
        that last case).
    """
    cases = []

    if not isinstance(locsInA, list):
        print("locs_in_A is not a list")
        return cases

    # The body used to read an undefined `locs_in_A`; everything below works
    # on the parameter.
    if locsInA == [] or not isinstance(locsInA[0], list):
        return cases

    for j, item in enumerate(locsInA):
        # Empty items cannot match. Items that are lists of locations are
        # deliberately not searched (that case was disabled in the original).
        if item == [] or isinstance(item[0], list):
            continue
        if item == compareLoc:
            cases.append([i, j])

    return cases
    

    
def find_all_cases_of_a_location(compareLoc,locsInA_list):
    """Search every "Locations in A" entry for ``compareLoc``.

    Args:
        compareLoc: The location to look for.
        locsInA_list: Iterable of entries; each is passed, together with its
            index, to ``find_cases_of_a_location``.

    Returns:
        A list with the non-empty results of ``find_cases_of_a_location``,
        in entry order.
    """
    casesList = []

    # Iterate the parameter; the previous body read the global `locsInAList`
    # and ignored what the caller passed in.
    for i, locsInA in enumerate(locsInA_list):
        cases = find_cases_of_a_location(i, compareLoc, locsInA)
        if cases != []:
            casesList.append(cases)

    return casesList

    


In [None]:
locsInAList= bookProj.df['Locations in A']
print(len(locsInAList))
# 19712

#compare_string =  "Cashel Boyle O’Connor Fitzmaurice Tisdall Farrell"
compareLoc1= quotationsList[3784].location
#compare_loc2= quotations_list[3785].location
        
        

find_all_cases_of_a_location(compareLoc1,locsInAList)

# strange result (PT)

In [None]:


# Define the event handler
#def handle_checkbox_click(change):
#    # Check if the checkbox is checked
#if change['new']:
#        # Update the value of the selected_quotation_scrollb#ox
#        selected_quotation_scrollbox.value = change['owner'].description
# Work with the first 40 sorted quotations in this cell.
# (A stray "/" line above this assignment made the cell a SyntaxError.)
quotations_list = sorted_quotations_list[0:40]



label = widgets.Label()
selected_quotation_index_label = widgets.Label("test")

selected_quotation_string_label= widgets.Label("test")

num_equal_quotations_label= widgets.Label("test")

junk_phrase_label= widgets.Label("test")

# Create a scrollable text area widget
# Create the scrollable text area widget

 
junk_button = widgets.Button(description='junk phrase')

# Define a function to handle the select button click event
def handle_junk_button_click(button):
    """Mark a quotation as junk and print information about the selection.

    NOTE(review): the function reads `index` and `selected_quotation`, which
    are not defined inside it; they must exist as globals when the button is
    clicked. Confirm which selection they are supposed to refer to.

    Args:
        button: The clicked button widget; not used.
    """
    #selected_quotation = quotations_scrollbox.value
    junk_phrase_label.value= "Selected phrase is junk}"
    # `index` is looked up as a global here.
    quotations_list[index].junk= True  
    
    # Find the quote that matches the selected text
    selected_quote = None

    for i,quote in enumerate(quotations_list):
        if selected_quotation in quote.string:
            # Only the string is stored; `selected_quote` itself stays None.
            selected_quote_string = quote.string
            break
    
    # `selected_quote` is never reassigned above, so the else branch runs.
    if selected_quote is not None:
        print(f"Selected quote: {selected_quote}.string")
    else:
        print("No matching quote found.")
    selected_quotation_string= quotations_scrollbox.value
    print(f"Selected quotation: {selected_quotation_string}")

# Attach the event handler to the select button
junk_button.on_click(handle_junk_button_click)


equal_string_quotations_list= []

equal_string_quotations_list= make_equal_string_quotations_list(sorted_quotations_list[0].string, sorted_quotations_list)[1]

equal_string_checkboxes = [widgets.Checkbox(value= False, description= q.string) for q in equal_string_quotations_list]

equal_checkboxes_vbox = widgets.VBox(equal_string_checkboxes, layout = widgets.Layout(overflow_y='auto', height='300px'))


equal_button = widgets.Button(description='show equal phrases')

def handle_equal_button_click(button, compare_string):
    """Look up the quotations equal to ``compare_string`` and print a selection.

    NOTE(review): `selected_quotation` is not defined inside this function
    and must exist as a global. The handler also takes two arguments, so it
    cannot be registered with `on_click` as it stands; the registration
    after this function is commented out.

    Args:
        button: The clicked button widget; not used.
        compare_string: Phrase passed to make_equal_string_quotations_list.
    """
    # Element [1] of the result is the list of matching quotations.
    equal_string_quotations_list = make_equal_string_quotations_list(compare_string, quotations_list)[1]
    for quote in equal_string_quotations_list:
        # NOTE(review): this tests membership in the quotation object itself,
        # not in quote.string — confirm that is intended.
        if selected_quotation in quote:
            selected_quotation_string = quote.string
            break
    
    # If the loop assigned nothing, `selected_quotation_string` is still
    # unbound here and this comparison raises UnboundLocalError.
    if selected_quotation_string is not None:
        print(f"Selected quote: {selected_quotation}")
    else:
        print("No matching quote found.")
    selected_quotation_string= quotations_scrollbox.value
    print(f"Selected quotation: {selected_quotation_string}")

#equal_button.on_click(handle_equal_button_click(selected_quotation_string))

#num_equal_quotations_label
 
quotations_scrollbox = widgets.Textarea(
    value='\n  \n'.join(q.string for q in quotations_list),
    layout=widgets.Layout(height='200px'),
    disabled=True
)

selected_quotations_scrollbox = widgets.Textarea(
    value='\n  \n'.join(q.string for q in unique_quotations_list),
    layout=widgets.Layout(height='200px'),
    disabled=True
)


# Create a list of checkboxes, one for each quotation
checkboxes = [widgets.Checkbox(value=False, description=q.string) for q in quotations_list]

#for checkbox in checkboxes:#
#    checkbox.observe(handle_checkbox_click, 'value')

# Create a VBox to hold the checkboxes
#checkbox_vbox = widgets.VBox(checkboxes)  # Remove the extra argument 'label'

# Create a VBox to hold the checkboxes
checkbox_vbox = widgets.VBox(checkboxes, layout=widgets.Layout(overflow_y='auto', height='300px'))


checkboxes = [widgets.Checkbox(value=False, description=q.string) for q in unique_quotations_list[0:30]]

#for checkbox in checkboxes:#
#    checkbox.observe(handle_checkbox_click, 'value')

# Create a VBox to hold the checkboxes
#checkbox_vbox = widgets.VBox(checkboxes)  # Remove the extra argument 'label'

# Create a VBox to hold the checkboxes
checkbox_vbox = widgets.VBox(checkboxes, layout=widgets.Layout(overflow_y='auto', height='300px'))


quotation_vbox = widgets.VBox(
    [selected_quotation_index_label ,
    selected_quotation_string_label ,
    num_equal_quotations_label,
    junk_phrase_label,
    junk_button 
    
    ])

quotation_hbox= widgets.HBox([checkbox_vbox, quotation_vbox] )
display(quotation_hbox)




# Define the event handler
def make_checkbox_handler(index):
    """Build the ``observe`` callback for the checkbox at position *index*.

    Args:
        index: Position of the checkbox inside the module-level
            ``checkboxes`` list.

    Returns:
        A function taking the traitlets ``change`` dict. When the checkbox
        becomes checked it records ``index`` in the global
        ``selected_index`` and refreshes the selection widgets.
    """
    def handle_checkbox_click(change):
        global selected_index

        # Only react when the box is ticked, not when it is cleared.
        if not change['new']:
            return

        selected_index = index
        # Show the description of the checkbox that fired the event.
        selected_quotation_scrollbox.value = change['owner'].description

        # `label` carries the index in its text; the commit button handler
        # reads it back from there.
        label.value = f"Checkbox {index} was checked."
        selected_quotation_index_label.value = f"Checkbox {index} was checked."

        compare_string = checkboxes[index].description
        selected_quotation_string_label.value = f" quatation string:  {compare_string}"

        # Count the quotations that equal_string_quotations_list returns for
        # this string.
        num_equal_quotations = len(equal_string_quotations_list(compare_string, quotations_list))
        num_equal_quotations_label.value = f" num equal quotations: { num_equal_quotations }"
    return handle_checkbox_click

# Attach the event handler to all checkboxes


# Give every checkbox its own handler, bound to its position in `checkboxes`.
for index, checkbox in enumerate(checkboxes):
    checkbox.observe(make_checkbox_handler(index), 'value')

# Create the label widget
quotation_label = widgets.Label("Quotation text")

# Read-only pane that the checkbox handler fills with the ticked quotation.
selected_quotation_scrollbox = widgets.Textarea(
    value='ä quotation text',
    layout=widgets.Layout(height='500px', width='400px'),
    disabled=True)

# Read-only pane holding rawText. It used to be constructed twice with
# identical arguments; one construction is enough.
rawText_scrollbox = widgets.Textarea(
    value=rawText,
    layout=widgets.Layout(height='500px', width='400px'),
    disabled=True
)
commit_button = widgets.Button(description='Junk')

# Define the event handler
def handle_commit_button_click(button):
    """Flag the currently selected quotation as junk.

    The selection is recovered from ``label.value``, which the checkbox
    handler writes as ``"Checkbox <index> was checked."``. When the label
    does not hold such a text (nothing ticked yet) or the index is out of
    range, the click is reported and ignored instead of raising.

    Args:
        button: The clicked button widget (unused).
    """
    words = label.value.split(' ')
    try:
        index = int(words[1])
    except (IndexError, ValueError):
        print("No quotation selected.")
        return

    if not 0 <= index < len(quotations_list):
        print(f"Selected index {index} is out of range.")
        return

    # NOTE(review): index is a position in `checkboxes`; confirm that
    # quotations_list is ordered the same way as the list the checkboxes
    # were built from.
    quotations_list[index].junk = True

# Attach the event handler to the commit_button
commit_button.on_click(handle_commit_button_click)

# Create the HBox and VBox layout

# Problem with the runtime loop??


#vbox_layout2 = widgets.VBox([quotation_label, quotation_junk_label, select_button, quotations_scrollbox])
#hbox_layout = widgets.HBox([vbox_layout, rawText_scrollbox])

#hbox_layout2 = widgets.HBox([vbox_layout2, rawText_scrollbox])




# Display the panel
#display(hbox_layout2)


#
#






In [None]:
# Print the file at filePath line by line.
with open(filePath, "r") as f:
    content = f.read()

# f.read() returns one string; looping over it yields single characters,
# which have no `.string` attribute. Loop over its lines instead.
for line in content.splitlines():
    print(line)


In [None]:

#quotations_list2= book_proj.unique_quotions_list


my_quotations = quotations(book_proj)
my_unique_quotations_list = book_proj.unique_quotations_list

# Fall back to an empty list so the panel can still be built when the
# attribute is missing.
unique_quotation_items = getattr(my_quotations, 'unique_quotations_list', [])

# Print the count, then one "start, end, string" line per unique quotation.
# (The listing used to be printed twice by two identical loops.)
if hasattr(my_quotations, 'unique_quotations_list'):
    print(len(unique_quotation_items))
    for quot in unique_quotation_items:
        print(f"{quot.location[0]},   {quot.location[1]},    {quot.string}")

# Scrollable, read-only text area with the quotation strings.
# str.join only accepts strings, so join each item's `.string` rather than
# the quotation objects themselves.
quotations_scrollbox = widgets.Textarea(
    value='\n\n'.join(quot.string for quot in unique_quotation_items),
    layout=widgets.Layout(height='200px'),
    disabled=True
)

# Create the label widget
quotation_label = widgets.Label("Quotation text")

# Create the scrollbox widget for rawText
rawText_scrollbox = widgets.Textarea(
    value=rawText,
    layout=widgets.Layout(height='500px', width='400px'),
    disabled=True
)
commit_button = widgets.Button(description='Junk')

# Left column: label, junk button, quotation list. Right column: raw text.
vbox_layout = widgets.VBox([quotation_label, commit_button, quotations_scrollbox])
hbox_layout = widgets.HBox([vbox_layout, rawText_scrollbox])

# Display the panel
display(hbox_layout)



# Read-only text area with all entries of quotations_list, separated by a
# line containing two spaces.
quotations_scrollbox = widgets.Textarea(
    disabled=True,
    layout=widgets.Layout(height='200px'),
    value='\n  \n'.join(quotations_list),
)

# Label that the handlers in this cell update.
quotation_label = widgets.Label("Quotation text")

def handle_click_event(change):
    """Echo a changed scrollbox value and mirror it into ``quotation_label``.

    Args:
        change: The traitlets change dict; only ``change['new']`` is read.
    """
    new_value = change['new']
    print(f"Clicked item: {new_value}")
    quotation_label.value = str(new_value)

# Run the handler whenever the value of quotations_scrollbox changes.
quotations_scrollbox.observe(handle_click_event, 'value')

# Define a function to update the label text
def update_quotation_text(change):
    """Show the current scrollbox content in ``quotation_label``.

    Args:
        change: The traitlets change dict (unused; the widget value is read
            directly).
    """
    # The handler used to read `scrollbox`, a name that is never defined, so
    # every value change raised NameError. The widget it observes is
    # quotations_scrollbox.
    quotation_label.value = f"Selected Item: {quotations_scrollbox.value}"

# Attach the event handler to the value change event of the scrollbox
quotations_scrollbox.observe(update_quotation_text, 'value')

# Create the select button
select_button = widgets.Button(description='Select')


def handle_select_button_click(button):
    """Report which entry of ``quotations_list`` contains the scrollbox text.

    Prints the scrollbox value, then the first entry of ``quotations_list``
    that contains it (or a "not found" message), then the scrollbox value
    once more.

    Args:
        button: The clicked button widget (unused).
    """
    selected_quotation = quotations_scrollbox.value
    print(f"Selected quotation: {selected_quotation}")

    # First entry that contains the selected text, or None when none does.
    selected_quote = next(
        (quote for quote in quotations_list if selected_quotation in quote),
        None,
    )

    if selected_quote is None:
        print("No matching quote found.")
    else:
        print(f"Selected quote: {selected_quote}")

    selected_quotation = quotations_scrollbox.value
    print(f"Selected quotation: {selected_quotation}")


# Run the handler on every click of the select button.
select_button.on_click(handle_select_button_click)

# Create the VBox layout with the select button and quotations scrollbox
# Left column: label, select button and quotation list; the raw text pane
# sits to its right.
vbox_layout = widgets.VBox(
    children=[quotation_label, select_button, quotations_scrollbox]
)
hbox_layout = widgets.HBox(children=[vbox_layout, rawText_scrollbox])

# Render the panel in the notebook output.
display(hbox_layout)






In [None]:
# Take the results table from the project object.
# presumably a pandas DataFrame (the next cell calls df.to_json) — confirm.
df=book_proj.df

In [None]:
# Write the results to book_proj.path_JSONL as JSON Lines (one record per
# line) for analysis and visualization.
#🚨
df.to_json(
    path_or_buf=book_proj.path_JSONL,
    lines=True,
    orient='records',
)


# Drop phrases

In [None]:
import itertools

# Tally how many times each character of the source text is quoted.

# Length of the source text, in characters.
textALength = len(rawText)
print(textALength)

# One counter per character of the text.
tally = np.zeros(textALength)

# Matched locations from the results dataset: each row is iterated as a
# sequence of [start, end] ranges.
# NOTE(review): nothing here parses the column; it is assumed to already
# hold lists rather than their string representation — confirm.
locationsInA = df['Locations in A']

# Add one to every position covered by a match. The end index is inclusive.
# The range is clamped to the text so the slice can neither wrap around via a
# negative index nor run past the end. One slice increment replaces the
# former per-character Python loop.
for article in locationsInA:
    for locRange in article:
        start = max(locRange[0], 0)
        stop = min(locRange[1] + 1, textALength)
        if stop > start:
            tally[start:stop] += 1


In [None]:
#table of tally



In [None]:
# make a visual representation of tally



In [None]:



# Drop the rows whose location list is empty.
non_empty_locations = list(filter(lambda loc: loc != [], locationsInA))

# Flatten the per-row lists into one list of location ranges.
flattened_locations = []
for sublist in non_empty_locations:
    flattened_locations.extend(sublist)

sorted_locations = sorted(flattened_locations)
print(sorted_locations)

# In the sorted list, equal ranges are adjacent: keep a range only when it
# differs from its predecessor. The first element is compared against [].
unique_locations = [
    loc
    for position, loc in enumerate(sorted_locations)
    if loc != (sorted_locations[position - 1] if position else [])
]
print(unique_locations)

print(len(unique_locations))

In [None]:
import pandas as pd

# Build a table of how often each location range occurs.

# value_counts() has to hash every entry and lists are not hashable, so each
# range is converted to a tuple first.
series = pd.Series([tuple(loc) for loc in sorted_locations])

# One row per distinct range, most frequent first.
frequency_table = series.value_counts().reset_index()

# Rename the columns
frequency_table.columns = ['Value', 'Frequency']

# Print the frequency table
print(frequency_table)


In [None]:
# Cut the text of every unique location out of rawText; the end index of a
# location is inclusive.
quotations_list = []
for loc in unique_locations:
    quotations_list.append(rawText[loc[0]:loc[1] + 1])

# Print one quotation per line.
for quotation in quotations_list:
    print(quotation)





In [None]:
# creates an external GUI window panel 


# First 100 unique quotations of the project. The window classes in this
# cell read `proj_unique_quotations_list5`, which was never defined (only the
# camelCase spelling was). Both names are provided.
# NOTE(review): switched from `bookProj.uniqueQuotationsList` to the
# `book_proj.unique_quotations_list` spelling used by the other cells —
# confirm that this is the object intended here.
proj_unique_quotations_list5 = book_proj.unique_quotations_list[0:100]
proj_uniqueQuotationsList5 = proj_unique_quotations_list5
text = book_proj.text

def main():
    """Open a 1500x1000 Tk window with one tab holding a Quotations_Window.

    Blocks in the Tk main loop until the window is closed.
    """
    root = tk.Tk()
    root.title('Scrollable radiobutton list')
    root.geometry("1500x1000")

    notebook = ttk.Notebook(root)
    notebook.pack(fill="both")

    list_tab = ttk.Frame(notebook)
    notebook.add(list_tab, text="Scrollable radiobutton list")

    # Bound to a local so the panel object lives as long as the main loop.
    quotations_window = Quotations_Window(window=list_tab)
    root.mainloop()
    
class Quotations_Window:
    """Tk panel with a quotation radiobutton list beside the project text.

    Reads the module-level names ``proj_unique_quotations_list5`` and
    ``book_proj`` when the button is clicked.
    """

    def __init__(self, window):
        """Build the widgets inside *window*.

        Args:
            window: Parent Tk widget that receives the panel's main frame.
        """
        self.main_window = window
        self.mainframe = ttk.Frame(self.main_window, padding='15 3 12 12')
        self.mainframe.grid(column=0, row=0, sticky="W, E, N, S")

        self.file_choice = tk.StringVar()
        self.contents_list = list()

        # A .grid() call on the button used to precede its creation and
        # raised AttributeError; the button is now created first.
        self.display_folder_btn = ttk.Button(self.mainframe, text="Display list of choices", width=20)
        self.display_folder_btn.grid(row=1, column=0, columnspan=2)
        self.display_folder_btn.bind("<Button-1>", self.list_folder_contents)

        self.folder_contents_canvas = tk.Canvas(self.mainframe)
        self.scroll_y = tk.Scrollbar(self.folder_contents_canvas, orient="vertical")
        self.scroll_y.pack(fill='y', side='right')
        self.folder_contents_canvas.grid(row=2, column=0, columnspan=2)
        self.folder_contents_frame = tk.Text(self.folder_contents_canvas, height=7, width=50,
                                             yscrollcommand=self.scroll_y.set)
        self.folder_contents_frame.pack(side="top", fill="x", expand=False, padx=20, pady=20)

        # The other children of mainframe are placed with grid, and Tk does
        # not allow pack and grid in the same master. The text box and its
        # scrollbar are therefore gridded to the right of the list.
        self.text_scrollbox = tk.Scrollbar(self.mainframe)
        self.text_scrollbox.grid(row=2, column=3, sticky="ns")

        self.text_box = tk.Text(self.mainframe, yscrollcommand=self.text_scrollbox.set)
        self.text_box.grid(row=2, column=2, sticky="nsew")

        self.text_scrollbox.config(command=self.text_box.yview)

    def list_folder_contents(self, event):
        """Fill the radiobutton list and the text box.

        Bound to a left click on the button. Errors are printed rather than
        raised.

        Args:
            event: The Tk event (unused).
        """
        try:
            self.contents_list = [q.string for q in proj_unique_quotations_list5]

            # Remove the radiobuttons of an earlier click so that repeated
            # clicks do not stack duplicate lists.
            for stale in self.folder_contents_frame.winfo_children():
                stale.destroy()
            self.folder_contents_frame.delete(1.0, 'end')

            # Radiobutton values are the 1-based positions as strings.
            for number, quotation in enumerate(self.contents_list, start=1):
                ttk.Radiobutton(self.folder_contents_frame, text=quotation, variable=self.file_choice,
                                value=str(number),
                                style="TRadiobutton").grid(column=0, columnspan=2, sticky=tk.W)
            self.scroll_y.config(command=self.folder_contents_frame.yview)

            self.text_box.delete(1.0, 'end')
            self.text_box.insert('end', book_proj.text)

        except Exception as exc:
            print(exc)
             
  

class Quotations_Window:
    """Tk panel with a button that lists quotations as radiobuttons.

    Reads the module-level names ``text`` (at construction) and
    ``proj_unique_quotations_list5`` (when the button is clicked).
    """

    def __init__(self, window):
        """Build the widgets inside *window*.

        Args:
            window: Parent Tk widget that receives the panel's main frame.
        """
        # Keep the module-level text on the instance.
        self.text = text
        self.main_window = window
        self.mainframe = ttk.Frame(self.main_window, padding='15 3 12 12')
        self.mainframe.grid(column=0, row=0, sticky="W, E, N, S")

        self.file_choice = tk.StringVar()
        self.contents_list = list()

        self.display_folder_btn = ttk.Button(self.mainframe, text="Display list of choices", width=20)
        self.display_folder_btn.grid(row=0, column=0, columnspan=2)
        self.display_folder_btn.bind("<Button-1>", self.list_folder_contents)

        self.folder_contents_canvas = tk.Canvas(self.mainframe)
        self.scroll_y = tk.Scrollbar(self.folder_contents_canvas, orient="vertical")
        self.scroll_y.pack(fill='y', side='right')
        # The canvas used to be gridded into row 0, the same cell as the
        # button, where it was stacked on top of the button. It now goes one
        # row below.
        self.folder_contents_canvas.grid(row=1, column=0, columnspan=2)
        self.folder_contents_frame = tk.Text(self.folder_contents_canvas, height=30, width=50,
                                             yscrollcommand=self.scroll_y.set)
        self.folder_contents_frame.pack(side="top", fill="x", expand=False, padx=20, pady=20)

        # A second frame, to the right of the list, with its own canvas.
        self.new_folder_frame = ttk.Frame(self.mainframe)
        self.new_folder_frame.grid(column=3, row=0, sticky="W, E, N, S")

        self.new_folder_canvas = tk.Canvas(self.new_folder_frame)
        self.new_folder_canvas.grid(row=2, column=0, columnspan=2)

    def list_folder_contents(self, event):
        """Fill the radiobutton list with the quotation strings.

        Bound to a left click on the button. Errors are printed rather than
        raised.

        Args:
            event: The Tk event (unused).
        """
        try:
            self.contents_list = [q.string for q in proj_unique_quotations_list5[0:100]]

            # Remove the radiobuttons of an earlier click so that repeated
            # clicks do not stack duplicate lists.
            for stale in self.folder_contents_frame.winfo_children():
                stale.destroy()
            self.folder_contents_frame.delete(1.0, 'end')

            # Radiobutton values are the 1-based positions as strings.
            for number, quotation in enumerate(self.contents_list, start=1):
                ttk.Radiobutton(self.folder_contents_frame, text=quotation, variable=self.file_choice,
                                value=str(number),
                                style="TRadiobutton").grid(column=0, columnspan=2, sticky=tk.W)
            self.scroll_y.config(command=self.folder_contents_frame.yview)

        except Exception as exc:
            print(exc)
            
    
     
# Open the window when this code runs as the main module; main() does not
# return until its Tk main loop ends.
if __name__ == '__main__':
    main()

In [None]:
# Show the length of the project's quotation list.
# NOTE(review): the attribute is spelled `unique_quotions_list` here but
# `unique_quotations_list` in other cells — confirm which one book_proj has.
len(book_proj.unique_quotions_list)



In [None]:
# Display the project's quotation list as the cell output.
# NOTE(review): the attribute is spelled `unique_quotions_list` here but
# `unique_quotations_list` in other cells — confirm which one book_proj has.
book_proj.unique_quotions_list

In [None]:
# Print the `.string` of every quotation in the project's list.
# NOTE(review): the attribute is spelled `unique_quotions_list` here but
# `unique_quotations_list` in other cells — confirm which one book_proj has.
proj_quotations=book_proj.unique_quotions_list
print([quot.string for quot in proj_quotations])

In [None]:
# First 100 entries of the project's unique quotations; read as a
# module-level name by the window class defined in this cell.
proj_quotations=book_proj.unique_quotations_list[0:100]

def main():
    """Open a 500x600 Tk window with one tab holding a Quotations_Window.

    Blocks in the Tk main loop until the window is closed.
    """
    root = tk.Tk()
    root.title('Scrollable radiobutton list')
    root.geometry("500x600")

    notebook = ttk.Notebook(root)
    notebook.pack(fill="both")

    list_tab = ttk.Frame(notebook)
    notebook.add(list_tab, text="Scrollable radiobutton list")

    # Bound to a local so the panel object lives as long as the main loop.
    quotations_window = Quotations_Window(window=list_tab)
    root.mainloop()

class Quotations_Window:
    """Tk panel with a quotation radiobutton list and a text area beside it.

    Reads the module-level ``proj_quotations`` when the button is clicked.
    """

    def __init__(self, window):
        """Build the widgets inside *window*.

        Args:
            window: Parent Tk widget that receives the panel's main frame.
        """
        self.main_window = window
        self.mainframe = ttk.Frame(self.main_window, padding='15 3 12 12')
        self.mainframe.grid(column=0, row=0, sticky="W, E, N, S")

        self.file_choice = tk.StringVar()
        self.contents_list = list()

        self.display_folder_btn = ttk.Button(self.mainframe, text="Display list of choices", width=20)
        self.display_folder_btn.grid(row=1, column=0, columnspan=2)
        self.display_folder_btn.bind("<Button-1>", self.list_folder_contents)

        self.folder_contents_canvas = tk.Canvas(self.mainframe)
        self.scroll_y = tk.Scrollbar(self.folder_contents_canvas, orient="vertical")
        self.scroll_y.pack(fill='y', side='right')
        self.folder_contents_canvas.grid(row=2, column=0, columnspan=2)
        self.folder_contents_frame = tk.Text(self.folder_contents_canvas, height=7, width=50,
                                             yscrollcommand=self.scroll_y.set)
        self.folder_contents_frame.pack(side="top", fill="x", expand=False, padx=20, pady=20)

        # Text area with its own scrollbar, to the right of the list.
        self.text_scrollbox = tk.Scrollbar(self.mainframe)
        self.text_scrollbox.grid(row=2, column=3, sticky="NS")
        self.text_area = tk.Text(self.mainframe, height=7, width=50, yscrollcommand=self.text_scrollbox.set)
        self.text_area.grid(row=2, column=2, padx=20, pady=20)
        self.text_scrollbox.config(command=self.text_area.yview)

    def list_folder_contents(self, event):
        """Fill the radiobutton list with the quotation strings.

        Bound to a left click on the button. Errors are printed rather than
        raised.

        Args:
            event: The Tk event (unused).
        """
        try:
            # proj_quotations is assigned from a slice of the quotation list,
            # so it is iterated directly. The former
            # `proj_quotations.unique_quotations_list` lookup raised
            # AttributeError and the list never appeared.
            self.contents_list = [q.string for q in proj_quotations]

            # Remove the radiobuttons of an earlier click so that repeated
            # clicks do not stack duplicate lists.
            for stale in self.folder_contents_frame.winfo_children():
                stale.destroy()
            self.folder_contents_frame.delete(1.0, 'end')

            # Radiobutton values are the 1-based positions as strings.
            for number, quotation in enumerate(self.contents_list, start=1):
                ttk.Radiobutton(self.folder_contents_frame, text=quotation, variable=self.file_choice,
                                value=str(number),
                                style="TRadiobutton").grid(column=0, columnspan=2, sticky=tk.W)
            self.scroll_y.config(command=self.folder_contents_frame.yview)

        except Exception as exc:
            print(exc)


#-----------------------------------------


In [None]:
# First 100 entries of the project's unique quotations; read as a
# module-level name by the window class defined in this cell.
proj_quotations=book_proj.unique_quotations_list[0:100]

def main():
    """Open a 1500x1000 Tk window with two stacked ttk notebooks.

    The upper notebook holds the frame with the Quotations_Window; the lower
    notebook holds a frame in which a label is created but not placed. Each
    frame is passed to ``add`` twice. Blocks in the Tk main loop until the
    window is closed.
    """
    root = tk.Tk()
    root.title('Scrollable radiobutton list')
    root.geometry("1500x1000")

    # Upper notebook: the same frame is added under two successive titles.
    upper_notebook = ttk.Notebook(root)
    upper_notebook.pack(fill="both")
    list_tab = ttk.Frame(upper_notebook)
    for tab_title in ("Scrollable radiobutton list", "second Scrollable radiobutton list"):
        upper_notebook.add(list_tab, text=tab_title)

    # Bound to a local so the panel object lives as long as the main loop.
    quotations_window = Quotations_Window(window=list_tab)

    # Lower notebook: one frame, added twice under the same title.
    lower_notebook = ttk.Notebook(root)
    lower_notebook.pack(fill="both")
    extra_tab = ttk.Frame(lower_notebook)
    unplaced_label = tk.Label(extra_tab, text="My Label")

    for _ in range(2):
        lower_notebook.add(extra_tab, text="my list")

    root.mainloop()
    
class Quotations_Window:
    """Tk panel with a button that lists quotations as radiobuttons.

    Reads the module-level ``proj_quotations`` when the button is clicked.
    """

    def __init__(self, window):
        """Build the widgets inside *window*.

        Args:
            window: Parent Tk widget that receives the panel's main frame.
        """
        self.main_window = window

        frame = ttk.Frame(window, padding='15 3 12 12')
        frame.grid(column=0, row=0, sticky="W, E, N, S")
        self.mainframe = frame

        self.file_choice = tk.StringVar()
        self.contents_list = []

        # Button whose left click fills the list.
        button = ttk.Button(frame, text="Display list of choices", width=20)
        button.grid(row=1, column=0, columnspan=2)
        button.bind("<Button-1>", self.list_folder_contents)
        self.display_folder_btn = button

        # Canvas holding the scrollbar and the text widget that hosts the
        # radiobuttons.
        canvas = tk.Canvas(frame)
        scrollbar = tk.Scrollbar(canvas, orient="vertical")
        scrollbar.pack(fill='y', side='right')
        canvas.grid(row=4, column=0, columnspan=2)
        self.folder_contents_canvas = canvas
        self.scroll_y = scrollbar

        listing = tk.Text(canvas, height=7, width=50, yscrollcommand=scrollbar.set)
        listing.pack(side="top", fill="x", expand=False, padx=20, pady=20)
        self.folder_contents_frame = listing

        #self.text_scrollbox = tk.Scrollbar(self.mainframe)
        #self.text_scrollbox.grid(row=2, column=3, sticky="NS")
        #self.text_area = tk.Text(self.mainframe, height=7, width=50, yscrollcommand=self.text_scrollbox.set)
        #self.text_area.grid(row=2, column=2, padx=20, pady=20)
        #self.text_scrollbox.config(command=self.text_area.yview)

        #self.text_area.insert(tk.END, book_proj.text)

    def list_folder_contents(self, event):
        """Fill the radiobutton list with the quotation strings.

        Bound to a left click on the button. Errors are printed rather than
        raised.

        Args:
            event: The Tk event (unused).
        """
        try:
            self.contents_list = [quotation.string for quotation in proj_quotations]
            listing = self.folder_contents_frame
            listing.delete(1.0, 'end')

            # Radiobutton values are the 1-based positions as strings.
            for number, label_text in enumerate(self.contents_list, start=1):
                choice = ttk.Radiobutton(
                    listing,
                    text=label_text,
                    variable=self.file_choice,
                    value=str(number),
                    style="TRadiobutton",
                )
                choice.grid(column=0, columnspan=2, sticky=tk.W)

            self.scroll_y.config(command=listing.yview)
        except Exception as exc:
            print(exc)
   


# Open the window when this code runs as the main module; main() does not
# return until its Tk main loop ends.
if __name__ == '__main__':
    main()

In [None]:
# how many times is quotation quoted?
from tkinter import scrolledtext

# First five entries of the project's unique quotations, and the project
# text. Both are read as module-level names by the window classes in the
# next cell.
proj_quotations=book_proj.unique_quotations_list[0:5]
text= book_proj.text


In [None]:


def main():
    """Open a 1500x1000 Tk window with the quotation list and the text.

    The radiobutton panel and two scrolled text boxes are gridded directly
    into the root window; the text panel and two labels live in
    ``root.mainframe``. Blocks in the Tk main loop until the window is
    closed.
    """
    root = tk.Tk()
    root.title('Scrollable radiobutton list')
    root.geometry("1500x1000")

    root.mainframe = ttk.Frame(root, padding='15 3 12 12')
    # The frame was never handed to a geometry manager, so nothing built
    # inside it (text panel, labels) could appear. Place it next to the
    # radiobutton panel.
    root.mainframe.grid(row=0, column=1, sticky="nsew")
    root.mainframe.rowconfigure(0, weight=1)
    root.mainframe.rowconfigure(1, weight=1)
    root.mainframe.columnconfigure(0, weight=1)
    root.mainframe.columnconfigure(1, weight=1)

    scrollable_radiobutton_list_frame = ttk.Frame(root)
    scrollable_radiobutton_list_frame.grid(row=0, column=0, sticky="e")

    scrollable_text_frame = ttk.Frame(root.mainframe)
    scrollable_text_frame.grid(row=0, column=1, sticky="w")

    my_text_frame = ttk.Frame(root.mainframe)
    my_text_frame.grid(row=0, column=1, sticky="w")

    # Bound to locals so the panel objects live as long as the main loop.
    my_checker = Quotations_Window(window=scrollable_radiobutton_list_frame)
    my_text = Text_Window(window=my_text_frame)

    label2 = tk.Label(root.mainframe, text="SourcA ")
    label2.grid(row=1, column=0)

    label3 = tk.Label(root.mainframe, text="something")
    label3.grid(row=1, column=1)

    # Scrolled box pre-filled with the project text, and an empty one.
    st1 = scrolledtext.ScrolledText(root, width=30, height=10)
    st1.insert('end', book_proj.text)
    st1.grid(row=2, column=0)

    st2 = scrolledtext.ScrolledText(root, width=30, height=10)
    st2.grid(row=2, column=6)

    root.mainloop()

    
class Quotations_Window:
    """Tk panel listing quotations as radiobuttons, with a junk button.

    Reads the module-level ``proj_quotations`` both at construction and when
    the "Display list" button is clicked.
    """

    def junk(self, event):
        """Handle a left click on the "Dispel junk phrase" button.

        Prints ``dir()`` of the value currently held by ``file_choice``.
        NOTE(review): this looks like a debugging stub; presumably it should
        act on the selected quotation — confirm the intended behavior.

        Args:
            event: The Tk event (unused).
        """
        print(dir(self.file_choice.get()))
        return

    def __init__(self, window):
        """Build the widgets inside *window* and fill the list once.

        Args:
            window: Parent Tk widget that receives the buttons and the
                panel's main frame.
        """
        self.main_window = window
        self.mainframe = ttk.Frame(window, padding='15 3 12 12')
        self.mainframe.rowconfigure(0, weight=1)
        self.mainframe.rowconfigure(1, weight=1)
        self.mainframe.columnconfigure(0, weight=1)
        self.mainframe.columnconfigure(1, weight=1)

        self.mainframe.grid(column=0, row=0, sticky="w")

        self.file_choice = tk.StringVar()
        self.contents_list = list()

        self.display_folder_btn = ttk.Button(window,
                                             text="Display list of choices (click a radiobutton)",
                                             width=40)
        self.display_folder_btn.grid(row=1, column=0, columnspan=1)
        self.display_folder_btn.bind("<Button-1>", self.list_folder_contents)

        self.display_folder_btn2 = ttk.Button(window, text="Dispel junk phrase", width=20)
        self.display_folder_btn2.grid(row=1, column=2, columnspan=1)
        self.display_folder_btn2.bind("<Button-1>", self.junk)

        self.folder_contents_canvas = tk.Canvas(self.mainframe)
        self.scroll_y = tk.Scrollbar(self.folder_contents_canvas, orient="vertical")
        self.scroll_y.pack(fill='y', side='right')
        self.folder_contents_canvas.grid(row=4, column=0, columnspan=2)
        self.folder_contents_frame = tk.Text(self.folder_contents_canvas, width=50, height=10,
                                             yscrollcommand=self.scroll_y.set)

        self.folder_contents_frame.pack(side="bottom", fill="x", expand=False, padx=20, pady=20)

        self._populate_choices()

    def _populate_choices(self):
        """Rebuild the radiobutton list from ``proj_quotations``."""
        self.contents_list = [q.string for q in proj_quotations]

        # Remove the radiobuttons of an earlier call. The constructor already
        # fills the list, so without this the first button click doubled
        # every entry.
        for stale in self.folder_contents_frame.winfo_children():
            stale.destroy()
        self.folder_contents_frame.delete(1.0, 'end')

        # Radiobutton values are the 1-based positions as strings.
        for number, quotation in enumerate(self.contents_list, start=1):
            ttk.Radiobutton(self.folder_contents_frame, text=quotation, variable=self.file_choice,
                            value=str(number),
                            style="TRadiobutton").grid(column=0, columnspan=1, sticky="w")
        self.scroll_y.config(command=self.folder_contents_frame.yview)

    def list_folder_contents(self, event):
        """Refresh the radiobutton list.

        Bound to a left click on the "Display list" button. Errors are
        printed rather than raised.

        Args:
            event: The Tk event (unused).
        """
        try:
            self._populate_choices()
        except Exception as exc:
            print(exc)
 
class Text_Window:
    """Tk panel that shows the module-level ``text`` in a text widget."""

    def __init__(self, window):
        """Build the widgets inside *window* and insert the text.

        Args:
            window: Parent Tk widget that receives the button, the canvas
                and the panel's main frame.
        """
        self.main_window = window

        frame = ttk.Frame(window, padding='15 3 12 12')
        # Rows 0-1 and columns 0-1 of the frame all get weight 1.
        for row_index in (0, 1):
            frame.rowconfigure(row_index, weight=1)
        for column_index in (0, 1):
            frame.columnconfigure(column_index, weight=1)
        frame.grid(column=0, row=0, sticky="W, E")
        self.mainframe = frame

        self.file_choice = tk.StringVar()
        self.contents_list = []

        # The button has no click handler attached.
        button = ttk.Button(window, text="Display text", width=20)
        button.grid(row=1, column=0, columnspan=2)
        self.display_folder_btn = button

        # Canvas holding the scrollbar and the text widget.
        canvas = tk.Canvas(window)
        scrollbar = tk.Scrollbar(canvas, orient="vertical")
        scrollbar.pack(fill='y', side='right')
        canvas.grid(row=0, column=0, columnspan=2)
        self.folder_contents_canvas = canvas
        self.scroll_y = scrollbar

        text_widget = tk.Text(canvas, height=50, width=150, yscrollcommand=scrollbar.set)
        text_widget.pack(side="top", fill="x", expand=False, padx=20, pady=20)
        self.folder_contents_frame = text_widget

        self.contents_list = text

        scrollbar.config(command=text_widget.yview)

        # Replace whatever the widget holds with the text.
        text_widget.delete('1.0', 'end')
        text_widget.insert('end', text)

 
      


# Open the window when this code runs as the main module; main() does not
# return until its Tk main loop ends.
if __name__ == '__main__':
    main()