# Kindle Clippings Parser

Simple Script to Parse Kindle Highlights. 

Configure path to your file. Script extracts highlights into a CSV export. 

For **data analysis and some data visualization** of your Amazon Kindle clippings, see: [kindle_clippings_data_analysis.ipynb](https://github.com/markwk/qs_ledger/blob/master/kindle/kindle_clippings_data_analysis.ipynb)

---

In [1]:
# set path to your clippings file
my_path = "data/My Clippings.txt"
# my_path = "data/My Clippings - Kindle1.txt"

-----

In [2]:
# dependencies
from datetime import date, datetime as dt, timedelta as td
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

----

In [3]:
# function to parse and extract kindle highlights from clippings.txt file
# TODO: Add Reference to Adapted Code
def extract_clipping(data):

    meta0 = data[0]
    br_pts0 = [i for i, x in enumerate(meta0) if x == '(']
    if len(br_pts0) == 0:
        book_title = meta0
        author = 'Unknown'
    else:
        br0 = br_pts0[-1]
        book_title = meta0[:br0]
        author = meta0[br0+1:-1]

        if ',' in author:
            parts = author.split(',')
            author = parts[1][1:] + ' ' + parts[0]

        if ';' in author:
            author = author.split(';')

    meta1 = data[1].split('|')
    if len(meta1) == 3:
        num_pages = meta1[0][meta1[0].find('p') + 5:-1]
        location = meta1[1][1:-1]
        br1 = meta1[2].find(',')
        entry_date = dt.strptime(meta1[2][br1+2:], '%B %d, %Y %I:%M:%S %p').strftime("%Y-%m-%d %H:%M:%S")
    elif len(meta1) == 2:
        num_pages = 'Unknown'
        location = meta1[0][meta1[0].find('on') + 3:-1]
        br1 = meta1[1].find(',')
        entry_date = dt.strptime(meta1[1][br1+2:], '%B %d, %Y %I:%M:%S %p').strftime("%Y-%m-%d %H:%M:%S")

    highlight = data[3]
        
    return {
            'book_title': book_title,
            'author': author,
            'num_pages': num_pages,
            'location': location,
            'entry_date': entry_date,
            'highlight': highlight
        }

----

## Parse and Export Kindle Clippings to CSV

In [4]:
# split clippings.txt file into relevant sections
with open(my_path, "r") as file:
    everything = file.read()
    everything = everything.replace('\ufeff', '')
    sections = everything.split('==========\n')
    
# print(len(sections))
    
clippings_list = []
for i, clip in enumerate(sections[:-1]):
    data = clip.split('\n')
    extract = extract_clipping(data)
    clippings_list.append(extract)

# len(clippings_list)

# create dataframe
clippings = pd.DataFrame(clippings_list)

In [5]:
# export to csv
clippings.to_csv("data/clippings.csv", index=None, encoding='utf-8')

----

## Export Notes from Specific Book

In [6]:
my_clippings = pd.read_csv("data/clippings.csv")

In [7]:
my_clippings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 988 entries, 0 to 987
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   book_title  988 non-null    object
 1   author      988 non-null    object
 2   num_pages   988 non-null    object
 3   location    988 non-null    object
 4   entry_date  988 non-null    object
 5   highlight   983 non-null    object
dtypes: object(6)
memory usage: 46.4+ KB


In [8]:
my_clippings.columns = ['author', 'book_title', 'timestamp', 'highlight', 'location',
       'num_pages']

In [9]:
def print_book(title):
    book_notes = my_clippings[my_clippings['book_title'] == title]
    print(book_notes.iloc[0]['book_title'])
    print("by " + book_notes.iloc[0]['author'])
    print("---------------------------")
    for index, row in book_notes.iterrows():
        print(row['highlight'])
        print("pg: " + row['num_pages'] + " | " + row['location'] + " | " + str(row['timestamp']))
        print("")

In [10]:
book_titles = my_clippings['book_title'].unique()
print("Latest Books with Highlights: ")
for i in book_titles[-15:]:
    print(i)

Latest Books with Highlights: 
Unknown
W.E.B. Du Bois
Jonathan Weiner
Justin Jackson
Jon Meacham
Safi Bahcall
Eric Topol
Dale Bredesen
G. Polya
['Robert Gottlieb', 'Simon Ng']
Kai-Fu Lee
Alex Hutchinson
Susan Brownmiller
Leonardo Gabrielli
Matthew B. Crawford


In [11]:
# Display Individual Book's Clippings
# Uncomment and configure to title of 
# print_book("Handmaid's Tale ")