In [None]:
import collections
import matplotlib.pyplot as plt
import sqlite3
import tqdm

from wordcloud import WordCloud, STOPWORDS

def plot_cloud(wordcloud):
    """Show a word cloud image on a fresh 40x30 figure with the axes hidden.

    Args:
        wordcloud: the image-like object handed straight to ``plt.imshow``.
    """
    figure_size = (40, 30)
    plt.figure(figsize=figure_size)
    plt.imshow(wordcloud)
    # A word cloud has no meaningful axes, so hide ticks and frame.
    plt.axis("off")
    
class ReadStatus(object):
    """String constants naming the reading state of a paper."""

    READ = 'read'
    UNREAD = 'unread'
    NEEDS_CLEAN = 'needs-clean'
    UNKNOWN = 'unknown'

    # Statuses that are recognised when they appear among a paper's tags.
    VALID = [READ, UNREAD, NEEDS_CLEAN]
    # Every status, i.e. the recognised ones plus the UNKNOWN fallback.
    ALL = VALID + [UNKNOWN]

In [None]:
# Open the Zotero SQLite database. `cur` is the single cursor reused by the
# cells below.
con = sqlite3.connect('/Users/nnayak/Zotero/zotero.sqlite')
cur = con.cursor()

# `fields` maps the first column of every `fields` row to its second column
# (presumably field id -> field name -- confirm against the schema); the
# third column is ignored. `rev_fields` is the reverse lookup.
cur.execute('''SELECT * FROM fields;''')
fields = dict(
    (field_id, field_name) for (field_id, field_name, _) in cur.fetchall())
rev_fields = {field_name: field_id for field_id, field_name in fields.items()}

# Same pair of lookups for the two-column `tags` table.
cur.execute('''SELECT * FROM tags;''')
tags = dict((tag_id, tag_name) for (tag_id, tag_name) in cur.fetchall())
rev_tags = {tag_name: tag_id for tag_id, tag_name in tags.items()}

In [None]:
# item id -> list of tag names attached to that item; filled by the loop over
# the itemTags rows later in this cell.
tag_map = collections.defaultdict(list)

# Issue the itemTags query now; its rows are consumed by the `cur.fetchall()`
# loop further down, after the helper definitions.
cur.execute('''SELECT * FROM itemTags;''')

# NOTE: a `Paper` namedtuple used to be bound here, but the `class Paper`
# defined immediately below rebinds the same name before anything uses it,
# so the dead definition has been dropped.

class Paper(object):
    """A library item with its title, month added, tags and reading status."""

    def __init__(self, paper_id, title, add_month):
        """Create a paper with no tags yet; it stays invalid until set_tags()."""
        self.paper_id = paper_id
        self.title = title
        self.add_month = add_month
        self.tags = []
        self.read_status = ReadStatus.UNKNOWN
        self.valid = False

    def set_tags(self, tags):
        """Sort `tags` into the read status and ordinary tags, then mark valid.

        A tag listed in ReadStatus.VALID overwrites `read_status` (the last
        such tag wins); every other tag is appended to `self.tags`.
        """
        for tag in tags:
            if tag not in ReadStatus.VALID:
                self.tags.append(tag)
                continue
            self.read_status = tag
        self.valid = True

    def to_dict(self):
        """Return the paper as a plain dict, or None if set_tags() never ran.

        Asserts that the paper carries at least one ordinary (non-status) tag.
        """
        if not self.valid:
            return None
        assert len(self.tags)
        exported = ("paper_id", "tags", "add_month", "title", "read_status")
        return {attr: getattr(self, attr) for attr in exported}
    
def get_month_wordcloud(month_papers):
    """Draw a word cloud built from every tag on the given papers.

    Args:
        month_papers: iterable of objects exposing a ``tags`` list of strings.

    Returns:
        None. If no paper carries any tag the function returns before any
        figure is created; otherwise the cloud is drawn via ``plot_cloud``.
    """
    # Flatten in a single pass. sum(list_of_lists, []) builds a new list on
    # every addition, which is quadratic in the total number of tags.
    all_tags = [tag for paper in month_papers for tag in paper.tags]
    if not all_tags:
        return

    # The cloud is generated from one space-joined string of all tags.
    cloud = WordCloud(
        width=3000,
        height=2000,
        random_state=1,
        background_color='salmon',
        colormap='Pastel1',
        collocations=False,
        stopwords=[],
    ).generate(" ".join(all_tags))
    plot_cloud(cloud)
        
        
def get_some_value(cursor, item_id, field_id):
    """Return the stored value of one field for one item, or None.

    The lookup takes two steps: ``itemData`` maps (itemID, fieldID) to a
    valueID, and ``itemDataValues`` maps that valueID to the actual value.

    Args:
        cursor: database cursor used for both queries.
        item_id: itemID to look up.
        field_id: fieldID to look up.

    Returns:
        The ``value`` column of the first matching row, or None when either
        step finds no row.
    """
    # Use the cursor that was passed in (the previous version silently used
    # the module-level `cur`), and bind every value as a parameter instead of
    # formatting it into the SQL text.
    cursor.execute(
        "SELECT valueID FROM itemData WHERE itemID = ? AND fieldID = ?",
        (item_id, field_id))
    row = cursor.fetchone()
    if row is None:
        return None
    (value_id,) = row

    cursor.execute(
        "SELECT value FROM itemDataValues WHERE valueID = ?", (value_id,))
    row = cursor.fetchone()
    if row is None:
        return None
    return row[0]

def get_add_month(cur, item_id):
    """Return the first seven characters of an item's ``dateAdded`` value.

    Presumably this is the ``YYYY-MM`` prefix of a timestamp string -- confirm
    against the stored format.

    Args:
        cur: database cursor used to run the query.
        item_id: itemID of the row to read from ``items``.

    Raises:
        AssertionError: if the query does not return exactly one row.
    """
    query = "SELECT dateAdded FROM items WHERE itemID = ? "
    cur.execute(query, (item_id,))
    matches = cur.fetchall()
    assert len(matches) == 1
    date_added = matches[0][0]
    return date_added[:7]
         
# item id -> Paper, one entry per item that appears in itemTags.
paper_map = {}

# cur.fetchall() returns the rows of the itemTags query issued at the top of
# this cell as a list, so the helper queries below can safely reuse `cur`
# while we iterate. The third column is unused; it is named `_unused` rather
# than `_` so the loop does not shadow IPython's last-output variable.
for item_id, tag_id, _unused in cur.fetchall():
    # Only query title / dateAdded the first time an item is seen. The old
    # loop ran both lookups for every (item, tag) row and discarded the result
    # for items that were already in paper_map.
    if item_id not in paper_map:
        title = get_some_value(cur, item_id, rev_fields['title'])
        add_month = get_add_month(cur, item_id)
        paper_map[item_id] = Paper(item_id, title, add_month)
    tag_map[item_id].append(tags[tag_id])

# Hand each paper its collected tag names; set_tags() separates read-status
# tags from ordinary ones.
for paper_id, paper_obj in paper_map.items():
    paper_obj.set_tags(tag_map[paper_id])

# Group papers by the month they were added.
papers_by_month = collections.defaultdict(list)
for paper in paper_map.values():
    papers_by_month[paper.add_month].append(paper)

In [None]:
# Write one "<month>.png" word cloud per month, in chronological key order.
for month in tqdm.tqdm(sorted(papers_by_month)):
    papers = papers_by_month[month]
    # get_month_wordcloud() returns without drawing when no paper has a tag.
    # Saving anyway would write whatever figure happens to be current (the
    # previous month's cloud, or a blank one) under this month's name.
    if not any(paper.tags for paper in papers):
        continue
    get_month_wordcloud(papers)
    plt.savefig(month + ".png")
    # Each cloud lives on a 40x30 figure; close it once saved so figures do
    # not pile up in memory across months. The PNG on disk is the output, so
    # the figures are no longer kept open for inline display.
    plt.close()