In [1]:
import json
import pickle
import string
from collections import defaultdict

import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

from utils.strings import ENG_CORPUS_PATH, ENG_TRAIN_PATH, ENG_VALID_PATH, ENG_TEST_PATH, ENG_POEMS_PATH
from utils.util import read_file, get_glove

In [62]:
with open(f"{ENG_POEMS_PATH}", 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [15]:
lines_limit = 10
special_chars = set()


num_topics = 0
topics_dist = defaultdict(int)

for topic, poems in tqdm(data.items()):
    num_topics += 1

    for poem in poems:
        topics_dist[topic] += 1
        lines = poem.split('\n')

        for line in lines:
            if line.isspace():
                continue
            for char in line:
                if not (char in string.ascii_letters or char.isspace()):
                    special_chars.add(char) 

print(f"Number of topics: {num_topics}")
print(f"Topics dist:")
for topic, count in topics_dist.items():
    print(f"Topic: {topic} - Number: {count}")

special_chars = "".join(special_chars)
print("="*50)
print(f"Special characters:\n{special_chars}")

100%|██████████| 144/144 [00:02<00:00, 54.59it/s]

Number of topics: 144
Topics dist:
Topic: alone - Number: 100
Topic: america - Number: 100
Topic: angel - Number: 3722
Topic: anger - Number: 100
Topic: animal - Number: 100
Topic: baby - Number: 100
Topic: beach - Number: 100
Topic: beautiful - Number: 101
Topic: beauty - Number: 1536
Topic: believe - Number: 100
Topic: birth - Number: 100
Topic: brother - Number: 100
Topic: butterfly - Number: 1252
Topic: car - Number: 100
Topic: carpe diem - Number: 97
Topic: change - Number: 100
Topic: chicago - Number: 6627
Topic: childhood - Number: 100
Topic: children - Number: 100
Topic: christmas - Number: 100
Topic: cinderella - Number: 100
Topic: city - Number: 100
Topic: courage - Number: 100
Topic: crazy - Number: 100
Topic: culture - Number: 100
Topic: dance - Number: 100
Topic: dark - Number: 100
Topic: daughter - Number: 100
Topic: death - Number: 100
Topic: depression - Number: 100
Topic: despair - Number: 100
Topic: destiny - Number: 100
Topic: dream - Number: 100
Topic: evil - Number




In [63]:
poem_id = 0
clean_poems = {}

for topic, poems in tqdm(data.items()):
    for poem in poems:
        clean_poem = ""
        
        for char in poem:
            if char not in special_chars:
                clean_poem += char
        
        poem_lines = []
        for line in clean_poem.split('\n'):
            if len(line) == 0:
                continue
            tokens = [token for token in line.split(' ') if token.isalpha()]

            if len(tokens) < 3 or len(tokens) > 99:
                continue
            poem_lines.append(line)
        
        if len(poem_lines) < 3:
            continue

        clean_poem = '\n'.join(poem_lines)
        
        clean_poems[f"poem_{poem_id}"] = {"topic": topic, "poem": clean_poem.lower()}
        poem_id += 1

100%|██████████| 144/144 [00:09<00:00, 15.67it/s]


In [72]:
with open("clean_poems.json", "w+") as json_file:
    json.dump(clean_poems, json_file, indent=4)

len(clean_poems)

26442

In [71]:
line_len_dist = defaultdict(int)
num_lines_dist = defaultdict(int)

for poem_id, data in clean_poems.items():
    lines = data["poem"].split('\n')

    num_lines_dist[len(lines)] += 1

    for line in lines:
        line_len_dist[len(line.split(' '))] += 1
        if 3 < len(line.split(' ')) < 5:
            print(line)


In [65]:
print(sorted(line_len_dist))

for line_len in line_len_dist.keys():
    if line_len < 5 or line_len > 100:
        print(f"{line_len}: {line_len_dist[line_len]}")

print(f"100: {line_len_dist[100]}")

[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 104, 107, 109, 114, 128, 294, 390, 654, 940]
3: 37447
4: 74005
104: 1
128: 1
114: 1
940: 1
654: 1
294: 1
390: 1
109: 1
107: 1
100: 2


In [70]:
for poem_len in sorted(num_lines_dist):
    print(f"{poem_len}: {num_lines_dist[poem_len]}")

3: 240
4: 384
5: 318
6: 2324
7: 248
8: 677
9: 380
10: 3541
11: 312
12: 1340
13: 438
14: 1213
15: 454
16: 1089
17: 2544
18: 1034
19: 306
20: 827
21: 310
22: 583
23: 212
24: 1029
25: 207
26: 190
27: 490
28: 320
29: 208
30: 194
31: 156
32: 1806
33: 119
34: 114
35: 117
36: 526
37: 69
38: 57
39: 67
40: 235
41: 45
42: 67
43: 38
44: 92
45: 39
46: 33
47: 29
48: 96
49: 50
50: 30
51: 26
52: 47
53: 23
54: 38
55: 18
56: 55
57: 87
58: 23
59: 15
60: 37
61: 14
62: 13
63: 16
64: 99
65: 22
66: 15
67: 13
68: 35
69: 16
70: 11
71: 15
72: 26
73: 12
74: 11
75: 10
76: 13
77: 8
78: 8
79: 6
80: 20
81: 15
82: 9
83: 4
84: 10
85: 5
86: 9
87: 4
88: 13
89: 4
90: 6
91: 5
92: 11
93: 7
94: 2
95: 1
96: 11
97: 1
98: 5
99: 6
100: 5
101: 2
102: 4
103: 3
104: 8
105: 2
106: 2
107: 4
108: 5
109: 2
110: 8
111: 1
112: 9
113: 5
114: 5
115: 3
117: 4
118: 2
119: 8
120: 3
121: 3
122: 6
123: 2
124: 1
125: 2
126: 3
127: 1
128: 3
130: 4
132: 1
135: 1
136: 2
138: 3
140: 4
141: 4
142: 2
143: 1
144: 4
145: 1
147: 1
149: 1
150: 3
151: 3


In [73]:
with open("eng_dataset.json", 'r') as json_file:
    eng_data = json.load(json_file)

In [79]:
counter = 0
for _, value in eng_data.items():
    if len(value["keywords"]) < 3:
        counter += 1

print(counter)
print(len(eng_data))

2419
48693


In [5]:
vocab = get_glove("50d")

In [6]:
vocab