In [2]:
from names_dataset import NameDataset
from pathlib import Path
import re
from nltk.corpus import stopwords
from difflib import SequenceMatcher
from time import time
import json
# import nltk
# nltk.download('stopwords')

In [7]:
# Root folder (relative to the current working directory) that holds one sub-folder per episode.
data_path = Path.cwd() / "DATA_FILES"
# NLTK's English stop-word list as a set; filter_sentence removes these words from every comment.
english_stop_words = {*stopwords.words('english')}

In [10]:
def filter_sentence(line: str, debug_print: bool = False, stop_words=None) -> set:
    """Reduce one raw comment line to the set of distinct, lower-cased words worth checking.

    Args:
        line: Raw comment text; may contain HTML markup, digits, punctuation and emoji.
        debug_print: When True, print the stripped input and the resulting set.
        stop_words: Iterable of words to drop from the result. Defaults to the
            module-level ``english_stop_words`` set.

    Returns:
        A set of lower-case, ASCII-letter-only words. It is empty when nothing
        survives filtering (it never contains the empty string).
    """
    if debug_print:
        print(line.strip())
    if stop_words is None:
        stop_words = english_stop_words

    # Drop HTML markup. Note that the pattern also removes the text enclosed
    # between an opening and a closing tag (e.g. the label of an <a> link).
    txt = re.sub(r"(<([^>])*([^<])*([^>])*>)", "", line)
    # Everything that is not an ASCII letter or a space becomes a separator.
    txt = re.sub(r"([^a-zA-Z ])", " ", txt)
    # Remove one-letter words.
    txt = re.sub(r'\b\w{1,1}\b', '', txt)
    # Remove words of 70 or more characters.
    txt = re.sub(r'\b\w{70,}\b', '', txt)

    # split() with no argument collapses runs of spaces and yields [] for blank
    # text, so no empty-string "word" can end up in the result.
    set_of_text = set(txt.lower().split())
    set_of_text.difference_update(stop_words)
    if debug_print:
        print(set_of_text)
    return set_of_text
filter_sentence('Is it me or does it feel weird, being  A watcher of 100 baffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffby challenge but it’s in season 32 seasons after this yeah this is weird <a href="http://www.youtube.com/results?search_query=%23gardensalad">#GardenSalad</a>🥗')

{'challenge', 'feel', 'season', 'seasons', 'watcher', 'weird', 'yeah'}

In [11]:
def similar(a, b):
    """Return the difflib similarity ratio of ``a`` and ``b`` (a float between 0 and 1)."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [4]:
def write_json(data:dict, filepath: Path):
    """Serialise ``data`` as JSON into ``filepath``, replacing any existing content."""
    with open(filepath, mode='w') as out_file:
        json.dump(obj=data, fp=out_file)

In [20]:
def count_names(filtered_words: set, names_found: dict, name_dataset: "NameDataset") -> dict:
    """Add one to the tally of every word that the name dataset accepts as a first name.

    Args:
        filtered_words: Words taken from a single comment.
        names_found: Mapping of name -> count so far; it is updated in place.
        name_dataset: Object exposing ``search_first_name(word, use_upper_case=False)``;
            a truthy result means the word is counted.

    Returns:
        The same ``names_found`` mapping that was passed in.

    Raises:
        Exception: If the lookup or update fails for a word. The original error
            is attached as ``__cause__``.
    """
    for word in filtered_words:
        try:
            if name_dataset.search_first_name(word, use_upper_case=False):
                names_found[word] = names_found.get(word, 0) + 1
        except Exception as exc:
            # Catch Exception rather than everything, so Ctrl-C / SystemExit pass
            # through untouched, and chain the cause so the real traceback survives.
            raise Exception("Error: ", word, " in ",  filtered_words) from exc
    return names_found
    
def check_if_same_line(line: str, prev_line: str, count: int):
    """Tell whether ``line`` is a near-duplicate of ``prev_line``.

    Two lines count as the same when their ``similar`` ratio is above 0.9.
    ``count`` (the caller's line number) is accepted but not used.
    """
    similarity = similar(line, prev_line)
    return similarity > 0.9


def get_time(start_time):
    """Return the time elapsed since ``start_time`` as a ``(minutes, seconds)`` tuple.

    ``start_time`` is a timestamp as returned by ``time()``.
    """
    elapsed = time() - start_time
    return divmod(elapsed, 60)

def for_every_comment_page(episode_data_path: Path, name_dataset: "NameDataset"):
    """Count first names across an episode's numbered comment pages (1.txt, 2.txt, ...).

    Pages are read in order until the next numbered file is missing. Within a
    page, a line that is a near-duplicate of the last accepted line is skipped
    and counted as a duplicate. After each page the running tally is sorted by
    count (highest first) and written to ``names_found.json`` in the episode
    folder.

    Args:
        episode_data_path: Folder containing the ``<page>.txt`` files.
        name_dataset: Name lookup object handed on to ``count_names``.

    Returns:
        Dict of name -> count, ordered by descending count. It is empty when
        the folder has no ``1.txt``; in that case no JSON file is written.
    """
    page = 1
    names_found = {}
    duplicates = 0
    # Named page_path so it does not shadow the module-level data_path.
    page_path = episode_data_path.joinpath(f"{page}.txt")
    while page_path.is_file():
        prev_line = ""
        # The context manager closes each page file; iterating the handle
        # streams the lines instead of loading the whole page first.
        with open(page_path, 'r', encoding='utf8', errors='ignore') as page_file:
            for count, line in enumerate(page_file, start=1):
                if check_if_same_line(line, prev_line, count):
                    duplicates += 1
                else:
                    filtered_words = filter_sentence(line, False)
                    count_names(filtered_words, names_found, name_dataset)
                    prev_line = line

        names_found = dict(sorted(names_found.items(), key=lambda item: item[1], reverse=True))
        write_json(names_found, episode_data_path.joinpath("names_found.json"))

        page += 1
        page_path = episode_data_path.joinpath(f"{page}.txt")
    print("Duplicates: ", duplicates)
    return names_found

# Run the name count for episodes 1..79, timing each episode.
name_dataset = NameDataset()
for episode in range(1, 80):
    start_time_video = time()
    episode_data_path = data_path.joinpath(str(episode))
    if(episode_data_path.exists()):
        for_every_comment_page(episode_data_path, name_dataset)
    else:
        raise Exception("Directory not found", episode_data_path)

    m, s = get_time(start_time_video)
    # Truncate and zero-pad the seconds so the elapsed time reads as m:ss
    # (e.g. "1:06") and can never round up to ":60".
    print(f"Finished with episode {episode}. Time: {int(m)}:{int(s):02d}")

Duplicates:  28
Finished with episode 1. Time: 0:4
Duplicates:  17
Finished with episode 2. Time: 0:4
Duplicates:  25
Finished with episode 3. Time: 0:4
Duplicates:  56
Finished with episode 4. Time: 0:8
Duplicates:  31
Finished with episode 5. Time: 0:5
Duplicates:  102
Finished with episode 6. Time: 0:5
Duplicates:  66
Finished with episode 7. Time: 0:8
Duplicates:  60
Finished with episode 8. Time: 0:8
Duplicates:  217
Finished with episode 9. Time: 1:6
Duplicates:  172
Finished with episode 10. Time: 0:22
Duplicates:  78
Finished with episode 11. Time: 0:9
Duplicates:  183
Finished with episode 12. Time: 0:18
Duplicates:  157
Finished with episode 13. Time: 0:21
Duplicates:  89
Finished with episode 14. Time: 0:12
Duplicates:  85
Finished with episode 15. Time: 0:18
Duplicates:  97
Finished with episode 16. Time: 0:14
Duplicates:  59
Finished with episode 17. Time: 0:8
Duplicates:  60
Finished with episode 18. Time: 0:6
Duplicates:  35
Finished with episode 19. Time: 0:5
Duplicates

In [7]:

        
# Merge every episode's names_found.json into a single mapping keyed by episode number.
total_results = {}
for episode in range(1, 80):
    episode_data_path = data_path.joinpath(str(episode), "names_found.json")
    if not episode_data_path.exists():
        raise Exception("Directory not found", episode_data_path) 

    # Load this episode's tally and store it under its episode number.
    with episode_data_path.open() as json_file:
        data = json.load(json_file)
    total_results[episode] = data

    print(f"Finished with episode {episode}")
write_json(total_results, data_path.joinpath("total_results.json"))

Finished with episode 1
Finished with episode 2
Finished with episode 3
Finished with episode 4
Finished with episode 5
Finished with episode 6
Finished with episode 7
Finished with episode 8
Finished with episode 9
Finished with episode 10
Finished with episode 11
Finished with episode 12
Finished with episode 13
Finished with episode 14
Finished with episode 15
Finished with episode 16
Finished with episode 17
Finished with episode 18
Finished with episode 19
Finished with episode 20
Finished with episode 21
Finished with episode 22
Finished with episode 23
Finished with episode 24
Finished with episode 25
Finished with episode 26
Finished with episode 27
Finished with episode 28
Finished with episode 29
Finished with episode 30
Finished with episode 31
Finished with episode 32
Finished with episode 33
Finished with episode 34
Finished with episode 35
Finished with episode 36
Finished with episode 37
Finished with episode 38
Finished with episode 39
Finished with episode 40
Finished 

In [1]:
from names_dataset import NameDataset
from pathlib import Path
import re
from nltk.corpus import stopwords
from difflib import SequenceMatcher
from time import time
import json
import requests

In [8]:
# children.txt alternates between two kinds of line (see the printed output):
#   odd lines  - an HTML comment holding a child's name, e.g. <!--Olive-->
#   even lines - a wiki template row whose [[...]] link names the parent
with open(Path.cwd().joinpath("children.txt"), 'r') as file1:
    Lines = file1.readlines()

count = 1
total = []  # [child_name, parent_name] pairs, in file order
pair = []
prev_parent = ""
for line in Lines:
    txt = ""
    print("Line{}: {}".format(count, line.strip()))
    if count % 2 == 1:
        # Child line: keep only the ASCII letters of the name and start a new pair.
        pair = []
        txt = re.sub(r"([^a-zA-Z])", "", line)
        pair.append(txt)
    else:
        # Parent line: take the text of the first [[...]] link; a row without
        # a link reuses the parent from the previous row.
        txt = re.findall(r"(\[\[([^\]]*)\]\])", line)
        print(txt)
        if(len(txt) == 0):
            txt = prev_parent
        else:
            txt = txt[0][1]
        prev_parent = txt
        pair.append(txt)
        total.append(pair)
    count += 1
    print(txt)
# Manual correction of the 12th child's name; presumably the ASCII-only regex
# above dropped the accented "é" - verify against children.txt.
total[11][0] = "Rosé"
print(total)

# Fetch one character page with the requests library (https://pypi.org/project/requests/)
# to check the size of the returned HTML. The timeout keeps the cell from hanging
# indefinitely if the site does not respond.
r = requests.get('https://100-baby-challenge.fandom.com/wiki/Brooke_Impiccishmay', timeout=30)
txt = r.text
len(txt)

def _extract_birth_episode(txt: str, link: str) -> int:
    """Return the number of the first "Part N" link in a page's age-stages section."""
    section_start = txt.find("age stages through the series")
    if section_start == -1:
        raise IndexError(f"no 'age stages through the series' section found on {link}")
    short_txt = txt[section_start:]

    # Cut the section at the closing style marker when it is present.
    section_end = short_txt.find("background:#f2f2f2; border-radius: 0 0 7px 0;")
    shorter_txt = short_txt if section_end == -1 else short_txt[:section_end]

    # Look for the closing </a> only AFTER the '>Part' link text starts, so an
    # unrelated anchor earlier in the section cannot produce an empty slice.
    part_start = shorter_txt.find('>Part')
    part_end = shorter_txt.find('</a>', part_start) if part_start != -1 else -1
    part = shorter_txt[part_start:part_end] if part_end != -1 else ""

    numbers = re.findall(r"\d+", part)
    if not numbers:
        raise IndexError(
            f"no 'Part N' link found on {link} "
            f"(page length {len(txt)}, section length {len(shorter_txt)})"
        )
    return int(numbers[0])


def get_birth_episode(name: str, session=None, timeout: float = 30) -> int:
    """Look up the episode ("Part") number listed first in a child's wiki age-stages section.

    Args:
        name: Child's first name as used in the wiki URL. A name containing
            "jr" (any case), e.g. "LeoJr", is mapped to the "<Name>_Impiccishmay,_Jr." page.
        session: Optional object with a requests-compatible ``get(url, timeout=...)``
            method (for example a ``requests.Session``). Defaults to the
            ``requests`` module itself.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The number from the first "Part N" link found in the section.

    Raises:
        IndexError: If the section or a "Part N" link cannot be found on the page.
    """
    if("jr" in name.lower()):
        name_true = name.replace("Jr", "")
        link =  f"https://100-baby-challenge.fandom.com/wiki/{name_true}_Impiccishmay,_Jr."
    else:
        link = f'https://100-baby-challenge.fandom.com/wiki/{name}_Impiccishmay'
    http = requests if session is None else session
    txt = http.get(link, timeout=timeout).text
    return _extract_birth_episode(txt, link)
# Spot check of get_birth_episode on one child; the call performs a live HTTP request.
get_birth_episode("Brooke")

Line1: <!--Olive-->
Olive
Line2: {{.child|n=1|col4={{lesbian}}|age={{elder}}|col6={{sim}}|col7=|col9=[[Marcus Flex]]|col10={{old.age}}}}
[('[[Marcus Flex]]', 'Marcus Flex')]
Marcus Flex
Line3: <!--Brielle-->
Brielle
Line4: {{.child|n=2|col4={{female}}|age={{elder}}|col6={{sim}}|col7=|col9=[[Dominic Fyres]]|col10={{old.age}}}}
[('[[Dominic Fyres]]', 'Dominic Fyres')]
Dominic Fyres
Line5: <!--Jaime-->
Jaime
Line6: {{.child|n=3|col4={{male}}|age={{elder}}|col6={{sim}}|col7=|col9=[[J Huntington III]]|col10={{old.age}}}}
[('[[J Huntington III]]', 'J Huntington III')]
J Huntington III
Line7: <!--Alexis-->
Alexis
Line8: {{.child|n=4|col4={{female}}|age={{elder}}|col6={{sim}}|col7=|col9=[[Sergio Romeo]]|col10={{old.age}}}}
[('[[Sergio Romeo]]', 'Sergio Romeo')]
Sergio Romeo
Line9: <!--Miles-->
Miles
Line10: {{.child|n=5|col4={{male}}|age={{elder}}|col6={{sim}}|col7={{twin|#=1}}|col9=[[Lars Rosewood]]|rs=2|col10={{old.age}}}}
[('[[Lars Rosewood]]', 'Lars Rosewood')]
Lars Rosewood
Line11: <!--Re

30

In [22]:
# Group the children, and the distinct parents listed with them, under the
# episode number that get_birth_episode returns for each child.
children_parents_episodes = {}
for name, parent_name in total:
    episode = get_birth_episode(name)
    print(name, episode)
    episode_entry = children_parents_episodes.setdefault(episode, {"child": [], "parent": []})
    episode_entry["child"].append(name)
    if parent_name not in episode_entry["parent"]:
        episode_entry["parent"].append(parent_name)

write_json(children_parents_episodes, data_path.joinpath("children_parents_per_episode.json"))
children_parents_episodes

Olive 1
Brielle 2
Jaime 2
Alexis 4
Miles 4
Renee 4
Charlie 6
Hazel 7
Eric 9
Niya 9
Natalie 9
Rosé 10
River 10
Flynn 12
Cooper 12
Addi 13
Ellie 15
Dorian 15
Theo 15
Tristen 15
Willow 16
Ginny 16
Bran 17
Jon 19
Arya 19
Nova 21
Freya 21
Sirius 22
Stacey 22
Kasey 23
Taylor 25
Tayler 25
Tegan 29
Archer 29
Blaire 30
Brooke 30
Brendon 30
Hope 30
Bella 30
Ever 34
Jake 34
Romeo 35
Caesar 35
Harry 36
Dustin 36
LeoJr 37
Logan 38
Holly 39
Hayley 39
Olivia 40
ChelseaJr 42
Annie 42
Ashton 42
Phoebe 44
Owen 46
Lucien 46
Autumn 47
August 47
Mars 48
Conner 48
Blake 50
Hannah 52
Elliot 52
Oliver 55
Kelly 56
Wesley 56
Henry 57
Holden 57
Josie 61
Cove 61
Delilah 62
Danyel 62
Ruth 63
Todd 64
Jacob 64
Eileen 68
Waldo 68
Jennie 69
Rebekah 70
Dwayne 70
Loki 71
Eleanor 71
Koya 73
Rain 73
Baby 74
Jane 74
Han 76
Hwan 76
Keira 77
Teddy 78
Levi 79
Belle 79


{1: {'child': ['Olive'], 'parent': ['Marcus Flex']},
 2: {'child': ['Brielle', 'Jaime'],
  'parent': ['Dominic Fyres', 'J Huntington III']},
 4: {'child': ['Alexis', 'Miles', 'Renee'],
  'parent': ['Sergio Romeo', 'Lars Rosewood']},
 6: {'child': ['Charlie'], 'parent': ['Craig Slater']},
 7: {'child': ['Hazel'], 'parent': ['Kim Mingyu']},
 9: {'child': ['Eric', 'Niya', 'Natalie'],
  'parent': ['Caron Simmons', 'Steven Smith']},
 10: {'child': ['Rosé', 'River'], 'parent': ['Maria Wills']},
 12: {'child': ['Flynn', 'Cooper'], 'parent': ['Cayden Cross']},
 13: {'child': ['Addi'], 'parent': ['Yusuf Malik']},
 15: {'child': ['Ellie', 'Dorian', 'Theo', 'Tristen'],
  'parent': ['Craig Dream Daddy', 'Hailey Willis']},
 16: {'child': ['Willow', 'Ginny'], 'parent': ['Hamza Mounib']},
 17: {'child': ['Bran'], 'parent': ['Kade Pelletier']},
 19: {'child': ['Jon', 'Arya'], 'parent': ['Korbin Sherwood']},
 21: {'child': ['Nova', 'Freya'], 'parent': ['Chance Turner']},
 22: {'child': ['Sirius', 'Stac