### Data Cleaning

In [None]:
# !pip install pandas
# https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large
# !pip install deepmultilingualpunctuation protobuf
# !pip install nltk

In [None]:
import pandas as pd
import re
import csv
from deepmultilingualpunctuation import PunctuationModel
import nltk
import matplotlib.pyplot as plt

### Data Cleaning

In [43]:
# Input CSV files handed to data_clean() further down in this notebook.
# INSIGHT_DATA_FILE = 'insight-transcripts-data.csv'
# NOTE(review): the active value below already ends in '-cleaned.csv', so data_clean()
# derives an output name carrying a second '-cleaned' suffix — confirm this is intended.
INSIGHT_DATA_FILE = 'insight-transcripts-data-cleaned.csv'
YOUTUBE_DATA_FILE = 'yt-transcripts-data.csv'

In [None]:
# Punctuation-restoration model; data_clean() calls model.restore_punctuation() on each script.
model = PunctuationModel()

In [42]:
# Common intros to remove. Each pattern starts with '.*' so everything up to and
# including the phrase is dropped from the script.
PHRASES_TO_REMOVE = [
    r'.*welcome to carries conscious living are you ready to',
    r'.*on the 7th of this month are you ready to',
    r'.*are (you|we) ready to meditate (with|the)',
    r'.*join the patreon the links are in the description',
    r'.*if you are returning welcome back here at the',
    r".*I'm Sara Raymond here at the mindful movement",
    r'.*your host Brian Scott',
    r'.*create your free Mindvalley account today at mindvalley\.com'
]

# Filler words that show up as stuttered runs ("so so so") in the YouTube transcripts.
_REPEATED_WORDS = ('so', 'foreign', 'you')


def _strip_noise(text):
    """Return ``text`` with annotations, stuttered words and known intros removed."""
    text = re.sub(r'\[.*?\]', ' ', text)  # Remove content inside square brackets
    text = re.sub(r'\(.*?\)', ' ', text)  # Remove content inside parentheses
    text = re.sub(r'\s+', ' ', text)  # Replace consecutive whitespace with a single space

    # Collapse runs of a repeated filler word into one occurrence. The word
    # boundaries keep the pattern from eating into neighbours: without them
    # "so sorry" collapses to "sorry" and "you yourself" to "yourself".
    for word in _REPEATED_WORDS:
        text = re.sub(rf'\b{word}( {word})+\b', word, text)

    # Remove the unrelated introductions
    text = re.sub(r".*let's (begin|start)", "let's start", text)
    text = re.sub(r".*(begin|start) by", 'start by', text)
    for phrase in PHRASES_TO_REMOVE:
        text = re.sub(phrase, '', text)
    return text


def data_clean(csv_file_path, punctuation_model=None):
    """Clean every transcript in a CSV and append the results to '<name>-cleaned.csv'.

    The input must have 'Meditation_Type', 'URL' and 'Script' columns. Each
    script is stripped of bracketed/parenthesised annotations, stuttered filler
    words and known intro phrases, then passed through the punctuation model.

    Args:
        csv_file_path: Path of the CSV file to read.
        punctuation_model: Object exposing ``restore_punctuation(str) -> str``.
            Defaults to the notebook-level ``model``.

    Side effects:
        Appends rows to the output file (so an interrupted run is not lost) and
        writes the header row first when that file is new or empty. Rows whose
        'Script' is missing are skipped and reported. Progress is printed per row.
    """
    if punctuation_model is None:
        punctuation_model = model  # notebook-level PunctuationModel instance

    # str.rstrip('.csv') strips any trailing '.', 'c', 's' or 'v' characters
    # (e.g. 'docs.csv' -> 'do'), so remove the extension as a suffix instead.
    if csv_file_path.endswith('.csv'):
        base_name = csv_file_path[:-len('.csv')]
    else:
        base_name = csv_file_path
    cleaned_file_name = base_name + '-cleaned.csv'

    df = pd.read_csv(csv_file_path, encoding='utf8')

    with open(cleaned_file_name, 'a', encoding='utf8', newline='') as csvf:
        writer = csv.writer(csvf)
        # In append mode the position starts at end-of-file, so 0 means the
        # file is empty and still needs its header row.
        if csvf.tell() == 0:
            writer.writerow(['Meditation_Type', 'URL', 'Script'])

        for idx, item in df.iterrows():
            line_no = idx + 2  # Account for 0 index and the header line
            raw_script = item['Script']
            if not isinstance(raw_script, str):
                # Missing scripts are read as NaN (a float); re.sub would raise on them.
                print(f"Skipped Line {line_no}: no script text")
                continue

            script = _strip_noise(raw_script)
            # Add Punctuation
            script = punctuation_model.restore_punctuation(script)  # Takes very long time

            writer.writerow([item['Meditation_Type'], item['URL'], script])
            print(f"Done Line {line_no}")

    print(f'Written cleaned data to {cleaned_file_name}')

In [None]:
# Clean the YouTube transcripts; data_clean() appends to a '-cleaned.csv' output file.
data_clean(YOUTUBE_DATA_FILE)  # TODO(review): original note read "ADD -cleaned file??" — intent unclear

In [None]:
# Clean the Insight Timer transcripts.
data_clean(INSIGHT_DATA_FILE)

In [None]:
# After adding punctuation, grep for lines that don't end with period

# Remove all instances of . you. and you" using find and replace (regex)
# Do the same with foreign  (also very common in youtube scripts)
# (youtube transcripts have a bunch of random you.)

#### Created new file `med-transcript-dataset.csv` with the insight timer data followed by the youtube data

### Add special tokens to dataset

In [None]:
# Combined transcript CSV read by the tokenization step.
INPUT_DATA_FILE = 'med-transcript-dataset.csv'
# Output CSV: the same rows, with a meditation-type token prepended to each script
# whose row has a meditation type.
TOKENIZED_DATA_FILE = 'tokenized-dataset.csv'

In [None]:
import pandas as pd
import math
import csv

df = pd.read_csv(INPUT_DATA_FILE, encoding="utf8")  # encoding="ISO-8859-1"  Original
# df = df.dropna()  # DO NOT dropna() because insight timer rows have nan urls

# Opening in 'w' mode already empties the file, so no separate truncate pass is
# needed. newline='' is what the csv module requires to avoid blank rows on
# Windows, and the explicit encoding matches the utf8 used to read the input.
with open(TOKENIZED_DATA_FILE, 'w', encoding='utf8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Meditation_Type', 'URL', 'Script'])
    for _, item in df.iterrows():
        script = item['Script']
        url = item['URL']
        med_type = item['Meditation_Type']

        # ADD SPECIAL TOKEN indicating the meditation type, e.g. "[SLEEP MEDITATION] ...".
        # A missing type is read as NaN (a float); those scripts are written
        # unchanged, without a token or a stray leading space.
        if isinstance(med_type, str):
            special_token = f'[{med_type} MEDITATION]'.upper()
            script = f'{special_token} {script}'

        writer.writerow([med_type, url, script])

print(f"Written tokenized data file to {TOKENIZED_DATA_FILE}")

Analyze the most common words to find phrases we should remove