In [None]:
!pip install clean-text
!pip install unidecode

In [None]:
%matplotlib inline
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import sent_tokenize
from gensim.parsing.preprocessing import preprocess_string
from functools import partial
from cleantext import clean
from gensim.parsing.preprocessing import DEFAULT_FILTERS
import matplotlib.pylab as plt
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Register a dask progress bar globally so dask computations run from this
# kernel report their progress.
pbar = ProgressBar()
pbar.register()

In [None]:
# Episode-level metadata of the Spotify podcast dataset (tab-separated file).
metadata_path = "../datasets/raw/spotify/spotify-podcasts-2020/metadata.tsv"
spotify_meta_df = pd.read_csv(metadata_path, sep="\t")

# Quick look at which columns are available.
column_listing = ", ".join(spotify_meta_df.columns)
print("##Column names## : ", column_listing)

In [None]:
# Options forwarded to cleantext.clean for every sentence.
clean_options = dict(
    fix_unicode=True,          # repair unicode errors
    to_ascii=True,             # transliterate to the closest ASCII form
    lower=True,                # lowercase everything
    no_line_breaks=False,      # False: do not fully strip line breaks
    no_urls=True,              # URLs          -> special token
    no_emails=True,            # e-mail addrs  -> special token
    no_phone_numbers=True,     # phone numbers -> special token
    no_numbers=True,           # numbers       -> special token
    no_digits=True,            # digits        -> special token
    no_currency_symbols=True,  # currency      -> special token
    no_punct=True,             # drop punctuation
    lang="en",                 # use "de" for German-specific handling
)
cleaner = partial(clean, **clean_options)

# The cleaner runs first, followed by gensim's default filter chain.
filters = [cleaner, *DEFAULT_FILTERS]
string_preprocessor = partial(preprocess_string, filters=filters)

In [None]:
def _tokens_to_text(tokens):
    """Join a non-empty token list into one string; anything else becomes None."""
    if isinstance(tokens, list) and len(tokens) > 0:
        return " ".join(tokens)
    return None


# One row per sentence; the index still identifies the originating metadata row.
description_sentences = (
    spotify_meta_df["episode_description"]
    .dropna()
    .map(lambda text: sent_tokenize(text) if isinstance(text, str) else None)
    .explode()
    .dropna()
    .to_frame()
)

# Preprocess every sentence and discard those left without any token.
eposide_sentences = (
    description_sentences
    .assign(
        preprocessed_sentences=description_sentences["episode_description"].map(
            lambda sentence: _tokens_to_text(string_preprocessor(sentence))
        )
    )
    .dropna(subset=["preprocessed_sentences"])
)
eposide_sentences

In [None]:
# Binary TF-IDF over the preprocessed sentences:
#   * binary=True  -> every non-zero term count is set to 1 before idf weighting
#   * min_df / max_df drop terms that are very rare or present in >85% of sentences
#   * norm=None    -> no row normalisation, so row sums stay comparable as raw scores
# `norm` only accepts 'l1', 'l2' or None; the previous `norm=False` is not a valid
# value and is rejected by scikit-learn's parameter validation in recent releases.
vectorizer = TfidfVectorizer(binary=True, min_df=5, max_df=0.85, norm=None)
vectorizer = vectorizer.fit(eposide_sentences["preprocessed_sentences"])

In [None]:
# TF-IDF matrix with one row per preprocessed sentence.
preprocessed_corpus = eposide_sentences["preprocessed_sentences"]
sentence_vecs = vectorizer.transform(preprocessed_corpus)


In [None]:
# Keep only the sentences whose summed TF-IDF weight is above 25.
sentence_scores = sentence_vecs.sum(axis=1).A1
filtered_sentences = eposide_sentences.iloc[sentence_scores > 25]

# Collect the surviving sentences back into one list per original index value.
sentences_per_episode = (
    filtered_sentences
    .groupby(filtered_sentences.index)
    .agg({"episode_description": list})
)
filtered_descriptions = (
    sentences_per_episode["episode_description"]
    .map(lambda sentences: sentences if isinstance(sentences, list) and len(sentences) > 0 else None)
    .dropna()
)
filtered_descriptions.iloc[-2]

In [None]:
# Re-apply the same score filter and inspect the last sentence that survives it.
keep_mask = sentence_vecs.sum(axis=1).A1 > 25
filtered_sentences = eposide_sentences.iloc[keep_mask]
filtered_sentences["episode_description"].iloc[-1]

In [None]:
# Attach the per-episode sentence lists to the metadata, aligned on the row index.
spotify_meta_df = spotify_meta_df.assign(filtered_descriptions=filtered_descriptions)
spotify_meta_df

In [None]:
# Export (file_prefix, summary) pairs as JSON Lines, skipping rows where either
# column is missing.
summary_columns = {
    "episode_filename_prefix": "file_prefix",
    "filtered_descriptions": "summary",
}
clean_summaries = (
    spotify_meta_df[list(summary_columns)]
    .dropna()
    .rename(columns=summary_columns)
)
clean_summaries.to_json("../datasets/raw/spotify/clean_summaries.json", lines=True, orient="records")


In [None]:
def load_transcript(item):
    """Read one transcript JSON file and collect its top-ranked utterances.

    The file is expected to hold a JSON object with an optional ``"results"``
    list. For every result that has a non-empty ``"alternatives"`` list, the
    ``"transcript"`` of the first alternative is kept when it is non-empty.

    Args:
        item: A ``pathlib.Path``-like object (needs ``.open()`` and ``.stem``)
            pointing at the transcript file.

    Returns:
        dict: ``{"document": [utterance, ...], "file_prefix": item.stem}``.
        ``document`` is an empty list when the file has no ``"results"``.
    """
    # Imported locally so the function is self-contained: the notebook never
    # imports ``json`` at the top, and worker processes need it as well.
    import json

    # Context manager closes the handle; JSON text is decoded as UTF-8 rather than
    # with the platform's default encoding.
    with item.open(encoding="utf-8") as handle:
        payload = json.load(handle)

    utterances = []
    results = payload.get("results")
    if results is not None:
        for result in results:
            alternatives = result.get("alternatives")
            if alternatives:
                # Only the first alternative of each result is used.
                transcript = alternatives[0].get("transcript")
                if transcript:
                    utterances.append(transcript)

    return {"document": utterances, "file_prefix": item.stem}


In [None]:
import json
import pathlib
from multiprocessing import Pool, cpu_count

from tqdm.notebook import tqdm

# Every transcript JSON file below the dataset's transcript directory.
transcripts = list(
    pathlib.Path("../datasets/raw/spotify/spotify-podcasts-2020/podcasts-transcripts").glob("**/*.json")
)

# Leave one core free, but never ask for zero workers (single-core machines).
n_workers = max(1, cpu_count() - 1)

# NOTE(review): load_transcript is defined in the notebook itself; with the "spawn"
# start method worker processes may be unable to import it - confirm on the target OS.
# The context managers make sure both the pool and the output file are released.
with Pool(n_workers) as pool, open("../datasets/raw/spotify/transcripts.json", "w") as outfile:
    loaded_transcripts = pool.imap_unordered(load_transcript, transcripts)
    # One JSON document per line; order follows worker completion, not file order.
    for item in tqdm(loaded_transcripts, total=len(transcripts)):
        outfile.write(json.dumps(item) + "\n")

In [None]:
import dask.dataframe as dd


def _read_jsonl_indexed(path):
    """Read a JSON Lines file into a dask frame indexed by ``file_prefix``."""
    return dd.read_json(path, lines=True, orient="records").set_index("file_prefix")


transcripts_df = _read_jsonl_indexed("../datasets/raw/spotify/transcripts.json")
summaries_df = _read_jsonl_indexed("../datasets/raw/spotify/clean_summaries.json")

In [None]:
# Add each transcript's summary (matched on the shared file_prefix index), drop
# rows with any missing value, and persist the result.
# NOTE(review): .shape on a dask frame may show a lazy row count - use len() if a
# concrete number is wanted; confirm.
transcripts_df = (
    transcripts_df
    .assign(summary=summaries_df["summary"])
    .dropna()
    .persist()
)
transcripts_df.shape

In [None]:
# Write the joined transcript/summary records out as JSON Lines.
dataset_path = "../datasets/raw/spotify/podcast_dataset.json"
transcripts_df.to_json(dataset_path, lines=True, orient="records")

In [2]:
from dask_ml.model_selection import train_test_split as dtts
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Register a dask progress bar globally so the read/split/write steps below
# report their progress.
pbar = ProgressBar()
pbar.register()

In [3]:

# Fixed seed so the shuffled split is reproducible across kernel restarts.
SPLIT_SEED = 42

data = dd.read_json("../datasets/raw/spotify/podcast_dataset.json", lines=True, orient="records")

# Roughly 80% train; the held-out 20% is then halved into test and validation.
train_df, test_df = dtts(data, test_size=0.2, shuffle=True, random_state=SPLIT_SEED)
test_df, val_df = dtts(test_df, test_size=0.5, shuffle=True, random_state=SPLIT_SEED)

# Each split is written under its own path; the last call's return value is the
# cell output.
train_df.to_json("../datasets/raw/spotify/train")
val_df.to_json("../datasets/raw/spotify/valid")
test_df.to_json("../datasets/raw/spotify/test")

[########################################] | 100% Completed | 53.3s
[                                        ] | 0% Completed |  0.1s



[########################################] | 100% Completed |  2min 59.3s
[########################################] | 100% Completed |  1min 31.3s
[########################################] | 100% Completed |  1min 44.5s


['/Users/bebop/Documents/courses/APCOMP215/AC215_projectgarble/notebooks/../datasets/raw/spotify/test/0.part']

In [None]:
gzip -c train.json > train.json.gz
gzip -c valid.json > valid.json.gz
gzip -c test.json > test.json.gz