In [167]:
import os
import re
import numpy as np
import pandas as pd
import freesound
from collections import ChainMap
import matplotlib.pyplot as plt

In [2]:
token_dir = "my_tokens"
token_filename = "freesound_tokens.txt"
token_filepath = os.path.join(os.getcwd(), token_dir, token_filename)
terms_dir = os.path.join(os.path.dirname(os.getcwd()), "miso-sound-taxonomy")
terms_filename = "MSI_sound_order.txt"
terms_filepath = os.path.join(terms_dir, terms_filename)

In [94]:
a = np.loadtxt(terms_filepath, dtype=str, delimiter="\n")

list_terms = [re.sub(r"((\*.*)|(\/.*)|(\(.*))", "", string) for string in a]
terms_order = {
    key: value for (key, value) in zip(list_terms, np.arange(1, len(list_terms) + 1))
}
# to remove special characters: ["".join(e for e in string if e.isalnum() or e==" ") for string in a]

In [95]:
tokens = []
with open(token_filepath, "r") as token_file:
    for token in token_file:
        tokens.append(token.strip())

In [96]:
client = freesound.FreesoundClient()
client.set_token(tokens[0], "token")

In [86]:
each_field_list = [
    "id",
    "name",
    "username",
    "duration",
    "description",
    "tags",
    "url",
    "license",
    "previews",
    "samplerate",
    "created",
    "type",
    "filesize",
    "num_downloads",
    "avg_rating",
    "num_ratings",
    #"analysis_stats",
    #"analysis_frames",
    "ac_analysis"
]

fields = each_field_list[0]
for field in each_field_list:
    fields = fields + "," + field
fields
#"id,name,username,duration,description,tags,url,license,previews,samplerate,created,type,filesize,num_downloads,avg_rating,num_ratings,ac_analysis,is_remix,was_remixed"

'id,id,name,username,duration,description,tags,url,license,previews,samplerate,created,type,filesize,num_downloads,avg_rating,num_ratings,ac_analysis'

In [139]:
list_terms[:4]

['sipping hot liquid',
 'crunching on chips',
 'drinking from a water fountain',
 'slurping']

In [283]:
list_terms

['sipping hot liquid',
 'crunching on chips',
 'drinking from a water fountain',
 'slurping',
 'lip smacks',
 'clicking a pen',
 'chewing gum',
 'clipping nails',
 'male sniffles',
 'flipping newspaper pages',
 'typing',
 'pen cap covering',
 'basketball dribbling',
 'eating and slurping',
 'bowls on table',
 'silverware on table',
 'knife cutting',
 'walking with heels',
 'human breathing',
 'load dishwasher ',
 'sniffling',
 'coins shuffling',
 'plastic crumpling',
 'metal spoon scrapes pot',
 'mouse click continuous',
 'stirring liquid in a glass',
 'human eating peach',
 'baby breaths',
 'water drops',
 'clicking a mouse',
 'human nose sniffing',
 'clearing throat',
 'swallowing',
 'slow typing',
 'floating ice in a glass',
 'cutting fruit',
 'calm breathing',
 'card dealing',
 'wash hands running water',
 'hand wipe',
 'keyboard typing fast',
 'paper bag opening',
 'human nose breathing',
 'dog drinking water',
 'dog drinking close up',
 'chopping celery',
 'flip flops on wood',
 

In [236]:
for term in list_terms:
    sounds_results_pager = client.text_search(
        query=term, fields=fields
    )
    pages_left = True
    sound_count = 0
    while pages_left:
        for sound in sounds_results_pager:
            sound_count += 1
            search_info = {
                "term": term,
                "term_order": terms_order[term],
                "search_result_number": sound_count,
            }
            items_not_dict = {
                key: value
                for (key, value) in sound.json_dict.items()
                if not isinstance(value, dict)
            }
            list_dict = [
                value for (key, value) in sound.json_dict.items() if isinstance(value, dict)
            ]
            list_dict.append(items_not_dict)
            list_dict.append(search_info)
            comb_dict = dict(ChainMap(*list_dict))
            search_results_df = pd.DataFrame(comb_dict)
            search_results_df.to_csv(
                "search_results_new.csv", mode="a", header=False, index=False
            )
            if sound.json_dict["id"] not in all_sounds.keys():
                all_sounds[sound.json_dict["id"]] = comb_dict
            else:
                repeated_sounds.append(comb_dict)
        if not sounds_results_pager.next or sound_count >= 100:
            pages_left = False
        else:
            sounds_results_pager = sounds_results_pager.next_page()

In [237]:
df = pd.DataFrame.from_dict(all_sounds, orient="index").reset_index()

In [265]:
df["priority_check1"] = (
    (df["license"].values == "http://creativecommons.org/publicdomain/zero/1.0/")
    & (df["duration"].values >= 4)
    & (df["duration"].values <= 150)
    & (df["samplerate"] >= 44100)
)

In [266]:
used_usernames = []
bin_first_use = np.zeros_like(df.index).astype(bool)

for index, row in df.iterrows():
    if row["priority_check1"]:
        if row["username"] in used_usernames:
            bin_first_use[index] = False
        else:
            used_usernames.append(row["username"])
            bin_first_use[index] = True

for index, row in df.iterrows():
    if not row["priority_check1"]:
        if row["username"] in used_usernames:
            bin_first_use[index] = False
        else:
            used_usernames.append(row["username"])
            bin_first_use[index] = True
            
df["first_instance_username"] = bin_first_use

In [267]:
df["first_instance_username"].value_counts()

False    3132
True     2190
Name: first_instance_username, dtype: int64

In [268]:
df["priority_check2"] = ((df["priority_check1"]) & (df["first_instance_username"]))

In [269]:
df["priority_check2"].value_counts()

False    4266
True     1056
Name: priority_check2, dtype: int64

In [300]:
out_df = df.sort_values(
    by=["priority_check2", "term_order", "search_result_number"],
    ascending=(False, True, True),
).rename_axis("orig_index").reset_index().rename_axis("priority").reset_index()
out_df["priority"] = out_df["priority"] + 1

In [305]:
filt_out_df = out_df.loc[:,["term","url","priority"]]
filt_out_df["current_term_present"] = ""
filt_out_df["other_term_present"] = ""
filt_out_df["notes"] = ""
filt_out_df.to_csv(
                "search_results_20220617_urls.csv", mode="a", index=False
            )