Adds new text parsing algorithm

mps-youtube · Jun 15, 2017 · 271e6f5 · 271e6f5
1 parent e07a117
commit 271e6f5
Show file tree

Hide file tree

Showing 5 changed files with 177 additions and 28 deletions.
diff --git a/mps_youtube/commands/generate_playlist.py b/mps_youtube/commands/generate_playlist.py
@@ -106,7 +106,7 @@ def description_generator(text):
     query['maxResults'] = '1'
     data = pafy.call_gdata('videos', query)['items'][0]['snippet']
     title = "mkp %s" % data['title']
-    data = util.fetch_songs(data['description'])
+    data = util.fetch_songs(data['description'], data['title'])
 
     columns = [
         {"name": "idx", "size": 3, "heading": "Num"},

diff --git a/mps_youtube/description_parser.py b/mps_youtube/description_parser.py
@@ -0,0 +1,162 @@
+"""
+    Module for trying to parse and retrieve song data from descriptions
+"""
+import re
+import random
+import pafy
+
+
+def calculate_certainty(line):
+    """ Estimate how certain it is that a line is a tracklist entry """
+    certainty_indexes = [
+        {'regex': r"(?:\(?(?:\d{0,4}:)?\d{0,2}:\d{0,2}\)?(?: - )?){1,2}",
+         'weight': 1},
+        {'regex': r"(([\w&()\[\]'\.\/ ]+)([ ]?[-]+[ ]?)([\w&()\[\]'\.\/ ]+))+",
+         'weight': 0.75},
+        {'regex': r"^([\d]+[. ]+)",
+         'weight': 1}
+    ]
+
+    certainty = 0.0
+    for method in certainty_indexes:
+        if re.match(method['regex'], line):
+            certainty += method['weight']
+
+    return certainty / len(certainty_indexes)
+
+
+def has_artist(text):
+    """ True when the string lacks a leading "artist - track" pair """
+    regex = r"(?:([\w&()\[\]'\.\/ ]+)(?:[ ]?[-]+[ ]?)([\w&()\[\]'\.\/ ]+))+"
+    return not re.match(regex, text)
+
+
+def strip_string(text, single=False):
+    """ Strip an artist-combo string """
+    # Removes timestamps
+    ts_reg = r"(?:\(?(?:\d{0,4}:)?\d{1,2}:\d{1,2}\)?(?: - )?){1,2}"
+    text = re.sub(ts_reg, "", text)
+
+    # Removes Tracknumbers.
+    text = re.sub(r"^([\d]+[. ]+)", "", text)
+
+    # Removes a leading non-word character from each line
+    text = re.sub(r"^[^\w&()\[\]'\.\/]", "", text, flags=re.MULTILINE)
+
+    artist, track = None, None
+    if not single:
+        rgex = r"(?:([\w&()\[\]'\.\/ ]+)(?:[ ]?[-]+[ ]?)([\w&()\[\]'\.\/ ]+))+"
+        artist, track = (re.findall(rgex, text)[0])
+    else:
+        track = text
+
+    return artist, track
+
+
+def long_substr(data):
+    """ https://stackoverflow.com/a/2894073 """
+    substr = ''
+    if len(data) > 1 and len(data[0]) > 0:
+        for i in range(len(data[0])):
+            for j in range(len(data[0])-i+1):
+                if j > len(substr) and is_substr(data[0][i:i+j], data):
+                    substr = data[0][i:i+j]
+    return substr
+
+
+def is_substr(find, data):
+    """ Check if find is a substring of every string in data """
+    if len(data) < 1 and len(find) < 1:
+        return False
+    for i, _ in enumerate(data):
+        if find not in data[i]:
+            return False
+    return True
+
+
+def artist_from_title(title):
+    """ Try to determine an artist by doing a search on the video
+        and try to find the most common element by n number of times looking
+        for the most common substring in a subset of the results from youtube
+    """
+    query = {}
+    query['q'] = title
+    query['type'] = 'video'
+    query['fields'] = "items(snippet(title))"
+    query['maxResults'] = 50
+    query['part'] = "snippet"
+
+    results = pafy.call_gdata('search', query)['items']
+    titles = [x['snippet']['title'].upper() for x in results]
+
+    alts = {}
+    for _ in range(100):
+        random.shuffle(titles)
+        subset = titles[:10]
+        string = long_substr(subset).strip()
+        if len(string) > 3:
+            alts[string] = alts.get(string, 0) + 1
+
+    best_string = None
+    if len(alts) == 1:
+        best_string = list(alts.keys())[0].capitalize()
+    else:
+        best_guess = 99999
+        best_string = None
+
+        for key in list(alts.keys()):
+            current_guess = title.upper().find(key)
+            if current_guess < best_guess:
+                best_guess = current_guess
+                best_string = key.capitalize()
+
+    best_string = re.sub(r"([^\w]+)$", "", best_string)
+    best_string = re.sub(r"^([^\w]+)", "", best_string)
+    return best_string
+
+
+def parse(text, title="Unknown"):
+    """ Main function"""
+
+    # Determine a certainty index for each line
+    lines = []
+    for line in text.split('\n'):
+        lines.append((calculate_certainty(line), line))
+
+    # Get average from all strings
+    certainty_average = sum([x[0] for x in lines]) / len(lines)
+
+    # Single out lines with above average certainty index
+    lines = filter(lambda a: a is not None,
+                   [x if x[0] > certainty_average else None for x in lines])
+
+    # Determine if they are artist combo strings or only title
+    cmbs = []
+    for line in lines:
+        is_ac = has_artist(line[1])
+        cmbs.append(strip_string(line[1], is_ac))
+
+    # No or very few tracklists will omit artists or add artist information
+    # on only a select few tracks, therefore we count entries with
+    # and without artist, and remove the anomalies IF the number of
+    # anomalies is small enough
+
+    counters = {'has': 0, 'not': 0}
+    for combo in cmbs:
+        counters['has' if combo[0] else 'not'] += 1
+
+    dominant = 'has' if counters['has'] > counters['not'] else 'not'
+
+    diff = abs(counters['has'] - counters['not'])
+    if diff > sum([counters['has'], counters['not']]):
+        print("Too many anomalities detected")
+        return []
+
+    if dominant == 'has':
+        cmbs = filter(lambda a: a is not None,
+                      [x if x[0] is not None else None for x in cmbs])
+    else:
+        arti = artist_from_title(title)
+        cmbs = filter(lambda a: a is not None,
+                      [(arti, x[1]) if x[0] is None else None for x in cmbs])
+    return list(cmbs)
diff --git a/mps_youtube/listview/songtitle.py b/mps_youtube/listview/songtitle.py
@@ -7,9 +7,11 @@ class ListSongtitle(ListViewItem):
     """
     # pylint: disable=unused-argument
     _checked = False
+    _certainty = 1.0
 
-    def __init__(self, data):
+    def __init__(self, data, certainty=1.0):
         self._checked = True
+        self._certainty = certainty
         super(ListSongtitle, self).__init__(data)
 
     def artist(self, l=10):
@@ -24,6 +26,10 @@ def checked(self, l=10):
         """ String from for checked """
         return "  X  " if self._checked else "     "
 
+    def certainty(self):
+        """ Return the stored certainty value (1.0 by default) """
+        return self._certainty
+
     def is_checked(self):
         """ Returns true if checked """
         return self._checked

diff --git a/mps_youtube/main.py b/mps_youtube/main.py
@@ -120,11 +120,11 @@ def main():
     # open history from file
     history.load()
 
-    arg_inp = ' '.join(g.argument_commands)
-
+    arg_inp = " ".join(g.argument_commands)
+    
     prompt = "> "
-    arg_inp = arg_inp.replace(r",,", "[mpsyt-comma]")
-    arg_inp = arg_inp.split(",")
+    arg_inp = arg_inp.replace(r",,", "[mpsyt-comma]") 
+    arg_inp = arg_inp.split(",") 
 
     while True:
         next_inp = ""

diff --git a/mps_youtube/util.py b/mps_youtube/util.py
@@ -11,7 +11,7 @@
 
 import pafy
 
-from . import g, c, terminalsize
+from . import g, c, terminalsize, description_parser
 from .playlist import Video
 
 
@@ -419,27 +419,8 @@ def load_player_info(player):
         g.mplayer_version = _get_mplayer_version(player)
 
 
-def fetch_songs(text):
-    """ Parses cleartext to find song titles """
-
-    # Removes timestamps
-    ts_reg = r"(?:\(?(?:\d{0,4}:)?\d{0,2}:\d{0,2}\)?(?: - )?){1,2}"
-    text = re.sub(ts_reg, "", text)
-
-    # Removes Tracknumbers.
-    text = re.sub(r"\d+\.", "", text)
-
-    # Removes starting with non words
-    text = re.sub(r"^[^\w&()\[\]'\.\/]", "", text, flags=re.MULTILINE)
-
-    titles = []
-    rgex = r"(([\w&()\[\]'\.\/ ]+)([ ]?[-]+[ ]?)([\w&()\[\]'\.\/ ]+))+"
-    for result in re.findall(rgex, text):
-        artist, track = result[1].strip(), result[3].strip()
-
-        if len("%s - %s" % (artist, track)) > 5:
-            titles.append((artist, track))
-    return titles
+def fetch_songs(text,title="Unknown"):
+    return description_parser.parse(text, title)
 
 
 def number_string_to_list(text):