From 0925b715e2d0513e4cda096e3b3c0ba55fb52444 Mon Sep 17 00:00:00 2001
From: Macoy Madson
Date: Sat, 1 Jun 2019 21:08:06 +0000
Subject: [PATCH] Integrated LikedSavedDatabase

* LikedSavedDatabase now records submissions and their associated files
  after downloading. This will pave the way for more cool features in the
  future
* Split up redditUserImageScraper main function to be a bit more readable
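
For reference, a minimal sketch of the call flow this patch sets up
(`submissions`, `submission`, and `saveFilePath` stand in for whatever the
scrapers and downloader produced at that point):

    import LikedSavedDatabase
    import settings
    import utilities

    settings.getSettings()
    # Create the module-global LikedSavedDatabase.db from the new
    # 'Database' settings key
    LikedSavedDatabase.initializeFromSettings(settings.settings)

    # Record every scraped submission in a single executemany() pass
    LikedSavedDatabase.db.addSubmissions(submissions)

    # After downloading a file, tie it back to its submission row
    LikedSavedDatabase.db.associateFileToSubmission(
        utilities.outputPathToDatabasePath(saveFilePath), submission)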
---
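
Note: this patch does not touch the CREATE TABLE statements, so the schema
below is only inferred from the insert/select statements in
LikedSavedDatabase.py; in particular the UNIQUE constraint on postUrl is an
assumption, made so that "insert or ignore" deduplicates and
associateFileToSubmission() can look rows up by post URL:

    import sqlite3

    connection = sqlite3.connect('LikedSaved.db')
    connection.executescript('''
        CREATE TABLE IF NOT EXISTS Submissions (
            id INTEGER PRIMARY KEY,  -- receives the NULL in the inserts
            source TEXT, title TEXT, author TEXT,
            subreddit TEXT, subredditTitle TEXT,
            body TEXT, bodyUrl TEXT,
            postUrl TEXT UNIQUE      -- assumed; see note above
        );
        CREATE TABLE IF NOT EXISTS FilesToSubmissions (
            filePath TEXT,
            submissionId INTEGER
        );''')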
 LikedSavedDatabase.py         |  30 +++-
 LikedSavedDownloaderServer.py |  12 +-
 imageSaver.py                 |  15 +-
 redditUserImageScraper.py     | 255 ++++++++++++++++++----------------
 settings.py                   |  29 ++--
 submission.py                 |   5 +
 utilities.py                  |  12 +-
 7 files changed, 212 insertions(+), 146 deletions(-)
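
The two path helpers added to utilities.py differ only in whether they keep
a leading 'output' component. Assuming Output_dir were '/home/me/output'
(a hypothetical value), both split off that prefix:

    utilities.outputPathToServerPath('/home/me/output/pics/cat.png')
    # -> 'output/pics/cat.png'  (the path the web server serves)

    utilities.outputPathToDatabasePath('/home/me/output/pics/cat.png')
    # -> '/pics/cat.png'        (stored in the database, rooted at Output_dir)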
diff --git a/LikedSavedDatabase.py b/LikedSavedDatabase.py
index fa03f90..c4d86f4 100644
--- a/LikedSavedDatabase.py
+++ b/LikedSavedDatabase.py
@@ -2,6 +2,9 @@
 import sqlite3
 import submission as Submissions
 
+# Global database
+db = None
+
 class LikedSavedDatabase:
     def __init__(self, databaseFilePath):
         isNewDatabase = not os.path.exists(databaseFilePath)
@@ -46,6 +49,13 @@ def addSubmission(self, submission):
                        submission.getAsList())
         self.save()
 
+    def addSubmissions(self, submissions):
+        cursor = self.dbConnection.cursor()
+
+        cursor.executemany("insert or ignore into Submissions values (NULL,?,?,?,?,?,?,?,?)",
+                           Submissions.getAsList_generator(submissions))
+        self.save()
+
     def printSubmissions(self):
         cursor = self.dbConnection.cursor()
 
@@ -73,11 +83,23 @@ def addSubmissionToCollection(self, submissionId, collectionId):
                        (submissionId, collectionId))
         self.save()
 
-    def associateFileToSubmission(self, filePath, submissionId):
+    def associateFileToSubmissionId(self, filePath, submissionId):
         cursor = self.dbConnection.cursor()
         cursor.execute("insert or ignore into FilesToSubmissions values (?,?)",
                        (filePath, submissionId))
         self.save()
+
+    def associateFileToSubmission(self, filePath, submission):
+        cursor = self.dbConnection.cursor()
+        cursor.execute("select * from Submissions where postUrl=?", (submission.postUrl,))
+        submissionInDb = cursor.fetchone()
+        if submissionInDb:
+            submissionId = submissionInDb[0]
+            cursor.execute("insert or ignore into FilesToSubmissions values (?,?)",
+                           (filePath, submissionId))
+            self.save()
+        else:
+            print("DB error: couldn't find submission from post URL {}".format(submission.postUrl))
 
     def getAllSubmissionsInCollection(self, collectionId):
         cursor = self.dbConnection.cursor()
@@ -126,10 +148,14 @@ def testOnRealSubmissions():
             print("Couldn't find {}".format(title))
         else:
             db.addSubmissionToCollection(dbSubmission[0], dbCollection[0])
-            db.associateFileToSubmission("{}.png".format(title), dbSubmission[0])
+            db.associateFileToSubmissionId("{}.png".format(title), dbSubmission[0])
 
     print(db.getAllSubmissionsInCollection(dbCollection[0]))
     print(db.getAllFiles())
+
+def initializeFromSettings(userSettings):
+    global db
+    db = LikedSavedDatabase(userSettings['Database'])
 
 if __name__ == '__main__':
     #testDatabase()
diff --git a/LikedSavedDownloaderServer.py b/LikedSavedDownloaderServer.py
index 97c43f4..27e234b 100644
--- a/LikedSavedDownloaderServer.py
+++ b/LikedSavedDownloaderServer.py
@@ -12,7 +12,7 @@
 import json
 import multiprocessing
 
-from utilities import sort_naturally
+import utilities
 import settings
 import redditUserImageScraper
 
@@ -44,10 +44,6 @@ def generateSavedImagesCache(outputDir):
 
     print('Finished creating Liked Saved cache ({} images/videos)'.format(len(savedImagesCache)))
 
-def outputPathToServerPath(path):
-    # This is a little weird
-    return 'output' + path.split(settings.settings['Output_dir'])[1]
-
 def getRandomImage(filteredImagesCache=None, randomImageFilter=''):
     if not savedImagesCache:
         generateSavedImagesCache(settings.settings['Output_dir'])
@@ -59,7 +55,7 @@ def getRandomImage(filteredImagesCache=None, randomImageFilter=''):
 
     print('\tgetRandomImage(): Chose random image {} (filter {})'.format(randomImage, randomImageFilter))
 
-    serverPath = outputPathToServerPath(randomImage)
+    serverPath = utilities.outputPathToServerPath(randomImage)
 
     return randomImage, serverPath
 
@@ -377,7 +373,7 @@ def on_message(self, message):
                 for file in files:
                     if file.endswith(supportedExtensions):
                         imagesInFolder.append(os.path.join(root, file))
-                sort_naturally(imagesInFolder)
+                utilities.sort_naturally(imagesInFolder)
                 currentImageIndex = imagesInFolder.index(fullImagePath)
                 if currentImageIndex >= 0:
                     action = 'setImage'
@@ -389,7 +385,7 @@ def on_message(self, message):
                     nextImageIndex = len(imagesInFolder) - 1
                 fullImagePath = imagesInFolder[nextImageIndex]
 
-                serverImagePath = outputPathToServerPath(fullImagePath)
+                serverImagePath = utilities.outputPathToServerPath(fullImagePath)
 
             if command == 'setFilter':
                 newFilter = parsedMessage['filter']
diff --git a/imageSaver.py b/imageSaver.py
index 9651735..4643f91 100644
--- a/imageSaver.py
+++ b/imageSaver.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 
+import LikedSavedDatabase
 import imgurpython as imgur
 import logger
 import os
@@ -304,8 +305,9 @@ def saveAllImgurAlbums(outputDir, imgurAuth, subredditAlbums, soft_retrieve_imgs
         numAlbums = len(albums)
         for albumIndex, album in enumerate(albums):
-            albumTitle = album[0]
-            albumUrl = cleanImgurAlbumUrl(album[1])
+            albumSubmission = album[0]
+            albumTitle = album[1]
+            albumUrl = cleanImgurAlbumUrl(album[2])
 
             logger.log('\t[' + percentageComplete(albumIndex, numAlbums) + '] '
                        + '\t' + albumTitle + ' (' + albumUrl + ')')
@@ -358,6 +360,8 @@ def saveAllImgurAlbums(outputDir, imgurAuth, subredditAlbums, soft_retrieve_imgs
                 if not soft_retrieve_imgs:
                     # Retrieve the image and save it
                     urlretrieve(imageUrl, saveFilePath)
+                    LikedSavedDatabase.db.associateFileToSubmission(
+                        utilities.outputPathToDatabasePath(saveFilePath), albumSubmission)
 
                 logger.log('\t\t[' + percentageComplete(imageIndex, numImages) + '] '
                            + ' [save] ' + imageUrl + ' saved to "' + saveAlbumPath + '"')
@@ -425,9 +429,9 @@ def saveAllImages(outputDir, submissions, imgur_auth = None, only_download_album
             else:
                 # We're going to save Imgur Albums at a separate stage
                 if subredditDir in imgurAlbumsToSave:
-                    imgurAlbumsToSave[subredditDir].append((submissionTitle, url))
+                    imgurAlbumsToSave[subredditDir].append((submission, submissionTitle, url))
                 else:
-                    imgurAlbumsToSave[subredditDir] = [(submissionTitle, url)]
+                    imgurAlbumsToSave[subredditDir] = [(submission, submissionTitle, url)]
                 continue
         elif only_download_albums:
             continue
@@ -497,6 +501,9 @@ def saveAllImages(outputDir, submissions, imgur_auth = None, only_download_album
             # Retrieve the image and save it
             try:
                 urlretrieve(url, saveFilePath)
+
+                LikedSavedDatabase.db.associateFileToSubmission(
+                    utilities.outputPathToDatabasePath(saveFilePath), submission)
             except IOError as e:
                 logger.log('[ERROR] IOError: Url {0} raised exception:\n\t{1} {2}'
                            .format(url, e.errno, e.strerror))
diff --git a/redditUserImageScraper.py b/redditUserImageScraper.py
index 11f2365..05a4fcf 100644
--- a/redditUserImageScraper.py
+++ b/redditUserImageScraper.py
@@ -3,6 +3,7 @@
 import time
 import os
 
+import LikedSavedDatabase
 import imageSaver
 import logger
 import redditScraper
@@ -14,124 +15,140 @@ scriptFinishedSentinel = '>>> runLikedSavedDownloader() Process Finished <<<'
 
 def runLikedSavedDownloader(pipeConnection):
-    if pipeConnection:
-        logger.setPipe(pipeConnection)
-
-    settings.getSettings()
-
-    if (not settings.settings['Use_cached_submissions']
-        and not settings.hasTumblrSettings() and not settings.hasRedditSettings()):
-        logger.log('Please provide Tumblr or Reddit account details settings.txt')
-        return
-
-    imgurAuth = None
-    if (settings.settings['Should_download_albums']
-        and settings.hasImgurSettings()):
-        imgurAuth = imageSaver.ImgurAuth(settings.settings['Imgur_client_id'],
-                                         settings.settings['Imgur_client_secret'])
-    else:
-        logger.log('No Imgur Client ID and/or Imgur Client Secret was provided, or album download is not'
-                   ' enabled. This is required to download imgur albums. They will be ignored. Check'
-                   ' settings.txt for how to fill in these values.')
-
-    if not settings.settings['Gfycat_Client_id']:
-        logger.log('No Gfycat Client ID and/or Gfycat Client Secret was provided, or album download is not'
-                   ' enabled. This is required to download Gfycat media reliably.')
-
-    logger.log('Output: ' + settings.settings['Output_dir'])
-    utilities.makeDirIfNonexistant(settings.settings['Output_dir'])
-
-    # TODO: Only save one post for early out. Only save once all downloading is done
-    redditRequestOnlyNewSavedCache = None
-    redditRequestOnlyNewLikedCache = None
-    if settings.settings['Reddit_Try_Request_Only_New']:
-        redditRequestOnlyNewSavedCache = submission.readCacheSubmissions(
-            settings.settings['Reddit_Try_Request_Only_New_Saved_Cache_File'])
-        redditRequestOnlyNewLikedCache = submission.readCacheSubmissions(
-            settings.settings['Reddit_Try_Request_Only_New_Liked_Cache_File'])
-
-    tumblrRequestOnlyNewCache = None
-    if settings.settings['Tumblr_Try_Request_Only_New']:
-        tumblrRequestOnlyNewCache = submission.readCacheSubmissions(
-            settings.settings['Tumblr_Try_Request_Only_New_Cache_File'])
-
-    submissions = []
-
-    if settings.settings['Use_cached_submissions']:
-        logger.log('Using cached submissions')
-        submissions += submission.readCacheSubmissions(settings.settings['Reddit_cache_file'])
-        submissions += submission.readCacheSubmissions(settings.settings['Tumblr_cache_file'])
-    else:
-        if settings.hasRedditSettings():
-            redditSubmissions, redditComments, earlyOutPoints = redditScraper.getRedditUserLikedSavedSubmissions(
-                settings.settings['Username'], settings.settings['Password'],
-                settings.settings['Client_id'], settings.settings['Client_secret'],
-                request_limit = settings.settings['Reddit_Total_requests'],
-                saveLiked = settings.settings['Reddit_Save_Liked'],
-                saveSaved = settings.settings['Reddit_Save_Saved'],
-                earlyOutPointSaved = redditRequestOnlyNewSavedCache,
-                earlyOutPointLiked = redditRequestOnlyNewLikedCache,
-                unlikeLiked = settings.settings['Reddit_Unlike_Liked'],
-                unsaveSaved = settings.settings['Reddit_Unsave_Saved'])
-
-            # Cache them in case it's needed later
-            submission.writeCacheSubmissions(redditSubmissions, settings.settings['Reddit_cache_file'])
-
-            # Set new early out points
-            submission.writeCacheSubmissions([earlyOutPoints[0]],
-                settings.settings['Reddit_Try_Request_Only_New_Saved_Cache_File'])
-            submission.writeCacheSubmissions([earlyOutPoints[1]],
-                settings.settings['Reddit_Try_Request_Only_New_Liked_Cache_File'])
-
-            submissions += redditSubmissions
-
-            # For reddit only: write out comments to separate json file
-            if settings.settings['Reddit_Save_Comments']:
-                submission.saveSubmissionsAsJson(redditComments, settings.settings['Output_dir'] + u'/'
-                    + 'Reddit_SavedComment_Submissions_' + time.strftime("%Y%m%d-%H%M%S") + '.json')
-                submission.saveSubmissionsAsHtml(redditComments, settings.settings['Output_dir'] + u'/'
-                    + 'Reddit_SavedComment_Submissions_' + time.strftime("%Y%m%d-%H%M%S") + '.html')
-                logger.log('Saved ' + str(len(redditComments)) + ' reddit comments')
-
-        if settings.hasTumblrSettings():
-            tumblrSubmissions, earlyOutPoint = tumblrScraper.getTumblrUserLikedSubmissions(
-                settings.settings['Tumblr_Client_id'], settings.settings['Tumblr_Client_secret'],
-                settings.settings['Tumblr_Client_token'], settings.settings['Tumblr_Client_token_secret'],
-                likeRequestLimit = settings.settings['Tumblr_Total_requests'],
-                requestOnlyNewCache = tumblrRequestOnlyNewCache)
-
-            # Cache them in case it's needed later
-            submission.writeCacheSubmissions(tumblrSubmissions, settings.settings['Tumblr_cache_file'])
-
-            # Set new early out point
-            submission.writeCacheSubmissions([earlyOutPoint],
-                settings.settings['Tumblr_Try_Request_Only_New_Cache_File'])
-
-            submissions += tumblrSubmissions
-
-    # Write out a .json file with all of the submissions in case the user wants the data
-    submission.saveSubmissionsAsJson(submissions, settings.settings['Output_dir'] + u'/'
-        + 'AllSubmissions_' + time.strftime("%Y%m%d-%H%M%S") + '.json')
-
-    logger.log('Saving images. This will take several minutes...')
-    unsupportedSubmissions = imageSaver.saveAllImages(settings.settings['Output_dir'], submissions,
-        imgur_auth = imgurAuth, only_download_albums = settings.settings['Only_download_albums'],
-        skip_n_percent_submissions = settings.settings['Skip_n_percent_submissions'],
-        soft_retrieve_imgs = settings.settings['Should_soft_retrieve'],
-        only_important_messages = settings.settings['Only_important_messages'])
-
-    # Write out a .json file listing all of the submissions the script failed to download
-    submission.saveSubmissionsAsJson(unsupportedSubmissions, settings.settings['Output_dir'] + u'/'
-        + 'UnsupportedSubmissions_' + time.strftime("%Y%m%d-%H%M%S") + '.json')
-
-    if settings.settings['Should_soft_retrieve']:
-        logger.log('\nYou have run the script in Soft Retrieve mode - if you actually\n'
-                   'want to download images now, you should change SHOULD_SOFT_RETRIEVE\n'
-                   'to False in settings.txt')
-
-    if pipeConnection:
-        logger.log(scriptFinishedSentinel)
-        pipeConnection.close()
+    if pipeConnection:
+        logger.setPipe(pipeConnection)
+
+    settings.getSettings()
+
+    if not settings.settings['Database']:
+        logger.log('Please provide a location for the Database')
+        return
+
+    # Do this early so we can use it anywhere
+    LikedSavedDatabase.initializeFromSettings(settings.settings)
+
+    if (not settings.settings['Use_cached_submissions']
+        and not settings.hasTumblrSettings() and not settings.hasRedditSettings()):
+        logger.log('Please provide Tumblr or Reddit account details in settings.txt'
+                   ' or via the Settings page provided by the LikedSavedDownloader server')
+        return
+
+    imgurAuth = None
+    if (settings.settings['Should_download_albums']
+        and settings.hasImgurSettings()):
+        imgurAuth = imageSaver.ImgurAuth(settings.settings['Imgur_client_id'],
+                                         settings.settings['Imgur_client_secret'])
+    else:
+        logger.log('No Imgur Client ID and/or Imgur Client Secret was provided, or album download is not'
+                   ' enabled. This is required to download imgur albums. They will be ignored. Check'
+                   ' settings.txt for how to fill in these values.')
+
+    if not settings.settings['Gfycat_Client_id']:
+        logger.log('No Gfycat Client ID and/or Gfycat Client Secret was provided, or album download is not'
+                   ' enabled. This is required to download Gfycat media reliably.')
+
+    logger.log('Output: ' + settings.settings['Output_dir'])
+    utilities.makeDirIfNonexistant(settings.settings['Output_dir'])
+
+    submissions = getSubmissionsToSave()
+
+    logger.log('Saving images. This will take several minutes...')
+    unsupportedSubmissions = imageSaver.saveAllImages(settings.settings['Output_dir'], submissions,
+        imgur_auth = imgurAuth, only_download_albums = settings.settings['Only_download_albums'],
+        skip_n_percent_submissions = settings.settings['Skip_n_percent_submissions'],
+        soft_retrieve_imgs = settings.settings['Should_soft_retrieve'],
+        only_important_messages = settings.settings['Only_important_messages'])
+
+    # Write out a .json file listing all of the submissions the script failed to download
+    submission.saveSubmissionsAsJson(unsupportedSubmissions, settings.settings['Output_dir'] + u'/'
+        + 'UnsupportedSubmissions_' + time.strftime("%Y%m%d-%H%M%S") + '.json')
+
+    if settings.settings['Should_soft_retrieve']:
+        logger.log('\nYou have run the script in Soft Retrieve mode - if you actually\n'
+                   'want to download images now, you should change SHOULD_SOFT_RETRIEVE\n'
+                   'to False in settings.txt')
+
+    if pipeConnection:
+        logger.log(scriptFinishedSentinel)
+        pipeConnection.close()
+
+def getSubmissionsToSave():
+    # TODO: Only save one post for early out. Only save once all downloading is done
+    redditRequestOnlyNewSavedCache = None
+    redditRequestOnlyNewLikedCache = None
+    if settings.settings['Reddit_Try_Request_Only_New']:
+        redditRequestOnlyNewSavedCache = submission.readCacheSubmissions(
+            settings.settings['Reddit_Try_Request_Only_New_Saved_Cache_File'])
+        redditRequestOnlyNewLikedCache = submission.readCacheSubmissions(
+            settings.settings['Reddit_Try_Request_Only_New_Liked_Cache_File'])
+
+    tumblrRequestOnlyNewCache = None
+    if settings.settings['Tumblr_Try_Request_Only_New']:
+        tumblrRequestOnlyNewCache = submission.readCacheSubmissions(
+            settings.settings['Tumblr_Try_Request_Only_New_Cache_File'])
+
+    submissions = []
+
+    if settings.settings['Use_cached_submissions']:
+        logger.log('Using cached submissions')
+        submissions += submission.readCacheSubmissions(settings.settings['Reddit_cache_file'])
+        submissions += submission.readCacheSubmissions(settings.settings['Tumblr_cache_file'])
+    else:
+        if settings.hasRedditSettings():
+            redditSubmissions, redditComments, earlyOutPoints = redditScraper.getRedditUserLikedSavedSubmissions(
+                settings.settings['Username'], settings.settings['Password'],
+                settings.settings['Client_id'], settings.settings['Client_secret'],
+                request_limit = settings.settings['Reddit_Total_requests'],
+                saveLiked = settings.settings['Reddit_Save_Liked'],
+                saveSaved = settings.settings['Reddit_Save_Saved'],
+                earlyOutPointSaved = redditRequestOnlyNewSavedCache,
+                earlyOutPointLiked = redditRequestOnlyNewLikedCache,
+                unlikeLiked = settings.settings['Reddit_Unlike_Liked'],
+                unsaveSaved = settings.settings['Reddit_Unsave_Saved'])
+
+            # Cache them in case it's needed later
+            submission.writeCacheSubmissions(redditSubmissions, settings.settings['Reddit_cache_file'])
+
+            # Set new early out points
+            submission.writeCacheSubmissions([earlyOutPoints[0]],
+                settings.settings['Reddit_Try_Request_Only_New_Saved_Cache_File'])
+            submission.writeCacheSubmissions([earlyOutPoints[1]],
+                settings.settings['Reddit_Try_Request_Only_New_Liked_Cache_File'])
+
+            submissions += redditSubmissions
+
+            # For reddit only: write out comments to separate json file
+            if settings.settings['Reddit_Save_Comments']:
+                submission.saveSubmissionsAsJson(redditComments, settings.settings['Output_dir'] + u'/'
+                    + 'Reddit_SavedComment_Submissions_' + time.strftime("%Y%m%d-%H%M%S") + '.json')
+                submission.saveSubmissionsAsHtml(redditComments, settings.settings['Output_dir'] + u'/'
+                    + 'Reddit_SavedComment_Submissions_' + time.strftime("%Y%m%d-%H%M%S") + '.html')
+                logger.log('Saved ' + str(len(redditComments)) + ' reddit comments')
+
+        if settings.hasTumblrSettings():
+            tumblrSubmissions, earlyOutPoint = tumblrScraper.getTumblrUserLikedSubmissions(
+                settings.settings['Tumblr_Client_id'], settings.settings['Tumblr_Client_secret'],
+                settings.settings['Tumblr_Client_token'], settings.settings['Tumblr_Client_token_secret'],
+                likeRequestLimit = settings.settings['Tumblr_Total_requests'],
+                requestOnlyNewCache = tumblrRequestOnlyNewCache)
+
+            # Cache them in case it's needed later
+            submission.writeCacheSubmissions(tumblrSubmissions, settings.settings['Tumblr_cache_file'])
+
+            # Set new early out point
+            submission.writeCacheSubmissions([earlyOutPoint],
+                settings.settings['Tumblr_Try_Request_Only_New_Cache_File'])
+
+            submissions += tumblrSubmissions
+
+    # Write out a .json file with all of the submissions in case the user wants the data
+    submission.saveSubmissionsAsJson(submissions, settings.settings['Output_dir'] + u'/'
+        + 'AllSubmissions_' + time.strftime("%Y%m%d-%H%M%S") + '.json')
+
+    LikedSavedDatabase.db.addSubmissions(submissions)
+
+    return submissions
+
 if __name__ == '__main__':
-    runLikedSavedDownloader(None)
+    runLikedSavedDownloader(None)
diff --git a/settings.py b/settings.py
index 04d270d..ff64736 100644
--- a/settings.py
+++ b/settings.py
@@ -77,21 +77,22 @@
     # made previous submissions successfully download, so we always re-check submissions
     'Skip_n_percent_submissions': 0,
 
-    'Output_dir' : 'output'
+    'Output_dir' : 'output',
+    'Database' : 'LikedSaved.db'
 }
 
 redditClientSecretInstructions = '''You need OAuth tokens to run the script. To get them follow these steps:

-Go to Reddit app preferences (while signed in to reddit)
-Scroll down to the bottom and click "create app" (something like that)
-Fill in the fields as such:
+Go to Reddit app preferences (while signed in to reddit)
+Scroll down to the bottom and click "create app" (something like that)
+Fill in the fields as such:
-Click create app
+Click create app
 Copy the text which is right below "personal use script" for Client ID
 Copy the secret for Client Secret as well
@@ -172,7 +173,7 @@
          'If true, do not download single images, only submissions which are imgur albums')
      ]),
 
-    ('Debugging',
+    ('Debugging and Development',
      [
          ('Only_important_messages', 'Output minimal information to the console'),
          ('Use_cached_submissions', 'Do not get new stuff, just use the cache files from last run'),
@@ -180,7 +181,9 @@
          'Tumblr_cache_file',
          ('Skip_n_percent_submissions',
           "If the script failed at say 70%, you could toggle Use_cached_submissions and set this value to 69. The script would then restart 69% of the way into the cached submissions, nearer to where you left off. The reason this isn't the default is that there might have been changes to the script which made previously failing submissions download successfully, so we always re-check submissions"),
-         ('Should_soft_retrieve', "If True, don't actually download the images - just pretend to")
+         ('Should_soft_retrieve', "If True, don't actually download the images - just pretend to"),
+
+         ('Database')
      ]),
 ]
 
@@ -293,5 +296,7 @@ def writeServerSettings():
 
 def getSettings():
     settingsFilename = getSettingsFilename()
-    print('Reading settings from ' + settingsFilename)
+    print('Reading settings from the settings file with the most recent timestamp:\n'
+          + settingsFilename
+          + "\nTo read from a different settings file, touch it so it has the most recent timestamp")
     readSettings(settingsFilename)
diff --git a/submission.py b/submission.py
index e26ea89..5516ac0 100644
--- a/submission.py
+++ b/submission.py
@@ -52,6 +52,11 @@ def getAsList(self):
         return [self.source, self.title, self.author,
                 self.subreddit, self.subredditTitle,
                 self.body, self.bodyUrl, self.postUrl]
+
+
+def getAsList_generator(submissions):
+    for submission in submissions:
+        yield submission.getAsList()
 
 def writeOutSubmissionsAsJson(redditList, file):
     file.write('{\n'.encode('utf8'))
diff --git a/utilities.py b/utilities.py
index bbc67c7..f53cd09 100644
--- a/utilities.py
+++ b/utilities.py
@@ -1,7 +1,8 @@
-# From https://stackoverflow.com/questions/4623446/how-do-you-sort-files-numerically
 import re
 import os
+import settings
 
+# From https://stackoverflow.com/questions/4623446/how-do-you-sort-files-numerically
 def tryint(s):
     try:
         return int(s)
@@ -22,3 +23,12 @@ def sort_naturally(l):
 def makeDirIfNonexistant(directory):
     if not os.path.exists(directory):
         os.makedirs(directory)
+
+def outputPathToServerPath(path):
+    # This is a little weird
+    return 'output' + path.split(settings.settings['Output_dir'])[1]
+
+# For the DB, just have the root be output_dir
+def outputPathToDatabasePath(path):
+    # This is a little weird
+    return path.split(settings.settings['Output_dir'])[1]