Skip to content

Commit

Permalink
Initial commit.
Browse files Browse the repository at this point in the history
  • Loading branch information
malfunct committed Apr 11, 2018
1 parent e0cdc38 commit 14e6ecf
Show file tree
Hide file tree
Showing 8 changed files with 395 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
*.userosscache
*.sln.docstates

# Visual Studio 2017 local cache/database folder
/.vs/

# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs

Expand Down
22 changes: 22 additions & 0 deletions SyncCollection.sln
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26206.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SyncCollection", "SyncCollection\SyncCollection.csproj", "{D08AC687-571B-44E1-8BD0-4BA69152467C}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{D08AC687-571B-44E1-8BD0-4BA69152467C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D08AC687-571B-44E1-8BD0-4BA69152467C}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D08AC687-571B-44E1-8BD0-4BA69152467C}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D08AC687-571B-44E1-8BD0-4BA69152467C}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal
6 changes: 6 additions & 0 deletions SyncCollection/App.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2" />
</startup>
</configuration>
45 changes: 45 additions & 0 deletions SyncCollection/InternetArchiveSearchResult.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace SyncCollection
{
/// <summary>
/// Root object that Program.GetSearchResults deserializes the search JSON into.
/// Note this class has no access modifier (internal), while the types it exposes are public.
/// </summary>
// NOTE(review): property names presumably mirror the JSON keys of the search response - confirm against the API.
class InternetArchiveSearchResult
{
// Not read by the code visible in this project; only <see cref="response"/> is consumed.
public ResponseHeader responseHeader { get; set; }
// Holds the result list that GetSearchResults iterates (response.docs).
public Response response { get; set; }
}

/// <summary>
/// Value of <c>ResponseHeader.@params</c>.
/// </summary>
// NOTE(review): presumably the request parameters echoed back by the search service
// (query, field list, sort, rows, start) - confirm against the API; none of these
// properties are read by the code visible in this project.
public class Params
{
public string q { get; set; }
public string qin { get; set; }
public string fl { get; set; }
public string wt { get; set; }
public string sort { get; set; }
// Declared as string here, unlike the int <c>start</c> below.
public string rows { get; set; }
public int start { get; set; }
}

/// <summary>
/// Header section of the search result (<c>InternetArchiveSearchResult.responseHeader</c>).
/// </summary>
// NOTE(review): status/QTime look like a status code and query time - confirm units and meaning against the API.
public class ResponseHeader
{
public int status { get; set; }
public int QTime { get; set; }
// "params" is a C# keyword, so the property needs the verbatim-identifier prefix "@".
public Params @params { get; set; }
}

/// <summary>
/// One entry of <c>Response.docs</c>: a single item returned by the search.
/// </summary>
public class Doc
{
// Used by Program as the dictionary key, in the download URL and as the local zip file name.
public string identifier { get; set; }
// Program.GetSearchResults takes the LAST element of this list as the item's "Updated" date.
public List<DateTime> oai_updatedate { get; set; }
}

/// <summary>
/// Body section of the search result (<c>InternetArchiveSearchResult.response</c>).
/// </summary>
public class Response
{
// NOTE(review): presumably the total number of matches, which may exceed docs.Count - confirm against the API.
public int numFound { get; set; }
// NOTE(review): presumably the zero-based offset of the first returned doc - confirm against the API.
public int start { get; set; }
// The items Program.GetSearchResults iterates to build its identifier -> date map.
public List<Doc> docs { get; set; }
}
}
203 changes: 203 additions & 0 deletions SyncCollection/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Threading.Tasks;

namespace SyncCollection
{
class Program
{
// Collection identifier used by Main when no command-line argument is supplied.
const string Collection = "apple_ii_library_4am";
// Value substituted into the rows= parameter of the search URL built in GetSearchResults.
const string Rows = "2000";

/// <summary>
/// Program entry point. Synchronises the collection named by the first command-line
/// argument, or the default <c>Collection</c> when no argument is given.
/// </summary>
/// <param name="args">Command-line arguments; only <c>args[0]</c> (collection identifier) is used.</param>
static void Main(string[] args)
{
    string collection = Collection;
    if (args.Length > 0)
    {
        collection = args[0];
    }
    else
    {
        Console.WriteLine($"Using default collection {collection}");
    }

    // GetAwaiter().GetResult() blocks like Wait() but rethrows the original exception
    // instead of wrapping it in an AggregateException, so a failure reports its real cause.
    MainAsync(collection).GetAwaiter().GetResult();
}

/// <summary>
/// Runs one sync pass: queries the remote item list, loads the local download
/// record, then downloads whatever is missing or out of date.
/// </summary>
/// <param name="collection">Collection identifier; also the local directory name.</param>
static async Task MainAsync(string collection)
{
    // The remote query runs first, so a failed search aborts before anything local is touched.
    Dictionary<string, DateTime> remoteItems = await GetSearchResults(collection);
    Dictionary<string, DateTime> knownLocalItems = GetListOfAlreadyDownloadedFiles(collection);

    DownloadFiles(remoteItems, knownLocalItems, collection);
}

/// <summary>
/// Renames the collection's <c>fileList.txt</c> to <c>fileListOld.txt</c>, replacing any
/// previous <c>fileListOld.txt</c>. Does nothing when there is no <c>fileList.txt</c>.
/// </summary>
/// <param name="collection">Directory that holds the list files.</param>
private static void ArchiveOldDownloadList(string collection)
{
    string currentList = Path.Combine(collection, "fileList.txt");
    string previousList = Path.Combine(collection, "fileListOld.txt");

    // Nothing to archive on a first run.
    if (!File.Exists(currentList))
    {
        return;
    }

    // File.Move refuses to overwrite, so clear the destination first.
    bool staleBackupPresent = File.Exists(previousList);
    if (staleBackupPresent)
    {
        File.Delete(previousList);
    }

    File.Move(currentList, previousList);
}

/// <summary>
/// Downloads every item in <paramref name="searchResults"/> that is either absent from
/// <paramref name="localFileList"/> or has a newer remote date than the recorded one.
/// Each item is saved as <c>{collection}/{identifier}.zip</c>. The download record
/// (fileList.txt) is rewritten after every attempt and once more when the method exits.
/// </summary>
/// <param name="searchResults">Remote identifier -> update date map.</param>
/// <param name="localFileList">Identifier -> update date map of what is already downloaded.</param>
/// <param name="collection">Collection identifier; also the local target directory.</param>
/// <remarks>
/// A <see cref="WebException"/> on one item is logged and skipped so the rest of the
/// collection can still be fetched. Any other exception is logged and ends the loop.
/// </remarks>
private static void DownloadFiles(Dictionary<string, DateTime> searchResults,
    Dictionary<string, DateTime> localFileList, string collection)
{
    ArchiveOldDownloadList(collection);

    Dictionary<string, DateTime> updatedFileList = new Dictionary<string, DateTime>(localFileList);
    string currentlyDownloading = null;

    try
    {
        const string resourceBase = "https://archive.org/compress";

        // WebClient is IDisposable; the using block releases it even when the loop throws.
        using (WebClient client = new WebClient())
        {
            foreach (KeyValuePair<string, DateTime> remoteItem in searchResults)
            {
                string indicatorToDownload = remoteItem.Key;
                DateTime remoteDate = remoteItem.Value;

                // Skip items that are already recorded locally and are not older than the remote copy.
                DateTime localDate;
                if (localFileList.TryGetValue(indicatorToDownload, out localDate) && remoteDate <= localDate)
                {
                    continue;
                }

                currentlyDownloading = $"{collection}/{indicatorToDownload}.zip";

                var url = $"{resourceBase}/{indicatorToDownload}";

                Console.WriteLine("Downloading {0}", currentlyDownloading);
                Console.WriteLine("Downloading from {0}", url);

                bool success = false;
                try
                {
                    client.DownloadFile(url, currentlyDownloading);
                    success = true;
                }
                catch (WebException e)
                {
                    // Just skip WebExceptions and clean up so we can
                    // download as much of the collection as possible.
                    Console.WriteLine("Error while downloading {0}", e.Message);

                    // Delete the failed (possibly partial) download.
                    if (File.Exists(currentlyDownloading))
                    {
                        File.Delete(currentlyDownloading);
                    }

                    // The failure has been fully cleaned up; nothing is pending for the finally block.
                    currentlyDownloading = null;

                    UpdateLocalFileList(updatedFileList, collection);
                }

                if (success)
                {
                    currentlyDownloading = null;
                    updatedFileList[indicatorToDownload] = remoteDate;

                    // We churn this file a lot so we don't lose much state if the process is interrupted.
                    UpdateLocalFileList(updatedFileList, collection);
                }

                // Pause between download attempts to go easy on archive.org.
                System.Threading.Thread.Sleep(500);
            }
        }
    }
    catch (Exception exc)
    {
        Console.WriteLine("Error while downloading {0}", exc.Message);
    }
    finally
    {
        // A non-null path here means a download was interrupted by a non-web exception:
        // remove the partial file so it is not mistaken for a complete one.
        if (currentlyDownloading != null && File.Exists(currentlyDownloading))
        {
            File.Delete(currentlyDownloading);
        }

        UpdateLocalFileList(updatedFileList, collection);
    }
}

/// <summary>
/// Persists the download record to <c>{collection}/fileList.txt</c>, one
/// <c>identifier&lt;TAB&gt;date</c> line per entry (date formatted with the current culture).
/// </summary>
/// <param name="updatedFileList">Identifier -> update date map to write.</param>
/// <param name="collection">Directory that holds fileList.txt.</param>
/// <remarks>
/// The list is written to a temporary sibling file and then swapped into place, so an
/// interruption while writing never leaves a truncated or missing fileList.txt.
/// </remarks>
private static void UpdateLocalFileList(Dictionary<string, DateTime> updatedFileList, string collection)
{
    var filePath = Path.Combine(collection, "fileList.txt");
    var tempPath = filePath + ".tmp";

    // append: false truncates any temp file left behind by an earlier interrupted run.
    using (var fileWriter = new StreamWriter(tempPath, false))
    {
        foreach (KeyValuePair<string, DateTime> entry in updatedFileList)
        {
            fileWriter.WriteLine($"{entry.Key}\t{entry.Value}");
        }
    }

    if (File.Exists(filePath))
    {
        // Swap the finished file in; null means no backup copy of the old list is kept.
        File.Replace(tempPath, filePath, null);
    }
    else
    {
        File.Move(tempPath, filePath);
    }
}

/// <summary>
/// Loads the record of already-downloaded items for a collection, creating the
/// collection directory if it does not exist yet.
/// </summary>
/// <param name="collection">Collection identifier; also the local directory name.</param>
/// <returns>
/// Identifier -> update date map read from <c>fileList.txt</c>; empty when no record exists.
/// </returns>
/// <remarks>
/// Blank or malformed lines are reported and skipped instead of aborting the sync; the
/// affected item is simply treated as not downloaded. When <c>fileList.txt</c> is missing
/// but <c>fileListOld.txt</c> exists, the old list is used. That state arises when a
/// previous run was stopped after the list was archived but before it was rewritten.
/// </remarks>
private static Dictionary<string, DateTime> GetListOfAlreadyDownloadedFiles(string collection)
{
    Dictionary<string, DateTime> localFileList = new Dictionary<string, DateTime>(5000);

    if (!Directory.Exists(collection))
    {
        Directory.CreateDirectory(collection);
    }

    string fileListPath = Path.Combine(collection, "fileList.txt");

    if (!File.Exists(fileListPath))
    {
        string archivedListPath = Path.Combine(collection, "fileListOld.txt");
        if (!File.Exists(archivedListPath))
        {
            return localFileList;
        }

        Console.WriteLine("{0} not found, using {1} instead", fileListPath, archivedListPath);
        fileListPath = archivedListPath;
    }

    foreach (var line in File.ReadAllLines(fileListPath))
    {
        if (string.IsNullOrWhiteSpace(line))
        {
            continue;
        }

        var split = line.Split('\t');

        // TryParse uses the current culture, matching how UpdateLocalFileList formats the date.
        DateTime updated;
        if (split.Length < 2 || !DateTime.TryParse(split[1], out updated))
        {
            Console.WriteLine("Ignoring malformed line in {0}: {1}", fileListPath, line);
            continue;
        }

        localFileList[split[0]] = updated;
    }

    return localFileList;
}

/// <summary>
/// Queries the archive.org advanced search for every item in a collection.
/// </summary>
/// <param name="collection">Collection identifier to search for.</param>
/// <returns>
/// Map of item identifier to its update date (the last element of <c>oai_updatedate</c>).
/// Items without any update date are mapped to <see cref="DateTime.MinValue"/>, so they are
/// downloaded once and not treated as updated afterwards.
/// </returns>
/// <exception cref="HttpRequestException">The first search request did not succeed.</exception>
/// <remarks>
/// Results are fetched page by page (<c>Rows</c> per page) until all <c>numFound</c> items
/// are collected, so collections larger than one page are no longer silently truncated.
/// If a later page fails, the items gathered so far are returned with a warning.
/// </remarks>
private static async Task<Dictionary<string, DateTime>> GetSearchResults(string collection)
{
    Dictionary<string, DateTime> searchResultPairs = new Dictionary<string, DateTime>(5000);

    // The collection name comes from the command line; escape it before placing it in the query string.
    string escapedCollection = Uri.EscapeDataString(collection);

    using (var httpClient = new HttpClient())
    {
        httpClient.DefaultRequestHeaders.Accept.Clear();
        httpClient.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json"));

        for (int page = 1; ; page++)
        {
            var url =
                $"https://archive.org/advancedsearch.php?q=collection%3A{escapedCollection}&fl%5B%5D=identifier&fl%5B%5D=oai_updatedate&sort%5B%5D=identifier+asc&sort%5B%5D=&sort%5B%5D=&rows={Rows}&page={page}&output=json";

            string jsonResult;
            using (var response = await httpClient.GetAsync(url))
            {
                if (!response.IsSuccessStatusCode)
                {
                    Console.WriteLine("Request for collection list failed with the error: ({0}) {1}", response.StatusCode,
                        response.ReasonPhrase);

                    if (page == 1)
                    {
                        throw new HttpRequestException("Request Failed");
                    }

                    // Keep what earlier pages delivered rather than discarding the whole listing.
                    Console.WriteLine("Warning: stopping at page {0}; the collection list may be incomplete", page);
                    break;
                }

                jsonResult = await response.Content.ReadAsStringAsync();
            }

            var searchResult = JsonConvert.DeserializeObject<InternetArchiveSearchResult>(jsonResult);

            if (searchResult == null || searchResult.response == null || searchResult.response.docs == null)
            {
                Console.WriteLine("Warning: search response for page {0} contained no result list", page);
                break;
            }

            if (searchResult.response.docs.Count == 0)
            {
                break;
            }

            int countBeforePage = searchResultPairs.Count;

            foreach (var docDescriptor in searchResult.response.docs)
            {
                if (docDescriptor == null || string.IsNullOrEmpty(docDescriptor.identifier))
                {
                    continue;
                }

                // Last element in oai_updatedate is the "Updated" date.
                bool hasDate = docDescriptor.oai_updatedate != null && docDescriptor.oai_updatedate.Count > 0;
                searchResultPairs[docDescriptor.identifier] =
                    hasDate ? docDescriptor.oai_updatedate.Last() : DateTime.MinValue;
            }

            // Stop once everything is collected, or when a page adds nothing new.
            // The second check guarantees termination even if paging is ignored by the server.
            bool pageAddedNothing = searchResultPairs.Count == countBeforePage;
            if (pageAddedNothing || searchResultPairs.Count >= searchResult.response.numFound)
            {
                break;
            }
        }
    }

    return searchResultPairs;
}
}
}
36 changes: 36 additions & 0 deletions SyncCollection/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

// General Information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
// NOTE(review): title and product read "Sync4am" while the project, solution and
// namespace are all named SyncCollection - confirm which name is intended.
[assembly: AssemblyTitle("Sync4am")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("Sync4am")]
[assembly: AssemblyCopyright("Copyright © 2017")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]

// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]

// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("d08ac687-571b-44e1-8bd0-4ba69152467c")]

// Version information for an assembly consists of the following four values:
//
// Major Version
// Minor Version
// Build Number
// Revision
//
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]
Loading

2 comments on commit 14e6ecf

@WaffEB
Copy link

@WaffEB WaffEB commented on 14e6ecf Jan 27, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The explanation on your website is not really helping how to use the Downloader, that is why I can't download any other Files than the Default ones.

@malfunct
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The explanation on your website is not really helping how to use the Downloader, that is why I can't download any other Files than the Default ones.

I have updated the readme and hopefully the added detail helps.

Please sign in to comment.