Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

revert #16663, which was a revert of #16039 #16675

Merged
merged 7 commits into from Jun 13, 2017
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 0 additions & 1 deletion doc/source/whatsnew/v0.20.3.txt
Expand Up @@ -54,7 +54,6 @@ Indexing
I/O
^^^

- Bug in ``pd.read_csv()`` in which files containing EOF characters mid-field could fail with the C engine on Windows (:issue:`16039`, :issue:`16559`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

re-add this (and add this PR number on as well)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok.



Plotting
Expand Down
148 changes: 90 additions & 58 deletions pandas/_libs/src/parser/io.c
Expand Up @@ -9,33 +9,46 @@ The full license is in the LICENSE file, distributed with this software.

#include "io.h"

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#ifndef O_BINARY
#define O_BINARY 0
#endif /* O_BINARY */

/*
On-disk FILE, uncompressed
*/

void *new_file_source(char *fname, size_t buffer_size) {
file_source *fs = (file_source *)malloc(sizeof(file_source));
fs->fp = fopen(fname, "rb");

if (fs->fp == NULL) {
free(fs);
if (fs == NULL) {
return NULL;
}
setbuf(fs->fp, NULL);

fs->initial_file_pos = ftell(fs->fp);
fs->fd = open(fname, O_RDONLY | O_BINARY);
if (fs->fd == -1) {
goto err_free;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would in-line these instead of goto

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

im not keen on replacing the goto with duplicate code in all the error paths. in my experience using goto in this specific situation, ie, unwinding the generation of state like this, has more benefits than caveats. inlining the cleanup leads to larger code, and is more error prone to maintain. if you add the allocation of something else in the future, you have to judge and fix the cleanup in multiple places rather than adding to the stack of goto targets.

if you feel strongly about it ill change it. after all, im just visiting :). i just wanted to discuss why i did it like this in the first place.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i am fine either way

but right now it's about half and half

so if u can remove some calls and use the goto that's fine (or go the other way imho is actually more explicit)

code size makes almost no difference here
correct is the important issue

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so inline?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure

}

// Only allocate this heap memory if we are not memory-mapping the file
fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char));

if (fs->buffer == NULL) {
return NULL;
goto err_close;
}

memset(fs->buffer, 0, buffer_size + 1);
fs->buffer[buffer_size] = '\0';
memset(fs->buffer, '\0', buffer_size + 1);
fs->size = buffer_size;

return (void *)fs;

err_close:
close(fs->fd);
err_free:
free(fs);
return NULL;
}

void *new_rd_source(PyObject *obj) {
Expand All @@ -56,12 +69,12 @@ void *new_rd_source(PyObject *obj) {

*/

int del_file_source(void *fs) {
int del_file_source(void *ptr) {
file_source *fs = ptr;
if (fs == NULL) return 0;

/* allocated on the heap */
free(FS(fs)->buffer);
fclose(FS(fs)->fp);
free(fs->buffer);
close(fs->fd);
free(fs);

return 0;
Expand All @@ -83,17 +96,31 @@ int del_rd_source(void *rds) {

void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
int *status) {
file_source *src = FS(source);
file_source *fs = FS(source);
ssize_t rv;

*bytes_read = fread((void *)src->buffer, sizeof(char), nbytes, src->fp);
if (nbytes > fs->size) {
nbytes = fs->size;
}

if (*bytes_read == 0) {
rv = read(fs->fd, fs->buffer, nbytes);
switch (rv) {
case -1:
*status = CALLING_READ_FAILED;
*bytes_read = 0;
return NULL;
case 0:
*status = REACHED_EOF;
} else {
*bytes_read = 0;
return NULL;
default:
*status = 0;
*bytes_read = rv;
fs->buffer[rv] = '\0';
break;
}

return (void *)src->buffer;
return (void *)fs->buffer;
}

void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
Expand Down Expand Up @@ -152,80 +179,85 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
#ifdef HAVE_MMAP

#include <sys/mman.h>
#include <sys/stat.h>

void *new_mmap(char *fname) {
struct stat buf;
int fd;
memory_map *mm;
off_t filesize;
struct stat stat;
size_t filesize;

mm = (memory_map *)malloc(sizeof(memory_map));
mm->fp = fopen(fname, "rb");

fd = fileno(mm->fp);
if (fstat(fd, &buf) == -1) {
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno);
return NULL;
}
filesize = buf.st_size; /* XXX This might be 32 bits. */

if (mm == NULL) {
/* XXX Eventually remove this print statement. */
fprintf(stderr, "new_file_buffer: malloc() failed.\n");
return NULL;
return (NULL);
}
mm->fd = open(fname, O_RDONLY | O_BINARY);
if (mm->fd == -1) {
fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n",
fname, errno);
goto err_free;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here

}
mm->size = (off_t)filesize;
mm->line_number = 0;

mm->fileno = fd;
mm->position = ftell(mm->fp);
mm->last_pos = (off_t)filesize;
if (fstat(mm->fd, &stat) == -1) {
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n",
errno);
goto err_close;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here

}
filesize = stat.st_size; /* XXX This might be 32 bits. */

mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0);
if (mm->memmap == NULL) {
mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0);
if (mm->memmap == MAP_FAILED) {
/* XXX Eventually remove this print statement. */
fprintf(stderr, "new_file_buffer: mmap() failed.\n");
free(mm);
mm = NULL;
goto err_close;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here

}

return (void *)mm;
mm->size = (off_t)filesize;
mm->position = 0;

return mm;

err_close:
close(mm->fd);
err_free:
free(mm);
return NULL;
}

int del_mmap(void *src) {
munmap(MM(src)->memmap, MM(src)->size);
int del_mmap(void *ptr) {
memory_map *mm = ptr;

if (mm == NULL) return 0;

fclose(MM(src)->fp);
free(src);
munmap(mm->memmap, mm->size);
close(mm->fd);
free(mm);

return 0;
}

void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read,
int *status) {
void *retval;
memory_map *src = MM(source);
memory_map *src = source;
size_t remaining = src->size - src->position;

if (src->position == src->last_pos) {
if (remaining == 0) {
*bytes_read = 0;
*status = REACHED_EOF;
return NULL;
}

retval = src->memmap + src->position;

if (src->position + (off_t)nbytes > src->last_pos) {
// fewer than nbytes remaining
*bytes_read = src->last_pos - src->position;
} else {
*bytes_read = nbytes;
if (nbytes > remaining) {
nbytes = remaining;
}

*status = 0;
retval = src->memmap + src->position;

/* advance position in mmap data structure */
src->position += *bytes_read;
src->position += nbytes;

*bytes_read = nbytes;
*status = 0;

return retval;
}
Expand Down
28 changes: 6 additions & 22 deletions pandas/_libs/src/parser/io.h
Expand Up @@ -15,19 +15,10 @@ The full license is in the LICENSE file, distributed with this software.

typedef struct _file_source {
/* The file being read. */
FILE *fp;
int fd;

char *buffer;

/* file position when the file_buffer was created. */
off_t initial_file_pos;

/* Offset in the file of the data currently in the buffer. */
off_t buffer_file_pos;

/* Actual number of bytes in the current buffer. (Can be less than
* buffer_size.) */
off_t last_pos;
size_t size;
} file_source;

#define FS(source) ((file_source *)source)
Expand All @@ -37,20 +28,13 @@ typedef struct _file_source {
#endif

typedef struct _memory_map {
FILE *fp;
int fd;

/* Size of the file, in bytes. */
off_t size;

/* file position when the file_buffer was created. */
off_t initial_file_pos;

int line_number;

int fileno;
off_t position;
off_t last_pos;
char *memmap;
size_t size;

size_t position;
} memory_map;

#define MM(src) ((memory_map *)src)
Expand Down