Skip to content

Commit

Permalink
Revert the date cache feature as it is somehow (cause: unknown) causing …
Browse files Browse the repository at this point in the history
…pageviews to be lost when importing big log files.

The particular log file I'm testing with is from an intranet where the same IP address appears thousands of times. Not sure if it's related, but the same IP address will have many visits within the same second, for different users (different _id=X in the piwik.php requests).
refs matomo-org/matomo#300
  • Loading branch information
mattab committed Aug 13, 2014
1 parent e930303 commit b148679
Showing 1 changed file with 17 additions and 42 deletions.
59 changes: 17 additions & 42 deletions import_logs.py
Expand Up @@ -47,13 +47,7 @@
print >> sys.stderr, 'simplejson (http://pypi.python.org/pypi/simplejson/) is required.'
sys.exit(1)

try:
from collections import OrderedDict
except ImportError:
try:
from ordereddict import OrderedDict
except ImportError:
pass


##
## Constants.
Expand Down Expand Up @@ -1556,10 +1550,6 @@ def invalid_line(line, reason):
resolver.check_format(format)

hits = []
try:
cache_dates = OrderedDict()
except NameError:
cache_dates = None
for lineno, line in enumerate(file):
try:
line = line.decode(config.options.encoding)
Expand All @@ -1585,7 +1575,6 @@ def invalid_line(line, reason):
is_robot=False,
is_error=False,
is_redirect=False,
date=None,
args={},
)

Expand Down Expand Up @@ -1640,38 +1629,24 @@ def invalid_line(line, reason):
# Parse date.
# We parse it after calling check_methods as it's quite CPU hungry, and
# we want to avoid that cost for excluded hits.
if cache_dates is not None:
# To mitigate CPU usage, parsed dates are cached.
try:
timezone_key = format.get('timezone')
except BaseFormatException:
timezone_key = ''
date_key = (format.get('date'), timezone_key)
hit.date = cache_dates.get(date_key)
if not hit.date:
date_string = format.get('date')
try:
hit.date = datetime.datetime.strptime(date_string, format.date_format)
except ValueError:
invalid_line(line, 'invalid date')
continue

# Parse timezone and subtract its value from the date
try:
timezone = float(format.get('timezone'))
except BaseFormatException:
timezone = 0
except ValueError:
invalid_line(line, 'invalid timezone')
continue
date_string = format.get('date')
try:
hit.date = datetime.datetime.strptime(date_string, format.date_format)
except ValueError:
invalid_line(line, 'invalid date')
continue

if timezone:
hit.date -= datetime.timedelta(hours=timezone/100)
# Parse timezone and subtract its value from the date
try:
timezone = float(format.get('timezone'))
except BaseFormatException:
timezone = 0
except ValueError:
invalid_line(line, 'invalid timezone')
continue

if cache_dates is not None:
if len(cache_dates) > 3600:
cache_dates.popitem(False)
cache_dates[date_key] = hit.date
if timezone:
hit.date -= datetime.timedelta(hours=timezone/100)

if config.options.replay_tracking:
# we need a query string and we only consider requests with piwik.php
Expand Down

0 comments on commit b148679

Please sign in to comment.