Skip to content

Commit 74b7fc8

Browse files
committed
Fixes #13 Charset Detector implemented
1 parent eac22a4 commit 74b7fc8

File tree

10 files changed

+801
-31
lines changed

10 files changed

+801
-31
lines changed
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
<?php
2+
3+
namespace Ems\Core\Exceptions;
4+
5+
use Ems\Core\StringConverter\CharsetGuard;
6+
use Exception;
7+
8+
/**
9+
* Thrown when a string is not encoded in the awaited charset. This MisConfiguredException can also suggest the charset the string seems to have.
10+
**/
11+
class InvalidCharsetException extends MisConfiguredException
12+
{
13+
14+
/**
15+
* @var string
16+
**/
17+
protected $failedString;
18+
19+
/**
20+
* @var string
21+
**/
22+
protected $awaitedCharset;
23+
24+
/**
25+
* @var CharsetGuard
26+
**/
27+
protected $guard;
28+
29+
/**
30+
* @param string $failedString
31+
* @param string $awaitedCharset
32+
**/
33+
public function __construct($failedString, $awaitedCharset, Exception $previous=null)
34+
{
35+
parent::__construct("String is not in $awaitedCharset", 0, $previous);
36+
$this->awaitedCharset = $awaitedCharset;
37+
$this->failedString = $failedString;
38+
}
39+
40+
/**
41+
* @return string
42+
**/
43+
public function failedString()
44+
{
45+
return $this->failedString;
46+
}
47+
48+
/**
49+
* @return string
50+
**/
51+
public function awaitedCharset()
52+
{
53+
return $this->awaitedCharset;
54+
}
55+
56+
/**
57+
* Try to guess the correct charset
58+
*
59+
* @return string
60+
**/
61+
public function suggestedCharset()
62+
{
63+
return $this->guard()->detect($this->failedString());
64+
}
65+
66+
/**
67+
* @return string
68+
**/
69+
public function getHelp()
70+
{
71+
$awaited = $this->awaitedCharset();
72+
$suggested = $this->suggestedCharset();
73+
74+
if (!$suggested) {
75+
return "String should be encoded in $awaited but has an undetectable charset.";
76+
}
77+
78+
return "String should be encoded in $awaited but seems to be $suggested";
79+
80+
}
81+
82+
/**
83+
* Set the guard to determine the charset.
84+
*
85+
* @param CharsetGuard $guard
86+
**/
87+
public function useGuard(CharsetGuard $guard)
88+
{
89+
$this->guard = $guard;
90+
return $this;
91+
}
92+
93+
/**
94+
* @return CharsetGuard
95+
**/
96+
protected function guard()
97+
{
98+
if (!$this->guard) {
99+
$this->guard = new CharsetGuard;
100+
}
101+
102+
return $this->guard;
103+
}
104+
105+
}

src/Ems/Core/Filesystem/CsvDetector.php

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
use Ems\Contracts\Core\Configurable;
66
use Ems\Core\Exceptions\DetectionFailedException;
77
use Ems\Core\ConfigurableTrait;
8-
8+
use Ems\Core\StringConverter\CharsetGuard;
99

1010
/**
1111
* The CSV Detector tries to guess the csv format. Even if most people
@@ -36,6 +36,19 @@ class CsvDetector implements Configurable
3636
',', ';', "\t", '|', '^'
3737
];
3838

39+
/**
40+
* @var CharsetGuard
41+
**/
42+
protected $charsetGuard;
43+
44+
/**
45+
* @param CharsetGuard $charsetGuard (optional)
46+
**/
47+
public function __construct(CharsetGuard $charsetGuard = null)
48+
{
49+
$this->charsetGuard = $charsetGuard ?: new CharsetGuard;
50+
}
51+
3952
/**
4053
* Detect the separator char. CSV Files with one line are invalid by
4154
* definition.
@@ -93,6 +106,11 @@ public function header($firstLines, $separator, $delimiter='"')
93106
{
94107

95108
$lines = $this->toCheckableLines($firstLines);
109+
if (strpos($firstLines, 'id;')) {
110+
echo "\nLINES:";
111+
var_dump($lines);
112+
echo "\n$firstLines";
113+
}
96114
$row = str_getcsv($lines[0], $separator, $delimiter);
97115

98116
$header = $this->guessHeader($row);
@@ -202,7 +220,7 @@ protected function eachLineHasColumnCountOf(array $lines, $count, $separator, $d
202220
protected function toCheckableLines($firstLines)
203221
{
204222

205-
$lines = explode("\n", $firstLines);
223+
$lines = explode("\n", $this->charsetGuard->withoutBOM($firstLines));
206224

207225
if (count($lines) < 2) {
208226
throw new DetectionFailedException('No lines found to detect separator');

src/Ems/Core/Filesystem/CsvReadIterator.php

Lines changed: 113 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66
use Ems\Contracts\Core\ContentIterator;
77
use Ems\Contracts\Core\Configurable;
88
use Ems\Contracts\Core\StringConverter;
9+
use Ems\Core\Exceptions\DetectionFailedException;
910
use Ems\Core\StringConverter\MBStringConverter;
11+
use Ems\Core\StringConverter\CharsetGuard;
1012
use Ems\Core\ConfigurableTrait;
1113
use Ems\Core\LocalFilesystem;
1214
use Ems\Core\Helper;
@@ -94,6 +96,11 @@ class CsvReadIterator implements ContentIterator
9496
**/
9597
protected $firstLines;
9698

99+
/**
100+
* @var string
101+
**/
102+
protected $firstLinesRaw;
103+
97104
/**
98105
* @var self
99106
**/
@@ -117,8 +124,20 @@ class CsvReadIterator implements ContentIterator
117124
protected $shouldConvert = false;
118125

119126
/**
120-
* @param string $filePath (optional)
121-
* @param Filesystem $filesystem (optional)
127+
* @var string
128+
**/
129+
protected $convertFrom = '';
130+
131+
/**
132+
* @var CharsetGuard
133+
**/
134+
protected $charsetGuard;
135+
136+
/**
137+
* @param string $filePath (optional)
138+
* @param Filesystem $filesystem (optional)
139+
* @param CsvDetector $detector (optional)
140+
* @param LineReadIterator $lineReader (optional)
122141
**/
123142
public function __construct($filePath = '', Filesystem $filesystem = null, CsvDetector $detector = null, LineReadIterator $lineReader = null)
124143
{
@@ -229,11 +248,38 @@ public function setLineReader(LineReadIterator $lineReader)
229248
return $this;
230249
}
231250

251+
/**
252+
* @return StringConverter
253+
**/
232254
public function getStringConverter()
233255
{
234256
return $this->stringConverter;
235257
}
236258

259+
/**
260+
* @return CharsetGuard
261+
**/
262+
public function getCharsetGuard()
263+
{
264+
if (!$this->charsetGuard) {
265+
$this->setCharsetGuard(new CharsetGuard);
266+
}
267+
return $this->charsetGuard;
268+
}
269+
270+
/**
271+
* Set the charset guard to better handle encoding errors.
272+
*
273+
* @param CharsetGuard $guard
274+
*
275+
* @return self
276+
**/
277+
public function setCharsetGuard(CharsetGuard $guard)
278+
{
279+
$this->charsetGuard = $guard;
280+
return $this;
281+
}
282+
237283
/**
238284
* Return the csv header. A header is an array of column names ['id', 'name'...].
239285
*
@@ -245,19 +291,42 @@ public function getHeader()
245291
return $this->header;
246292
}
247293

294+
$this->updateConversion();
295+
248296
if (!$this->getFilePath()) {
249297
return $this->header;
250298
}
251299

252-
$header = $this->detector->header(
253-
$this->firstLines(),
254-
$this->getSeparator(),
255-
$this->getDelimiter()
256-
);
300+
$detectorException = null;
257301

258-
$this->headerWasDetected = !$this->isNumericHeader($header);
302+
try {
303+
$header = $this->detector->header(
304+
$this->firstLines(),
305+
$this->getSeparator(),
306+
$this->getDelimiter()
307+
);
308+
} catch (DetectionFailedException $detectorException) {
309+
$header = [];
310+
}
259311

260-
$this->setHeader($this->headerWasDetected ? $header : []);
312+
$this->headerWasDetected = $header && !$this->isNumericHeader($header);
313+
314+
315+
if ($this->headerWasDetected) {
316+
$this->setHeader($header);
317+
return $this->header;
318+
}
319+
320+
// If the header could not be detected, check for encoding errors
321+
// These will be thrown also if the header is not forced
322+
$this->getCharsetGuard()->forceCharset($this->firstLinesRaw(), $this->convertFrom);
323+
324+
// If we get here, the charsetGuard did not throw an exception
325+
if ($detectorException) {
326+
throw $detectorException;
327+
}
328+
329+
$this->setHeader([]);
261330

262331
return $this->header;
263332
}
@@ -334,7 +403,7 @@ protected function readRow($handle, $chunkSize)
334403
}
335404

336405
if (!$this->hasHeader) {
337-
return $this->convertEncoding($row);
406+
return array_map( function ($value) { return $this->convertEncoding($value); }, $row);
338407
}
339408

340409
$namedRow = [];
@@ -377,6 +446,7 @@ protected function isSkippableRow(array $row)
377446
**/
378447
protected function onFileChanged()
379448
{
449+
$this->firstLinesRaw = null;
380450
$this->firstLines = null;
381451
if ($this->separatorWasDetected) {
382452
$this->separator = '';
@@ -394,8 +464,6 @@ protected function onFileChanged()
394464
protected function onRewind()
395465
{
396466
$this->headerRowSkipped = false;
397-
$this->shouldConvert = strtolower($this->getOption(self::ENCODING)) != 'utf-8';
398-
399467
// Trigger detection once
400468
$this->getHeader();
401469
}
@@ -413,6 +481,25 @@ protected function firstLines($lineCount=20)
413481
return $this->firstLines;
414482
}
415483

484+
$this->firstLines = $this->convertEncoding($this->firstLinesRaw($lineCount));
485+
486+
return $this->firstLines;
487+
488+
}
489+
490+
/**
491+
* Return the first lines of the input file
492+
*
493+
* @param int $lineCount
494+
*
495+
* @return string
496+
**/
497+
protected function firstLinesRaw($lineCount=20)
498+
{
499+
if ($this->firstLinesRaw !== null) {
500+
return $this->firstLinesRaw;
501+
}
502+
416503
$this->lineReader->setFilePath($this->filePath);
417504

418505
$lines = [];
@@ -427,10 +514,9 @@ protected function firstLines($lineCount=20)
427514

428515
$this->lineReader->releaseHandle();
429516

430-
$this->firstLines = $this->convertEncoding(implode("\n", $lines));
431-
432-
return $this->firstLines;
517+
$this->firstLinesRaw = implode("\n", $lines);
433518

519+
return $this->firstLinesRaw;
434520
}
435521

436522
/**
@@ -467,27 +553,25 @@ protected function newCountInstance()
467553
/**
468554
* Converts encoding if needed.
469555
*
470-
* @param array|string $data
556+
* @param string $data
471557
*
472-
* @return array|string
558+
* @return string
473559
**/
474560
protected function convertEncoding($data)
475561
{
476-
477562
if (!$this->shouldConvert) {
478563
return $data;
479564
}
565+
return $this->stringConverter->convert("$data", 'utf-8', $this->getOption(self::ENCODING));
566+
}
480567

481-
if (!is_array($data)) {
482-
return $this->stringConverter->convert("$data", 'utf-8', $this->getOption(self::ENCODING));
483-
}
484-
dd('Issoch array');
485-
$converted = [];
486-
487-
foreach ($data as $key=>$value) {
488-
$converted[$key] = $this->convert($data);
489-
}
490-
491-
return $converted;
568+
/**
569+
* Update the conversion options. This is handled like this for performance
570+
* reasons.
571+
**/
572+
protected function updateConversion()
573+
{
574+
$this->convertFrom = strtoupper($this->getOption(self::ENCODING));
575+
$this->shouldConvert = $this->convertFrom != 'UTF-8';
492576
}
493577
}

0 commit comments

Comments
 (0)