Skip to content

Commit

Permalink
Add language auto-detection for php, xml, html and bash
Browse files Browse the repository at this point in the history
Check the beginning of the file content to detect which programming language
the content is written in.
The detected lang type will be applied only if php, xml, html or bash is
detected.
The language type is determined via the file extension; if the file
extension is unknown or the determined lang type is different from
the detected value, then the lang type detected via the file content
will be used.
  • Loading branch information
donho committed Sep 19, 2015
1 parent 69a57e5 commit 9b91480
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 6 deletions.
Binary file added PowerEditor/bin/Nppold.exe
Binary file not shown.
76 changes: 71 additions & 5 deletions PowerEditor/src/ScitillaComponent/Buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

#include <deque>
#include <algorithm>
#include <time.h>
#include <sys/stat.h>
#include "Buffer.h"
Expand Down Expand Up @@ -592,8 +593,8 @@ BufferID FileManager::loadFile(const TCHAR * filename, Document doc, int encodin

char data[blockSize + 8]; // +8 for incomplete multibyte char
FormatType bkformat = FormatType::unknown;

bool res = loadFileData(doc, backupFileName?backupFileName:fullpath, data, &UnicodeConvertor, L_TEXT, encoding, &bkformat);
LangType detectedLang = L_TEXT;
bool res = loadFileData(doc, backupFileName ? backupFileName : fullpath, data, &UnicodeConvertor, detectedLang, encoding, &bkformat);
if (res)
{
Buffer* newBuf = new Buffer(this, _nextBufferID, doc, DOC_REGULAR, fullpath);
Expand All @@ -620,6 +621,11 @@ BufferID FileManager::loadFile(const TCHAR * filename, Document doc, int encodin
buf->setUnicodeMode(ndds._unicodeMode);
buf->setEncoding(-1);

// if a language has been detected, and the detected value is different from the file extension,
// we use the detected value
if (detectedLang != L_TEXT && detectedLang != buf->getLangType())
buf->setLangType(detectedLang);

if (encoding == -1)
{
// 3 formats : WIN_FORMAT, UNIX_FORMAT and MAC_FORMAT
Expand Down Expand Up @@ -667,8 +673,9 @@ bool FileManager::reloadBuffer(BufferID id)
int encoding = buf->getEncoding();
char data[blockSize + 8]; // +8 for incomplete multibyte char
FormatType bkformat;
LangType lang = buf->getLangType();

bool res = loadFileData(doc, buf->getFullPathName(), data, &UnicodeConvertor, buf->getLangType(), encoding, &bkformat);
bool res = loadFileData(doc, buf->getFullPathName(), data, &UnicodeConvertor, lang, encoding, &bkformat);
buf->_canNotify = true;

if (res)
Expand Down Expand Up @@ -1245,8 +1252,60 @@ int FileManager::detectCodepage(char* buf, size_t len)
return codepage;
}

// Guess the language of a file from the first bytes of its content.
//
// Leading spaces, tabs, CR and LF are skipped, then the remaining bytes are
// compared (case-sensitively) against a small set of well-known headers.
//
// data    : pointer to the beginning of the file content (may be nullptr).
// dataLen : number of valid bytes available in data.
//
// Returns L_BASH, L_PHP, L_XML or L_HTML when the corresponding header is
// found, and L_TEXT when nothing is recognised (including empty input).
LangType FileManager::detectLanguageFromTextBegining(const unsigned char *data, unsigned int dataLen)
{
    if (data == nullptr || dataLen == 0)
        return L_TEXT;

    // Headers recognised for xml, php, bash and html files.
    const std::string xmlHeader = "<?xml "; // length : 6
    const std::string phpHeader = "<?php "; // length : 6

    // NOTE(review): only "#!/bin/sh" is matched; a "#!/bin/bash" shebang does
    // not start with this prefix and is therefore not detected. A reviewer
    // suggested detecting it too, since some people write bash-specific scripts.
    const std::string bashHeader = "#!/bin/sh"; // length : 9

    // NOTE(review): "<!DOCTYPE html>" is only the HTML5 doctype; other doctype
    // variants are not detected by this exact comparison.
    const std::string htmlHeader2 = "<html>"; // length : 6
    const std::string htmlHeader1 = "<!DOCTYPE html>"; // length : 15

    const size_t longestLength = htmlHeader1.length(); // longest header to test

    // Skip the leading white space.
    size_t i = 0;
    for (; i < dataLen; ++i)
    {
        if (data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r')
            break;
    }

    // Never copy more bytes than are actually available after the white
    // space: copying a fixed longestLength bytes would read past the end of
    // the valid data for short (or white-space-only) content.
    const size_t remaining = static_cast<size_t>(dataLen) - i;
    const std::string buf2Test(reinterpret_cast<const char *>(data) + i, std::min(longestLength, remaining));

    // True when buf2Test begins with header. compare() clamps the sub-range to
    // buf2Test's size, so a buffer shorter than the header simply does not match.
    auto startsWith = [&buf2Test](const std::string & header) -> bool
    {
        return buf2Test.compare(0, header.length(), header) == 0;
    };

    if (startsWith(bashHeader))
        return L_BASH;

    if (startsWith(phpHeader))
        return L_PHP;

    if (startsWith(xmlHeader))
        return L_XML;

    if (startsWith(htmlHeader1) || startsWith(htmlHeader2))
        return L_HTML;

    return L_TEXT;
}

inline bool FileManager::loadFileData(Document doc, const TCHAR * filename, char* data, Utf8_16_Read * UnicodeConvertor,
LangType language, int & encoding, FormatType* pFormat)
LangType & language, int & encoding, FormatType* pFormat)
{
FILE *fp = generic_fopen(filename, TEXT("rb"));
if (!fp)
Expand Down Expand Up @@ -1319,9 +1378,9 @@ inline bool FileManager::loadFileData(Document doc, const TCHAR * filename, char
lenFile = fread(data+incompleteMultibyteChar, 1, blockSize-incompleteMultibyteChar, fp) + incompleteMultibyteChar;
if (lenFile == 0) break;

// check if file contain any BOM
if (isFirstTime)
{
// check if file contain any BOM
if (Utf8_16_Read::determineEncoding((unsigned char *)data, lenFile) != uni8Bit)
{
// if file contains any BOM, then encoding will be erased,
Expand All @@ -1333,6 +1392,13 @@ inline bool FileManager::loadFileData(Document doc, const TCHAR * filename, char
if (NppParameters::getInstance()->getNppGUI()._detectEncoding)
encoding = detectCodepage(data, lenFile);
}

if (language == L_TEXT)
{
// check the language du fichier
language = detectLanguageFromTextBegining((unsigned char *)data, lenFile);
}

isFirstTime = false;
}

Expand Down
3 changes: 2 additions & 1 deletion PowerEditor/src/ScitillaComponent/Buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ class FileManager final
private:
~FileManager();
int detectCodepage(char* buf, size_t len);
bool loadFileData(Document doc, const TCHAR* filename, char* buffer, Utf8_16_Read* UnicodeConvertor, LangType language, int& encoding, FormatType* pFormat = nullptr);
bool loadFileData(Document doc, const TCHAR* filename, char* buffer, Utf8_16_Read* UnicodeConvertor, LangType & language, int& encoding, FormatType* pFormat = nullptr);
LangType detectLanguageFromTextBegining(const unsigned char *data, unsigned int dataLen);


private:
Expand Down

2 comments on commit 9b91480

@bucweat
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just wondering if anyone has considered making this auto-detection feature optional via Preferences or prefer file extension over detected format by default? There are times where I'd rather have an xml file identified as something else (windows script file with .wsf file extension for example). I can add the .wsf extension to VB or Javascript, but it is overridden by this feature and always switches to xml.

@rdipardo
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bucweat,

Just wondering if anyone has considered making this auto-detection feature optional via Preferences or prefer file extension over detected format by default? There are times where I'd rather have an xml file identified as something else (windows script file with .wsf file extension for example). I can add the .wsf extension to VB or Javascript, but it is overridden by this feature and always switches to xml.

For better or worse, users already rely on the current behaviour for certain "features" that are purely accidental and undocumented, like automatic highlighting of VC++ project files: #12226 (comment)

Please sign in to comment.