-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Commit
Check the beginning of the file content to detect which programming language it is written in. The detected language type is applied only if PHP, XML, HTML or bash is detected. The language type is normally determined via the file extension; if the file extension is unknown, or the language type determined from the extension differs from the detected one, then the language type detected from the file content is used.
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,7 @@ | |
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
|
||
#include <deque> | ||
#include <algorithm> | ||
#include <time.h> | ||
#include <sys/stat.h> | ||
#include "Buffer.h" | ||
|
@@ -592,8 +593,8 @@ BufferID FileManager::loadFile(const TCHAR * filename, Document doc, int encodin | |
|
||
char data[blockSize + 8]; // +8 for incomplete multibyte char | ||
FormatType bkformat = FormatType::unknown; | ||
|
||
bool res = loadFileData(doc, backupFileName?backupFileName:fullpath, data, &UnicodeConvertor, L_TEXT, encoding, &bkformat); | ||
LangType detectedLang = L_TEXT; | ||
bool res = loadFileData(doc, backupFileName ? backupFileName : fullpath, data, &UnicodeConvertor, detectedLang, encoding, &bkformat); | ||
if (res) | ||
{ | ||
Buffer* newBuf = new Buffer(this, _nextBufferID, doc, DOC_REGULAR, fullpath); | ||
|
@@ -620,6 +621,11 @@ BufferID FileManager::loadFile(const TCHAR * filename, Document doc, int encodin | |
buf->setUnicodeMode(ndds._unicodeMode); | ||
buf->setEncoding(-1); | ||
|
||
// if a language has been detected, and the detected value is different from the file extension, | ||
// we use the detected value | ||
if (detectedLang != L_TEXT && detectedLang != buf->getLangType()) | ||
buf->setLangType(detectedLang); | ||
|
||
if (encoding == -1) | ||
{ | ||
// 3 formats : WIN_FORMAT, UNIX_FORMAT and MAC_FORMAT | ||
|
@@ -667,8 +673,9 @@ bool FileManager::reloadBuffer(BufferID id) | |
int encoding = buf->getEncoding(); | ||
char data[blockSize + 8]; // +8 for incomplete multibyte char | ||
FormatType bkformat; | ||
LangType lang = buf->getLangType(); | ||
|
||
bool res = loadFileData(doc, buf->getFullPathName(), data, &UnicodeConvertor, buf->getLangType(), encoding, &bkformat); | ||
bool res = loadFileData(doc, buf->getFullPathName(), data, &UnicodeConvertor, lang, encoding, &bkformat); | ||
buf->_canNotify = true; | ||
|
||
if (res) | ||
|
@@ -1245,8 +1252,60 @@ int FileManager::detectCodepage(char* buf, size_t len) | |
return codepage; | ||
} | ||
|
||
// Guesses the language of a document from the very beginning of its content.
//
// Leading blanks (space, tab, CR, LF) are skipped, then the remaining bytes are
// compared against a small set of well-known headers:
//   "#!/bin/sh", "#!/bin/bash"   -> L_BASH
//   "<?php "                     -> L_PHP
//   "<?xml "                     -> L_XML
//   "<!DOCTYPE html>", "<html>"  -> L_HTML
// The comparison is case-sensitive.
//
// data    : first bytes read from the file (not necessarily null-terminated).
// dataLen : number of valid bytes in data.
// Returns the detected language, or L_TEXT when no known header is found
// (including when data is null, empty, or contains only blanks).
LangType FileManager::detectLanguageFromTextBegining(const unsigned char *data, unsigned int dataLen)
{
	if (!data || dataLen == 0)
		return L_TEXT;

	// it detects xml, php, html and shell/bash script files
	const std::string xmlHeader = "<?xml "; // length : 6
	const std::string phpHeader = "<?php "; // length : 6
	const std::string shHeader = "#!/bin/sh"; // length : 9
	const std::string bashHeader = "#!/bin/bash"; // length : 11
	const std::string htmlHeader2 = "<html>"; // length : 6
	const std::string htmlHeader1 = "<!DOCTYPE html>"; // length : 15

	const size_t longestLength = htmlHeader1.length(); // longest length - html header length

	// Skip the leading blanks.
	size_t i = 0;
	while (i < dataLen && (data[i] == ' ' || data[i] == '\t' || data[i] == '\n' || data[i] == '\r'))
		++i;

	// Only blanks in the buffer: nothing to examine.
	if (i == dataLen)
		return L_TEXT;

	// Never copy more bytes than the buffer actually holds after the blanks,
	// otherwise we would read past the end of data.
	const size_t available = dataLen - i;
	const std::string buf2Test(reinterpret_cast<const char *>(data) + i, std::min(longestLength, available));

	// compare() is safe when buf2Test is shorter than the header: it simply
	// reports a mismatch instead of reading out of range.
	auto startsWith = [&buf2Test](const std::string & header) -> bool
	{
		return buf2Test.compare(0, header.length(), header) == 0;
	};

	if (startsWith(shHeader) || startsWith(bashHeader))
		return L_BASH;

	if (startsWith(phpHeader))
		return L_PHP;

	if (startsWith(xmlHeader))
		return L_XML;

	if (startsWith(htmlHeader1) || startsWith(htmlHeader2))
		return L_HTML;

	return L_TEXT;
}
|
||
inline bool FileManager::loadFileData(Document doc, const TCHAR * filename, char* data, Utf8_16_Read * UnicodeConvertor, | ||
LangType language, int & encoding, FormatType* pFormat) | ||
LangType & language, int & encoding, FormatType* pFormat) | ||
{ | ||
FILE *fp = generic_fopen(filename, TEXT("rb")); | ||
if (!fp) | ||
|
@@ -1319,9 +1378,9 @@ inline bool FileManager::loadFileData(Document doc, const TCHAR * filename, char | |
lenFile = fread(data+incompleteMultibyteChar, 1, blockSize-incompleteMultibyteChar, fp) + incompleteMultibyteChar; | ||
if (lenFile == 0) break; | ||
|
||
// check if file contain any BOM | ||
if (isFirstTime) | ||
{ | ||
// check if file contain any BOM | ||
if (Utf8_16_Read::determineEncoding((unsigned char *)data, lenFile) != uni8Bit) | ||
{ | ||
// if file contains any BOM, then encoding will be erased, | ||
|
@@ -1333,6 +1392,13 @@ inline bool FileManager::loadFileData(Document doc, const TCHAR * filename, char | |
if (NppParameters::getInstance()->getNppGUI()._detectEncoding) | ||
encoding = detectCodepage(data, lenFile); | ||
} | ||
|
||
if (language == L_TEXT) | ||
{ | ||
// check the language of the file | ||
language = detectLanguageFromTextBegining((unsigned char *)data, lenFile); | ||
} | ||
|
||
isFirstTime = false; | ||
} | ||
|
||
|
2 comments
on commit 9b91480
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just wondering if anyone has considered making this auto-detection feature optional via Preferences or prefer file extension over detected format by default? There are times where I'd rather have an xml file identified as something else (windows script file with .wsf file extension for example). I can add the .wsf extension to VB or Javascript, but it is overridden by this feature and always switches to xml.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just wondering if anyone has considered making this auto-detection feature optional via Preferences or prefer file extension over detected format by default? There are times where I'd rather have an xml file identified as something else (windows script file with .wsf file extension for example). I can add the .wsf extension to VB or Javascript, but it is overridden by this feature and always switches to xml.
For better or worse, users already rely on the current behaviour for certain "features" that are purely accidental and undocumented, like automatic highlighting of VC++ project files: #12226 (comment)
Also detect
#!/bin/bash
because some people write bash-specific shell scripts.