In [1]:
import pandas as pd
import os
import xml.etree.ElementTree as ET

In [2]:
# Root folder that holds the input data; '.' is the current working directory.
WORKING_DIR = '.' # os.path.dirname(__file__)

# Folders that are scanned for *.xml files, one folder per class.
NON_TEXT_DIR = os.path.join(WORKING_DIR, 'Nontext')
TEXT_DIR = os.path.join(WORKING_DIR, 'Text')

# Labels written to the 'class_value' column of the feature table.
TEXT_CLASS = 0
NON_TEXT_CLASS = 1

In [3]:
def process_xml(filename, class_value):
    """Turn every ``WordFragment`` element of one XML file into a table row.

    Each row holds ``class_value`` plus the XML attributes of the fragment's
    first child element and of the fragment itself. When both define the same
    attribute name, the fragment's own value wins. All attribute values are
    kept as the strings found in the XML.

    Parameters
    ----------
    filename : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.
    class_value : int
        Label stored in the ``class_value`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per ``WordFragment``; empty if the file contains none.
    """
    tree = ET.parse(filename)
    records = []
    for fragment in tree.iter('WordFragment'):
        record = {'class_value': class_value}
        # A fragment without child elements used to raise IndexError on
        # ``fragment[0]``; it now contributes only its own attributes.
        if len(fragment):
            record.update(fragment[0].attrib)
        # Applied last so the fragment's own attributes take precedence.
        record.update(fragment.attrib)
        records.append(record)
    return pd.DataFrame(records)

In [4]:
def read_xml_folder(directory, class_value):
    """Load every ``*.xml`` file of a folder into a single DataFrame.

    Files are read in sorted name order so the resulting row order does not
    depend on the order in which the operating system lists the directory.
    The per-file row index is kept as produced by ``process_xml`` (so index
    values repeat across files), exactly as before.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive).
    class_value : int
        Label passed on to ``process_xml`` for every file.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-file frames, or an empty DataFrame when the
        folder contains no ``.xml`` file.
    """
    xml_names = sorted(
        name for name in os.listdir(directory) if name.endswith('.xml')
    )
    frames = [
        process_xml(os.path.join(directory, name), class_value)
        for name in xml_names
    ]
    if not frames:
        # pd.concat raises ValueError on an empty list; an empty frame lets
        # callers concatenate or inspect the result without special-casing.
        return pd.DataFrame()
    return pd.concat(frames)

In [5]:
# Columns that arrive as strings from the XML and must be parsed as numbers.
NUMERIC_COLUMNS = ['BlackCount', 'HorzStrokesCount', 'MaxHorzStrokeLength',
                   'PunctuationQuality', 'TextQuality', 'VertStrokesCount', 'WhiteHolesCount',
                   'Bottom', 'Right', 'Top', 'Left']

# Geometry columns that are only needed to derive features and are removed afterwards.
DROP_COLUMNS = ['Bottom', 'Right', 'Top', 'Left', 'Area', 'Height', 'Width']

# Count-like columns that are divided by the bounding-box area.
NEED_NORMALISATION_COLUMNS = ['BlackCount', 'HorzStrokesCount',
                              'PunctuationQuality', 'VertStrokesCount', 'WhiteHolesCount']


def preprocess_fetures(df_raw):
    """Convert the raw XML attribute table into model features, in place.

    Steps, in order:

    1. parse ``NUMERIC_COLUMNS`` as numbers;
    2. map the ``'false'``/``'true'`` strings of ``IsInverted`` and
       ``IsVertical`` to 0/1 (any other value is left untouched);
    3. derive ``Height``, ``Width`` and ``Area`` from the box coordinates;
    4. add the ratio features ``custom_1``, ``custom_2`` and ``custom_3``;
    5. divide ``MaxHorzStrokeLength`` by ``Width`` and every column in
       ``NEED_NORMALISATION_COLUMNS`` by ``Area``;
    6. drop ``DROP_COLUMNS``.

    Zero-sized boxes or zero stroke counts lead to divisions by zero, which
    pandas reports as ``inf``/``NaN`` values rather than as an exception.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Frame with the columns listed above. It is modified in place.

    Returns
    -------
    None
    """
    df_raw[NUMERIC_COLUMNS] = df_raw[NUMERIC_COLUMNS].apply(pd.to_numeric)

    bool_dict = {'false': 0, 'true': 1}
    for column in ('IsInverted', 'IsVertical'):
        # Assign the result back instead of calling replace(inplace=True) on a
        # column selection: the chained in-place form does not reliably update
        # the parent frame (it is a no-op under pandas Copy-on-Write).
        df_raw[column] = df_raw[column].map(
            lambda value: bool_dict.get(value, value)
        )

    df_raw['Height'] = df_raw['Bottom'] - df_raw['Top']
    df_raw['Width'] = df_raw['Right'] - df_raw['Left']
    df_raw['Area'] = df_raw['Width'] * df_raw['Height']

    stroke_total = df_raw['HorzStrokesCount'] + df_raw['VertStrokesCount']
    df_raw['custom_1'] = (df_raw['Area'] - df_raw['BlackCount']) / (
        (df_raw['HorzStrokesCount'] + df_raw['Height']) * df_raw['Height']
    )
    # Uses df_raw, not the notebook-global ``df``: the old code only worked
    # when the caller's frame happened to be bound to that global name.
    df_raw['custom_2'] = stroke_total / df_raw[['Width', 'Height']].max(axis=1)
    df_raw['custom_3'] = df_raw['BlackCount'] / stroke_total

    df_raw['MaxHorzStrokeLength'] /= df_raw['Width']
    for column in NEED_NORMALISATION_COLUMNS:
        df_raw[column] /= df_raw['Area']
    df_raw.drop(DROP_COLUMNS, axis=1, inplace=True)


# Correctly spelled alias; the original name is kept for existing callers.
preprocess_features = preprocess_fetures

In [6]:
# Load both classes from their folders; each row is labelled with its class value.
df_non_text = read_xml_folder(NON_TEXT_DIR, NON_TEXT_CLASS)
df_text = read_xml_folder(TEXT_DIR, TEXT_CLASS)

# Stack the two classes into one table; the per-file row index is kept, so index values repeat.
df = pd.concat([df_non_text, df_text])
# Mutates `df` in place: re-running this cell alone repeats the load above and starts from fresh data.
preprocess_fetures(df)

In [7]:
# Preview the first rows of the preprocessed feature table.
df.head()

Unnamed: 0,BlackCount,HorzStrokesCount,IsInverted,IsVertical,MaxHorzStrokeLength,PunctuationQuality,TextQuality,VertStrokesCount,WhiteHolesCount,class_value,custom_1,custom_2,custom_3
0,0.521674,0.015149,0,0,0.779661,0.0,0.333008,0.015374,7.5e-05,1,0.179182,3.449153,17.090909
1,0.510452,0.012926,0,0,0.374016,0.0,0.0,0.012125,7e-05,1,0.256911,2.830709,20.376912
2,0.426407,0.045455,0,0,0.681818,0.002165,0.5,0.047619,0.0,1,0.300454,1.954545,4.581395
3,0.434783,0.043478,0,0,0.695652,0.001976,0.5,0.045455,0.0,1,0.295455,1.956522,4.888889
4,0.551084,0.052632,0,0,0.394737,0.0,0.333008,0.047988,0.0,1,0.334487,1.710526,5.476923


In [8]:
# Check column dtypes and non-null counts after preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43009 entries, 0 to 4644
Data columns (total 13 columns):
BlackCount             43009 non-null float64
HorzStrokesCount       43009 non-null float64
IsInverted             43009 non-null int64
IsVertical             43009 non-null int64
MaxHorzStrokeLength    43009 non-null float64
PunctuationQuality     43009 non-null float64
TextQuality            43009 non-null float64
VertStrokesCount       43009 non-null float64
WhiteHolesCount        43009 non-null float64
class_value            43009 non-null int64
custom_1               43009 non-null float64
custom_2               43009 non-null float64
custom_3               43009 non-null float64
dtypes: float64(10), int64(3)
memory usage: 4.6 MB


In [9]:
# Target vector: the class label of every row.
y = df['class_value'].values
# Feature matrix: every remaining column, as a plain NumPy array.
X = df.drop(columns=['class_value']).values

In [15]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from tools import ExtendedModel, ClfTester

In [16]:
# Evaluation settings handed to ClfTester below.
# NOTE(review): N_SPLITS is presumably the number of cross-validation folds — confirm in tools.py.
N_SPLITS = 10
SCORING = 'accuracy'

In [17]:
# Feature-set variants handed to ClfTester, keyed by the name shown in the results.
# NOTE(review): ExtendedModel(None) presumably means "use the columns unchanged" — confirm in tools.py.
FEATURES = {
    'Raw': ExtendedModel(None)
}

In [18]:
# Fixed seed for every stochastic estimator so repeated runs of the notebook
# fit the same models.
RANDOM_STATE = 42

# Candidate classifiers keyed by the name shown in the results. Each entry
# wraps an estimator and, optionally, a dict of hyper-parameter values.
# NOTE(review): the dict is presumably used as a grid-search parameter grid by
# ExtendedModel / ClfTester — confirm in tools.py.
CLASSIFIERS = {
    'Random Forest Classifier': ExtendedModel(
        RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE),
        {
            # 'auto' was removed from the grid: for classifiers it is an alias
            # of 'sqrt' (a duplicate grid point) and newer scikit-learn
            # releases reject it.
            'max_features': ('sqrt', 'log2'),
            'n_estimators': list(range(10, 50, 5))
        }
    ),
    'Gradient Boosting Classifier': ExtendedModel(
        GradientBoostingClassifier(random_state=RANDOM_STATE),
        {
            # NOTE(review): 'deviance' was renamed to 'log_loss' in newer
            # scikit-learn releases; it is left unchanged here because the
            # new name is not accepted by older ones.
            'loss' : ('deviance', 'exponential'),
            'warm_start': (True, False),
            'learning_rate': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
            'n_estimators': list(range(10, 50, 5)),
            # 'auto' dropped for the same reason as in the random forest.
            'max_features': ('sqrt', 'log2')
        }
    ),
    'Logistic Regression' : ExtendedModel(
        # liblinear supports both 'l1' and 'l2' penalties; the newer default
        # solver ('lbfgs') rejects 'l1', which would break half of this grid.
        LogisticRegression(solver='liblinear', random_state=RANDOM_STATE),
        {
            'penalty': ('l1', 'l2'),
            'C': [0.01, 0.1, 0.5, 1, 5, 10, 100, 200, 500, 1000, 10000, 15000, 20000, 100000]
        }
    ),
    'Naive Bayes': ExtendedModel(
        GaussianNB()
    ),
    'Linear SVM' : ExtendedModel(
        LinearSVC(random_state=RANDOM_STATE),
        {
            'C': [0.01, 0.1, 0.5, 1, 5, 10, 100, 200, 500, 1000, 10000, 15000, 20000, 100000]
        }
    )
}

In [19]:
# Evaluate every feature-set / classifier combination; ClfTester comes from the local `tools` module.
# NOTE(review): presumably a cross-validated search scored with SCORING over N_SPLITS folds — confirm in tools.py.
clf_tester = ClfTester(FEATURES, CLASSIFIERS, SCORING, N_SPLITS)
# Returns two tables: the scores per combination and the corresponding best estimators.
df_test_scores, df_best_estimators = clf_tester.test(X, y, show_time_log=False)

--- Best model ---
Raw + Random Forest Classifier with score 0.889 and params:
	Classifier__max_features: log2
	Classifier__n_estimators: 40



In [20]:
# Highlight the single best score: axis=None compares all cells of the table together.
df_test_scores.style.highlight_max(axis=None, subset=pd.IndexSlice[:, :])

Unnamed: 0,Random Forest Classifier,Gradient Boosting Classifier,Logistic Regression,Naive Bayes,Linear SVM
Raw,0.889046,0.883117,0.814643,0.752122,0.799554
