In [1]:
import glob
import re

import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Single random seed shared by the train/test split and the LightGBM model.
SEED: int = 8455

In [4]:
# Columns left out when the per-file CSVs are read (see the `usecols` filter
# in the loading cell).
cols_to_skip = [
    'raw_text',
    'language',
]

In [8]:
# glob.glob yields paths in arbitrary, filesystem-dependent order. Sorting them
# fixes the row order of the concatenated frame, so the seeded train/test
# split further down selects the same rows on every machine and re-run.
csv_paths = sorted(glob.glob('../data/csv/*.csv'))

# Read every CSV under ../data/csv, omitting the columns in `cols_to_skip`,
# and stack them into a single frame with a fresh 0..n-1 index.
df_html_data = pd.concat(
    [
        pd.read_csv(csv, lineterminator='\n', usecols=lambda x: x not in cols_to_skip)
        for csv in csv_paths
    ],
    ignore_index=True,
)

# Attach the labels by matching `filename` (features) to `file` (labels), then
# drop both join keys so only features and the label columns remain.
df_file_labels = pd.read_csv('../data/html_targets.csv')
df = (
    df_html_data
    .merge(df_file_labels, left_on='filename', right_on='file')
    .drop(['file', 'filename'], axis=1)
)

In [10]:
# NLTK stop-word lists for English, French, German and Spanish, merged and
# de-duplicated into one set.
stop_words = {
    word
    for language in ('english', 'french', 'german', 'spanish')
    for word in stopwords.words(language)
}

In [11]:
# Target is the `sponsored` column; every other column is a predictor.
y = df['sponsored']
X = df.drop(columns=['sponsored'])

# Hold out 20% of the rows for testing, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

In [12]:
# Shared instances: the TF-IDF vectoriser (default settings) and the WordNet
# lemmatiser that are handed to the preprocessing pipeline in the LGBM cell.
text_transformer = TfidfVectorizer()
lemmer = WordNetLemmatizer()

In [13]:
# Column groups by role.
text_features = ['title']
# NOTE(review): 'language' is listed in `cols_to_skip` when the CSVs are read,
# so this column is presumably absent from X - confirm before using this list.
transform_features = ['language']
# Membership is decided by substring match anywhere in the column name.
numeric_features = list(filter(lambda name: 'num_' in name, X.columns))
bool_features = list(filter(lambda name: 'has_' in name, X.columns))

In [14]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Clean free-text DataFrame columns ahead of vectorisation.

    For every configured column the transformer fills missing values with an
    empty string, lower-cases the text, removes punctuation and digits, drops
    stop words, lemmatises the remaining tokens and re-joins them with single
    spaces. The input frame is never modified; a cleaned copy is returned.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to clean.
    stop_words : collection of str
        Tokens to discard after tokenisation. Defaults to NLTK's English list.
    lemmer : object exposing ``lemmatize(word)``
        Lemmatiser applied to every kept token. Defaults to a
        ``WordNetLemmatizer``.
    """

    # Punctuation = anything that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r'[^\w\s]')
    _DIGITS = re.compile(r'\d+')

    # NOTE: the default arguments are evaluated once, when the class statement
    # runs, so every instance built without explicit arguments shares the same
    # list / lemmatiser objects. The signature is kept as-is for compatibility.
    def __init__(self, columns, stop_words=stopwords.words('english'), lemmer=WordNetLemmatizer()):
        self.columns = columns
        self.stop_words = stop_words
        self.lemmer = lemmer

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def _clean(self, text, stop_words):
        """Return ``text`` lower-cased, stripped, filtered and lemmatised."""
        text = text.lower()
        text = self._PUNCTUATION.sub('', text)
        text = self._DIGITS.sub('', text)
        # Use the lemmatiser configured on this instance rather than a
        # notebook-level global, so the `lemmer` argument is honoured and the
        # class works on a fresh kernel regardless of cell execution order.
        return ' '.join(
            self.lemmer.lemmatize(word)
            for word in text.split()
            if word not in stop_words
        )

    def transform(self, X, y=None):
        """Return a copy of ``X`` with each column in ``self.columns`` cleaned."""
        _X = X.copy()
        # A set gives O(1) membership tests even when a list was supplied.
        stop_words = set(self.stop_words)
        for column in self.columns:
            print(f"Processing {column} column")
            _X[column] = _X[column].fillna('').map(
                lambda text: self._clean(text, stop_words))
            print(f"{column} cleaned")
        return _X

### LGBM

In [None]:
lgbm = LGBMClassifier(n_estimators=200, class_weight='balanced', random_state=SEED)

# TfidfVectorizer consumes a 1-D sequence of documents. ColumnTransformer only
# passes a 1-D column when the selector is a scalar name; a list selector
# produces a 2-D frame instead. The unpacking also fails loudly if more than
# one text column is ever configured, since a single vectoriser cannot serve
# several columns.
(text_column,) = text_features

preprocessor = Pipeline(steps=[
    ('text_preprocessor', TextPreprocessor(columns=text_features,
                                           stop_words=stop_words,
                                           lemmer=lemmer)),
    ('column_transformer', ColumnTransformer(transformers=[
        ('text_transformer', text_transformer, text_column)],
        remainder='passthrough'))
])

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lgbm', lgbm)
])

# Nested parameters are addressed as <step>__<step>__...__<param>, using the
# step names declared above; this prefix reaches the TF-IDF vectoriser.
tfidf_prefix = 'preprocessor__column_transformer__text_transformer__'

# NOTE(review): the full grid is 108 TF-IDF x 729 LightGBM settings = 78,732
# candidates, i.e. 787,320 fits at cv=10. Consider trimming the grid or using
# RandomizedSearchCV (already imported) with a fixed n_iter.
param_grid = {
    # ngram_range must be a tuple, not a list.
    f'{tfidf_prefix}ngram_range': [(1, 1), (1, 2)],
    f'{tfidf_prefix}max_features': [100, 500, 1000],
    f'{tfidf_prefix}max_df': [0.5, 0.75, 1.0],
    f'{tfidf_prefix}min_df': [1, 2, 3],
    f'{tfidf_prefix}use_idf': [True, False],
    'lgbm__max_depth': [3, 5, 7],
    'lgbm__learning_rate': [0.01, 0.05, 0.1],
    'lgbm__num_leaves': [10, 20, 30],
    # Use the scikit-learn wrapper's own parameter names so the searched
    # values do not collide with the wrapper's defaults for the same setting
    # (min_child_samples, reg_alpha, reg_lambda).
    'lgbm__min_child_samples': [10, 20, 30],
    'lgbm__reg_alpha': [0.0, 0.5, 1.0],
    'lgbm__reg_lambda': [0.0, 0.5, 1.0]
}

# `pipe` is rebound to the search object (and then to the fitted search) so
# that any later cell using `pipe` keeps working.
pipe = GridSearchCV(
    pipe,
    param_grid,
    cv=10,
    n_jobs=-1,
    scoring='f1_macro',
    return_train_score=True,
    verbose=2
)

pipe = pipe.fit(X_train, y_train)
print(pipe.best_score_)
print(pipe.best_params_)


Fitting 10 folds for each of 3 candidates, totalling 30 fits
