In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

In [131]:
# Load the train and test data
train_df = pd.read_csv('train.csv')
train_df['df_type'] = 'train'
test_df = pd.read_csv('test.csv')
test_df['df_type'] = 'test'

In [3]:
# Concatenating test and train data
combined_data = pd.concat([train_df, test_df], ignore_index=True)

# Check null values
print(train_df.apply(lambda x: sum(x.isnull())))

id         0
label      0
tweet      0
df_type    0
dtype: int64


In [31]:
def extract_hash_tags(text):
    return list(set(part[1:] for part in text.split() if part.startswith('#')))

In [7]:
sample = train_df.loc[0, "tweet"]
print("Original sentence: {}".format(sample))
print("Extract hash tags: {}".format(extract_hash_tags(sample)))

Original sentence:  @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run
Extract hash tags: {'run'}


Reference from https://stackoverflow.com/a/38119388/15048366

In [126]:
import re
import string
import nltk
stopword = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

def clean_tweets(text):
    text = strip_links(text)
    text = strip_all_entities(text)
    return text

def tokenization(text):
    text = re.split('\W+', text)
    return text

def lemmatizer(text):
    text = [wn.lemmatize(word) for word in text]
    return text

def stemming(text):
    text = [ps.stem(word) for word in text]
    return text

def strip_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text

def strip_links(text):
    link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], '[LINK]')    
    return text

def strip_all_entities(text):
    entity_prefixes = ['@', '#']
    for separator in string.punctuation.replace("[", "").replace("]", ""):
        if separator not in entity_prefixes:
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

In [39]:
sample = "@peter I really love that shirt at #Macy. http://bit.ly//WjdiW4"

print("Original tweet: \n\t{}".format(sample))
print("Cleaned tweet: \n\t{}".format(clean_tweets(sample)))
print("Hash tags: \n\t{}".format(extract_hash_tags(sample)))

Original tweet: 
	@peter I really love that shirt at #Macy. http://bit.ly//WjdiW4
Cleaned tweet: 
	I really love that shirt at [LINK]
Hash tags: 
	['Macy.']


In [147]:
train_df["tweet_clean"] = train_df["tweet"].apply(clean_tweets)
train_df["tweet_tokenised"] = train_df["tweet_clean"].apply(tokenization)
train_df["tweet_stem"] = train_df["tweet_tokenised"].apply(stemming)
train_df["tweet_lemm"] = train_df["tweet_tokenised"].apply(lemmatizer)

In [120]:
from gensim.models import KeyedVectors

w2v_model = KeyedVectors.load_word2vec_format(r"G:\kaggle\embedding\GoogleNews-vectors-negative300.bin", binary=True)

In [197]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = word2vec.vector_size

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    
    def fit_transform(self, X, y):
        self.fit(X)
        return self.transform(X)

class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec, use_idf=True):
        self.word2vec = word2vec
        self.word2weight = None
        self.use_idf = use_idf
        self.dim = word2vec.vector_size
        
    def word2tf(self, term_list):
        term_freq = Counter(term_list)
        total_len = sum(term_freq.values())
        term_freq = [(term, term_freq[term]/total_len) for term, count in term_freq.items()]
        return dict(term_freq)

    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
        return self

    def transform(self, X):
        transformed_X = []
        for doc in X:
            weighted_array = []
            for term in doc:
                if term in self.word2vec:
                    if self.use_idf:
                        weighted_term = self.word2vec[term] * self.word2tf(doc)[term] * self.word2weight[term]
                    else:
                        weighted_term = self.word2vec[term] * self.word2tf(doc)[term]
                    weighted_array.append(weighted_term)
            weighted_array = np.mean(weighted_array or [np.zeros(self.dim)], axis=0)
            transformed_X.append(weighted_array)
        return np.array(transformed_X)
    
    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)

In [221]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

def ml_pipeline_with_embedding(clf, 
                               w2v_gensim, 
                               search_spaces, 
                               mode="mean", 
                               cv=5, 
                               refit=True, 
                               scoring='roc_auc', 
                               verbose=1, 
                               n_jobs=8):
    """
    Parameters
    ----------
    clf: scikit-learn estimator
        clf is a scikit-learn estimator interface.
    w2v_gensim: gensim model
        Word2Vec class from gensim.
    search_spaces: dict
        A dictinary where keys are parameter names (strings) and values are skopt.space.Dimension 
        instances (Real, Integer or Categorical).
    mode: str {"mean", "tfidf"}
        MeanEmbeddingVectorizer() or TfidfEmbeddingVectorizer() as embedding method.
    cv: int
        A integer, to specify the number of folds in a (Stratified) KFold.
    refit: boolean
        Refit the best estimator with the entire dataset. If “False”, it is impossible to make 
        predictions using this RandomizedSearchCV instance after fitting.
    scoring: str
        Defining model evaluation rules.
        https://scikit-learn.org/stable/modules/model_evaluation.html
    verbose: int
        Controls the verbosity: the higher, the more messages.
    
    References
    ----------
    [1] https://www.kaggle.com/nanomathias/bayesian-optimization-of-xgboost-lb-0-9769
    [2] https://stackoverflow.com/q/64264724/15048366
    """
    if mode == "mean":
        model = Pipeline([
            ("word2vec vectorizer (mean)", MeanEmbeddingVectorizer(w2v_gensim)), 
            ("machine learing classifier", BayesSearchCV(estimator=clf,
                                                         search_spaces=search_spaces,
                                                         cv=StratifiedKFold(
                                                             n_splits=cv,
                                                             shuffle=True,
                                                             random_state=42), 
                                                         refit=True, 
                                                         scoring=scoring, 
                                                         random_state=42, 
                                                         n_jobs=n_jobs))], verbose=verbose)
    elif mode == "tfidf":
        model = Pipeline([
            ("word2vec vectorizer (tfidf)", TfidfEmbeddingVectorizer(w2v_gensim)), 
            ("machine learing classifier", BayesSearchCV(estimator=clf,
                                                         search_spaces=search_spaces,
                                                         cv=StratifiedKFold(
                                                             n_splits=cv,
                                                             shuffle=True,
                                                             random_state=42), 
                                                         refit=True, 
                                                         scoring=scoring, 
                                                         random_state=42, 
                                                         n_jobs=n_jobs))], verbose=verbose)
    return model

## XGBoost

In [None]:
xgb_clf = XGBClassifier()
search_spaces = {
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'min_child_weight': Integer(0, 10),
    'max_depth': Integer(0, 50),
    'max_delta_step': Integer(0, 20),
    'subsample': Real(0.01, 1.0, 'uniform'),
    'colsample_bytree': Real(0.01, 1.0, 'uniform'),
    'colsample_bylevel': Real(0.01, 1.0, 'uniform'),
    'reg_lambda': Real(1e-9, 1000, 'log-uniform'),
    'reg_alpha': Real(1e-9, 1.0, 'log-uniform'),
    'gamma': Real(1e-9, 0.5, 'log-uniform'),
    'min_child_weight': Integer(0, 5),
    'n_estimators': Integer(100, 500),
    'scale_pos_weight': Real(1e-6, 500, 'log-uniform')
}
xgb_model = ml_pipeline_with_embedding(xgb_clf, w2v_model, search_spaces, mode="tfidf", cv=3, scoring="f1")
xgb_model.fit(train_df["tweet_tokenised"], train_df["label"])
print("Done training...")

[Pipeline]  (step 1 of 2) Processing word2vec vectorizer (tfidf), total=   6.4s


In [None]:
proba_test = xgb_model.predict_proba(train_df["tweet_tokenised"])
pred_test = np.argmax(proba_test, axis=1)
print("F1: {:.4f}\nAcc: {:.4f}".format(f1_score(train_df["label"], pred_test), 
                                       accuracy_score(train_df["label"], pred_test)))

In [None]:
from sklearn.metrics import f1_score, accuracy_score

test_df["tweet_clean"] = test_df["tweet"].apply(clean_tweets)
test_df["tweet_tokenised"] = test_df["tweet_clean"].apply(tokenization)
proba_test = xgb_model.predict_proba(test_df["tweet_tokenised"])
pred_test = np.argmax(proba_test, axis=1)

## LGBM

In [218]:
import lightgbm as lgb

lgb_clf = lgb.LGBMRegressor(objective='binary', metric='auc', n_jobs=-1, verbose=0)
search_spaces = {
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'num_leaves': Integer(1, 100),      
    'max_depth': Integer(0, 50),
    'min_child_samples': Integer(0, 50),
    'max_bin': Integer(100, 1000),
    'subsample': Real(0.01, 1.0, 'uniform'),
    'subsample_freq': Integer(0, 10),
    'colsample_bytree': Real(0.01, 1.0, 'uniform'),
    'min_child_weight': Integer(0, 10),
    'subsample_for_bin': Integer(100000, 500000),
    'reg_lambda': Real(1e-9, 1000, 'log-uniform'),
    'reg_alpha': Real(1e-9, 1.0, 'log-uniform'),
    'scale_pos_weight': Real(1e-6, 500, 'log-uniform'),
    'n_estimators': Integer(50, 100),
}

class LGBM(lgb.LGBMClassifier):
    """
    Use a callback function to echo out results after each iteration when optimizing with BayesSearchCV.
    
    Reference
    ---------
    [1] https://github.com/scikit-optimize/scikit-optimize/issues/698#issuecomment-401059610
    """
    def __init__(self, objective='binary', metric='auc', n_jobs=-1, verbose=0):
        super().__init__(objective, metric, n_jobs, verbose)

    def set_params(self, **params):
        print(params)
        super().set_params(**params)

lgb_model = ml_pipeline_with_embedding(lgb_clf, w2v_model, search_spaces, mode="tfidf", cv=3, scoring="f1")
lgb_model.fit(train_df["tweet_tokenised"][:100], train_df["label"][:100])

[Pipeline]  (step 1 of 2) Processing word2vec vectorizer (tfidf), total=   0.1s


ValueError: array must not contain infs or NaNs

# PyTorch Wrapper

## Reference
1. https://www.kaggle.com/graymant/pytorch-regression-with-sklearn-pipelines

In [None]:
import inspect
from sklearn.base import BaseEstimator, ClassifierMixin

class PytorchClassifier(BaseEstimator, ClassifierMixin):
    """A pytorch classifier"""
    def __init__(self, 
                 num_epochs=1, 
                 learning_rate=0.01, 
                 batch_size=128, 
                 shuffle=False, 
                 callbacks=[], 
                 use_gpu=True, 
                 verbose=1):
        """
        Called when initializing the regressor
        """
        self._history = None
        self._model = None
        self._gpu = use_gpu and torch.cuda.is_available()

        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")

        for arg, val in values.items():
            setattr(self, arg, val)

    def _build_model(self):
        self._layer_dims = [self.input_dim] + \
            self.hidden_layer_dims + [self.output_dim]

        self._model = torch.nn.Sequential()

        # Loop through the layer dimensions and create an input layer, then
        # create each hidden layer with relu activation.
        for idx, dim in enumerate(self._layer_dims):
            if (idx < len(self._layer_dims) - 1):
                module = torch.nn.Linear(dim, self._layer_dims[idx + 1])
                init.xavier_uniform(module.weight)
                self._model.add_module("linear" + str(idx), module)

            if (idx < len(self._layer_dims) - 2):
                self._model.add_module("relu" + str(idx), torch.nn.ReLU())

        if self._gpu:
            self._model = self._model.cuda()

    def _train_model(self, X, y):
        torch_x = torch.from_numpy(X).float()
        torch_y = torch.from_numpy(y).float()
        if self._gpu:
            torch_x = torch_x.cuda()
            torch_y = torch_y.cuda()

        train = data_utils.TensorDataset(torch_x, torch_y)
        train_loader = data_utils.DataLoader(train, batch_size=self.batch_size,
                                             shuffle=self.shuffle)

        loss_fn = torch.nn.MSELoss(size_average=False)

        optimizer = torch.optim.Adam(
            self._model.parameters(), lr=self.learning_rate)

        self._history = {"loss": [], "val_loss": [], "mse_loss": []}

        finish = False
        for epoch in range(self.num_epochs):
            if finish:
                break

            loss = None
            idx = 0
            for idx, (minibatch, target) in enumerate(train_loader):
                y_pred = self._model(Variable(minibatch))

                loss = loss_fn(y_pred, Variable(
                    target.cuda().float() if self._gpu else target.float()))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            y_labels = target.cpu().numpy() if self._gpu else target.numpy()
            y_pred_results = y_pred.cpu().data.numpy() if self._gpu else y_pred.data.numpy()

            error = mean_absolute_error(y_labels, y_pred_results)

            self._history["mse_loss"].append(loss.data[0])
            self._history["loss"].append(error)

            if self.verbose > 0:
                print("Results for epoch {}, loss {}, mse_loss {}".format(epoch + 1,
                                                                          error, loss.data[0]))
            for callback in self.callbacks:
                callback.call(self._model, self._history)
                if callback.finish:
                    finish = True
                    break

    def fit(self, X, y):
        """
        Trains the pytorch regressor.
        """
        assert (type(self.input_dim) == int), "input_dim parameter must be defined"
        assert (type(self.output_dim) == int), "output_dim must be defined"

        self._build_model()
        self._train_model(X, y)

        return self

    def predict(self, X, y=None):
        """
        Makes a prediction using the trained pytorch model
        """
        if self._history == None:
            raise RuntimeError("Classifier has not been fit")

        results = []
        split_size = math.ceil(len(X) / self.batch_size)

        # In case the requested size of prediction is too large for memory (especially gpu)
        # split into batchs, roughly similar to the original training batch size. Not
        # particularly scientific but should always be small enough.
        for batch in np.array_split(X, split_size):
            x_pred = Variable(torch.from_numpy(batch).float())
            y_pred = self._model(x_pred.cuda() if self._gpu else x_pred)
            y_pred_formatted = y_pred.cpu().data.numpy() if self._gpu else y_pred.data.numpy()
            results = np.append(results, y_pred_formatted)

        return results

    def score(self, X, y, sample_weight=None):
        """
        Scores the data using the trained pytorch model. Under current implementation
        returns negative mae.
        """
        y_pred = self.predict(X, y)
        return mean_absolute_error(y, y_pred) * -1