In [39]:
import sys
import re
import time
import swifter
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import KeyedVectors
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics

In [2]:
df = pd.read_csv("./data/train.csv")
df

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [3]:
from nlp4ml.preprocessing import clean_tweet

In [5]:
df["clean_tweet"] = df["tweet"].swifter.apply(clean_tweet)
df

Pandas Apply:   0%|          | 0/31962 [00:00<?, ?it/s]

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i cannot use cause they do n...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,i love u take with u all the time in ur ! ! !
4,5,0,factsguide: society now #motivation,factsguide : society now
...,...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...,ate isz that youuu ?
31958,31959,0,to see nina turner on the airwaves trying to...,to see nina turner on the airwaves trying to w...
31959,31960,0,listening to sad songs on a monday morning otw...,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,...","vandalised in in , condemns act"


In [9]:
df["clean_tweet_tokenised"] = df["clean_tweet"].swifter.apply(lambda x: re.split(r"\W+", x))
df

Pandas Apply:   0%|          | 0/31962 [00:00<?, ?it/s]

Unnamed: 0,id,label,tweet,clean_tweet,clean_tweet_tokenised
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...,"[when, a, father, is, dysfunctional, and, is, ..."
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i cannot use cause they do n...,"[thanks, for, credit, i, cannot, use, cause, t..."
2,3,0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]"
3,4,0,#model i love u take with u all the time in ...,i love u take with u all the time in ur ! ! !,"[i, love, u, take, with, u, all, the, time, in..."
4,5,0,factsguide: society now #motivation,factsguide : society now,"[factsguide, society, now]"
...,...,...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...,ate isz that youuu ?,"[ate, isz, that, youuu, ]"
31958,31959,0,to see nina turner on the airwaves trying to...,to see nina turner on the airwaves trying to w...,"[to, see, nina, turner, on, the, airwaves, try..."
31959,31960,0,listening to sad songs on a monday morning otw...,listening to sad songs on a monday morning otw...,"[listening, to, sad, songs, on, a, monday, mor..."
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,...","vandalised in in , condemns act","[vandalised, in, in, condemns, act]"


In [35]:
class EmbeddingVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self):
        return X

    def fit_transform(self, X, y=None):
        self.fit(X, y)

    def progressbar(self, iteration, prefix="", size=50, file=sys.stdout):
        count = len(iteration)
        def show(t):
            x = int(size*t/count)
            # file.write("%s[%s%s] %i/%i\r" % (prefix, "#"*x, "."*(size-x), int(100*t/count), 100))
            file.write("{}[{}{}] {}%\r".format(prefix, "█"*x, "."*(size-x), int(100*t/count)))
            file.flush()
        show(0)
        for i, item in enumerate(iteration):
            yield item
            show(i+1)
        file.write("\n")
        file.flush()

class SifEmbeddingVectorizer(EmbeddingVectorizer):
    """
    Parameters
    ----------
    word2vec: gensim.models.KeyedVectors()
        Word2Vec: https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
        GloVe: https://nlp.stanford.edu/projects/glove/
        FastText: https://fasttext.cc/docs/en/crawl-vectors.html
    smoothing_constant: float (default: 1e-3)
        Default value of smoothing constant suggested in the paper is 0.001.
        The range of a suggested in the paper: [1e−4, 1e−3]

    Examples
    --------
    >>> from gensim.scripts import glove2word2vec
    >>> from gensim.models import KeyedVectors
    >>> w2v_model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)
    >>> glove2word2vec(glove_input_file=r"glove.840B.300d.txt", word2vec_output_file=r"gensim_glove_vectors.txt")
    >>> glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False)
    >>> embedding_dict = KeyedVectors.load_word2vec_format(r"cc.en.300.vec", binary=False)
    >>> embedding_dict.save_word2vec_format(r"cc.en.300.bin", binary=True)
    >>> ft_model = KeyedVectors.load_word2vec_format("cc.en.300.bin", binary=True)
    >>> vectoriser = SifEmbeddingVectorizer(word2vec=w2v_model)
    >>> feature = vectoriser.fit_transform(df["text"], None)
    """
    def __init__(self, word2vec, smoothing_constant=1e-3):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = word2vec.vector_size
        self.smoothing_constant = smoothing_constant
        self.term_freq = None

    def fit(self, X, y=None):
        X_list = [item for sublist in X for item in sublist]
        term_freq = Counter(X_list)
        total_len = sum(term_freq.values())
        term_freq = [(term, term_freq[term]/total_len) for term, count in term_freq.items()]
        self.term_freq = dict(term_freq)
        return self

    def transform(self, X):
        transformed_X = []
        for doc in self.progressbar(X, prefix="SIF"):
            weighted_array = []
            for term in doc:
                if term in self.word2vec:
                    # Compute smooth inverse frequency (SIF)
                    weight = self.smoothing_constant / (self.smoothing_constant + self.term_freq.get(term, 0))
                    weighted_term = self.word2vec[term] * weight
                    weighted_array.append(weighted_term)
            weighted_array = np.mean(weighted_array or [np.zeros(self.dim)], axis=0)
            transformed_X.append(weighted_array)
        transformed_X = np.array(transformed_X)

        # Common component removal: remove the projections of the average vectors on their first singular vector
        svd = TruncatedSVD(n_components=1, n_iter=20, random_state=0)
        svd.fit(transformed_X)
        pc = svd.components_
        transformed_X = transformed_X - transformed_X.dot(pc.T).dot(pc)
        return transformed_X

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)

    def get_params(self, deep=True):
        return {"word2vec": self.word2vec, "smoothing_constant": self.smoothing_constant}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [8]:
w2v_model = KeyedVectors.load_word2vec_format(r"./embedding/GoogleNews-vectors-negative300.bin", binary=True)

In [32]:
X_train, X_valid, y_train, y_valid = train_test_split(df.clean_tweet_tokenised, 
                                                      df.label, 
                                                      test_size=0.2, 
                                                      stratify=df.label)

In [36]:
tweet_list = df["clean_tweet_tokenised"].values.tolist()
vectoriser = SifEmbeddingVectorizer(word2vec=w2v_model)
X_train_feature = vectoriser.fit_transform(X_train)
X_valid_feature = vectoriser.transform(X_valid)

SIF[██████████████████████████████████████████████████] 100%
SIF[████████████████████████████████████████████████..] 97%

In [38]:
clf = LogisticRegressionCV(cv=5)
clf.fit(X_train_feature, y_train)
y_valid_pred = clf.predict(X_valid_feature)
metrics.classification_report(y_)

LogisticRegressionCV(cv=5)