<a href="https://colab.research.google.com/github/manishiitg/ML_Experments/blob/master/ml/logistic_regression_with_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text classification with logistic regression. 

A very simple experiment with scikit-learn's logistic regression, 
using word2vec embeddings to classify text into two labels. 

In [0]:
# http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

%matplotlib inline

import pandas as pd
import numpy as np

import spacy 
# Load spaCy's small English pipeline once; `normalize` (defined in a later
# cell) calls this module-level `nlp` object on every document.
nlp = spacy.load("en_core_web_sm")

from sklearn.datasets import fetch_20newsgroups

# NOTE(review): `news` is reassigned to a two-category subset in the
# data-cleaning cell before it is read anywhere in this notebook, so this
# full-training-split load looks redundant -- confirm before removing.
news = fetch_20newsgroups(subset="train")

from gensim.models import KeyedVectors

from sklearn.cluster import KMeans;
from sklearn.neighbors import KDTree;

from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt;
from itertools import cycle;
import matplotlib.pyplot as plt;
from sklearn.model_selection import train_test_split


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


from collections import defaultdict


In [0]:
def normalize(comment, lowercase=True, remove_stopwords=True):
    """Turn one raw document into a list of lemmas.

    The text is optionally lower-cased, each line is stripped of surrounding
    spaces, quote characters and non-breaking spaces are removed, and the
    lines are re-joined with single spaces before being passed to the
    module-level spaCy pipeline ``nlp``.

    Args:
        comment: Raw document text.
        lowercase: Lower-case the text before tokenising.
        remove_stopwords: Drop tokens that the pipeline flags as stop words.
            Punctuation tokens are always dropped.

    Returns:
        list[str]: Non-empty, whitespace-stripped lemmas in document order.
    """
    if lowercase:
        comment = comment.lower()

    cleaned_lines = []
    for line in comment.splitlines():
        line = line.strip(' ')
        # Escaped quotes must be removed before bare quotes: once every '"'
        # is gone, the two-character sequence \" can never match.
        line = line.replace('\\"', '').replace('"', '')
        line = line.replace(u'\xa0', u'')
        cleaned_lines.append(line)

    doc = nlp(" ".join(cleaned_lines))

    lemmas = []
    for token in doc:
        if token.is_punct:
            continue
        # Previously stop words were removed unconditionally and the
        # `remove_stopwords` argument was ignored.
        if remove_stopwords and token.is_stop:
            continue
        lemma = token.lemma_.strip()
        if lemma:
            lemmas.append(lemma)
    return lemmas

Clean up the data (the lemmatization step is optional).

In [10]:
import os

def writetofile(dir, filename, data):
    """Write ``data`` to ``dir/filename``, creating ``dir`` when it is missing.

    Args:
        dir: Destination directory; created (with parents) if absent.
        filename: File name inside ``dir``; converted with ``str()``.
        data: Bytes to write -- the file is opened in binary mode.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(dir, exist_ok=True)
    target_path = os.path.join(dir, str(filename))

    with open(target_path, 'wb') as the_file:
        the_file.write(data)

news_bunch = fetch_20newsgroups(subset="train", categories=['alt.atheism', 'comp.graphics'])

clean_data = []

# Upper bound on the number of documents that are cleaned and used downstream.
max_limit = 1000

targets = news_bunch["target"][:max_limit]
filenames = news_bunch["filenames"][:max_limit]
news = news_bunch["data"][:max_limit]

# One file per document, holding its cleaned tokens joined by single spaces.
cache_dir = "news_group_cleaned"

print("cleaning data")
for row, source_path in zip(news, filenames):
    # basename() also handles Windows separators, unlike rfind('/').
    filename = os.path.basename(source_path)
    cache_path = os.path.join(cache_dir, filename)

    # The cache lives inside cache_dir, so that is where existence must be
    # checked (the bare file name was checked before, so the cache never hit).
    if os.path.exists(cache_path):
        # Files are written as UTF-8 bytes, so read them back as UTF-8.
        with open(cache_path, 'r', encoding="utf-8") as content_file:
            data = content_file.read()
        # An empty cache file means "no tokens", not one empty token.
        cleaned = data.split(" ") if data else []
    else:
        cleaned = normalize(row)
        writetofile(cache_dir, filename, " ".join(cleaned).encode("utf-8"))

    clean_data.append(cleaned)

print("data cleaned")

# print(clean_data[10])

cleaning data
data cleaned


Only two categories are used, to keep this a simple binary classification. 

In [11]:
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Word2Vec expects an iterable of token lists (one list per document); passing
# plain strings yields a model of characters instead of words, see
# https://stackoverflow.com/questions/45159693/word2vec-models-consist-of-characters-instead-of-words
#
# workers=1 removes the run-to-run jitter caused by multi-threaded training,
# so the embeddings (and the scores computed from them) are repeatable.
# seed=1 is gensim's default, spelled out here to make the intent explicit.
# NOTE(review): repeatability across interpreter launches may additionally
# require a fixed PYTHONHASHSEED -- confirm against the gensim docs.
model = gensim.models.Word2Vec(clean_data, seed=1, workers=1)

2019-11-08 05:49:56,012 : INFO : collecting all words and their counts
2019-11-08 05:49:56,013 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-08 05:49:56,053 : INFO : collected 18870 word types from a corpus of 166087 raw words and 1000 sentences
2019-11-08 05:49:56,054 : INFO : Loading a fresh vocabulary
2019-11-08 05:49:56,072 : INFO : effective_min_count=5 retains 4338 unique words (22% of original 18870, drops 14532)
2019-11-08 05:49:56,073 : INFO : effective_min_count=5 leaves 143114 word corpus (86% of original 166087, drops 22973)
2019-11-08 05:49:56,089 : INFO : deleting the raw counts dictionary of 18870 items
2019-11-08 05:49:56,091 : INFO : sample=0.001 downsamples 30 most-common words
2019-11-08 05:49:56,092 : INFO : downsampling leaves estimated 121517 word corpus (84.9% of prior 143114)
2019-11-08 05:49:56,106 : INFO : estimated required memory for 4338 words and 100 dimensions: 5639400 bytes
2019-11-08 05:49:56,107 : INFO : resetting l

In [0]:

class MeanEmbeddingVectorizer(object):
    """Represent each document by the plain mean of its word vectors.

    Args:
        word2vec: Mapping from word to a 1-D numpy vector. ``transform`` only
            uses ``in`` and ``[]`` lookups on it.
        dim: Length of the zero vector returned for documents that contain no
            known word. When omitted it is taken from the first vector in
            ``word2vec``; if that cannot be determined it falls back to 100.
    """

    def __init__(self, word2vec, dim=None):
        self.word2vec = word2vec
        if dim is None:
            # An empty document must produce a zero vector of the same length
            # as every other row, so derive the length from the embeddings
            # instead of assuming 100.
            try:
                dim = len(next(iter(word2vec.values())))
            except (AttributeError, StopIteration, TypeError):
                dim = 100
        self.dim = dim

    def fit(self, X, y=None):
        """No-op; present for scikit-learn style usage. Returns ``self``."""
        return self

    def transform(self, X):
        """Return an array with one mean embedding per token list in ``X``.

        Words missing from ``word2vec`` are skipped; a document with no known
        word maps to a zero vector of length ``self.dim``.
        """
        zero_vector = np.zeros(self.dim)
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [zero_vector], axis=0)
            for words in X
        ])

# gensim 4 renamed `index2word` to `index_to_key`; look up the new name first
# and fall back to the old one so this cell runs on either major version.
vocab_words = getattr(model.wv, "index_to_key", None)
if vocab_words is None:
    vocab_words = model.wv.index2word
w2v = dict(zip(vocab_words, model.wv.vectors))

# One mean word vector per cleaned document.
em = MeanEmbeddingVectorizer(w2v)
mean_embeddings = em.transform(clean_data)


This simply takes the mean of the vectors of all words in a single sentence/document to represent it. 

In [13]:
# Sanity check: one embedding row per label.
print(len(mean_embeddings), len(targets), sep="\n")

# Hold out 33% of the documents; the fixed random_state keeps the split stable.
X_train, X_test, Y_train, Y_test = train_test_split(
    mean_embeddings,
    targets,
    test_size=0.33,
    random_state=42,
)

scikit_log_reg = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=1,
)
# fit() returns the estimator itself, so both names refer to the same model.
lr_model = scikit_log_reg.fit(X_train, Y_train)

print("Score:", lr_model.score(X_test, Y_test))

1000
1000
[LibLinear]Score: 0.9636363636363636


In [0]:
class TfidfEmbeddingVectorizer(object):
    """Represent each document by the mean of its IDF-weighted word vectors.

    Args:
        word2vec: Mapping from word to a 1-D numpy vector. ``transform`` only
            uses ``in`` and ``[]`` lookups on it.
        dim: Length of the zero vector returned for documents that contain no
            known word. When omitted it is taken from the first vector in
            ``word2vec``; if that cannot be determined it falls back to 100.
    """

    def __init__(self, word2vec, dim=None):
        self.word2vec = word2vec
        # Filled in by fit(): word -> IDF weight.
        self.word2weight = None
        if dim is None:
            # Derive the zero-vector length from the embeddings instead of
            # assuming 100, so empty documents match every other row.
            try:
                dim = len(next(iter(word2vec.values())))
            except (AttributeError, StopIteration, TypeError):
                dim = 100
        self.dim = dim

    def fit(self, X, y=None):
        """Learn per-word IDF weights from the token lists in ``X``.

        ``y`` is accepted for scikit-learn style usage and ignored.
        Returns ``self``.
        """
        # The identity analyzer makes TfidfVectorizer treat each document as
        # the already-tokenised list it is.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        #https://stackoverflow.com/a/5900634
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return an array with one weighted mean embedding per token list.

        Words missing from ``word2vec`` are skipped; a document with no known
        word maps to a zero vector of length ``self.dim``.
        """
        zero_vector = np.zeros(self.dim)
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [zero_vector], axis=0)
                for words in X
            ])

# gensim 4 renamed `index2word` to `index_to_key`; look up the new name first
# and fall back to the old one so this cell runs on either major version.
vocab_words = getattr(model.wv, "index_to_key", None)
if vocab_words is None:
    vocab_words = model.wv.index2word
w2v = dict(zip(vocab_words, model.wv.vectors))

# IDF weights are learned from, and then applied to, the same cleaned corpus.
tf = TfidfEmbeddingVectorizer(w2v)
tf.fit(clean_data, targets)
tf_idf_mean_embeddings = tf.transform(clean_data)


In [15]:
# Sanity check: one embedding row per label.
print(len(tf_idf_mean_embeddings), len(targets), sep="\n")

# Same 33% hold-out and random_state as the plain-mean experiment.
X_train, X_test, Y_train, Y_test = train_test_split(
    tf_idf_mean_embeddings,
    targets,
    test_size=0.33,
    random_state=42,
)

scikit_log_reg = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=1,
)
# fit() returns the estimator itself, so both names refer to the same model.
lr_model2 = scikit_log_reg.fit(X_train, Y_train)

print("Score:", lr_model2.score(X_test, Y_test))

1000
1000
[LibLinear]Score: 0.9787878787878788
