In [1]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import string

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

<div align='center'><font size="6" color="#F39C12">Getting started with NLP-Feature Vectors</font></div>

<hr>


<p style='text-align:justify'><b>Key Objectives:</b>This notebook comes as a second part to the <b>[Getting started with NLP Notebooks](https://www.kaggle.com/parulpandey/getting-started-with-nlp-a-general-intro)</b>.In this notebook we shall study the various ways of vectorizing text data.Vectorization converts text data into feature vectors.</p>



## Importing the dataset

In [2]:
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Text Vectorization Methods

There are many methods to vctorize text, but in this notebook I shall discuss few of them:

## 1.Countvectorizer

The [Scikit-Learn's CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) provides a simple way to both tokenize a collection of text documents and build a vocabulary of known words, but also to encode new documents using that vocabulary.

![](https://imgur.com/xxErhnB.png)

We take a dataset and convert it into a corpus. Then we create a vocabulary of all the unique words in the corpus. Using this vocabulary, we can then  create a feature vector of the count of the words. Let's see this through a simple example. Let's say we have a corpus containing two sentences as follows

In [3]:
sentences = ['The weather is sunny', 'The weather is partly sunny and partly cloudy.']

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'the': 5,
 'weather': 6,
 'is': 2,
 'sunny': 4,
 'partly': 3,
 'and': 0,
 'cloudy': 1}

In [5]:
# Converting all the sentences to arrays
vectorizer.transform(sentences).toarray()

array([[0, 0, 1, 0, 1, 1, 1],
       [1, 1, 1, 2, 1, 1, 1]])

By default, a scikit learn Count vectorizer can perform the following opertions over a text corpus:

- Encoding via utf-8
- converts text to lowercase
- Tokenizes text using word level tokenization

CountVectorizer has a number of parameters. Let's look at some of them :

### 1.1 Stopword 

Sometimes, some extremely common words which would appear to be of little value in helping select documents matching a user need are excluded from the vocabulary entirely. These words are called stop words. If `stop_word` parameter is specified with a list of stopwords, they will be removed from the vocabulary. Here I'll use the stopwords from NLTK but we can also specify custom stopwords too.


In [6]:
stopwords = stopwords.words('english')

count_vectorizer = CountVectorizer(stop_words = stopwords)
count_vectorizer.fit(train['text'])

train_vectors = count_vectorizer.transform(train['text'])
test_vectors = count_vectorizer.transform(test['text'])

train_vectors.shape

(7613, 21498)

See how the columns have reduced from 21637 to 21498. This is because some of the stopwords were removed.

### 1.2 MIN_DF and MAX_DF parameter

`MIN_DF` lets you ignore those terms that appear rarely in a corpus. In other words, if `MIN_df`is 2, it  means that a word has to occur at least two documents to be considered useful.

`MAX_DF` on the other hand, ignores terms that have a document frequency strictly higher than the given threshold.These will be words which appear a lot of documents.

This means we can eliminate those words that are either rare or appear too frequently in a corpus. 

When mentioned in absolute values i.e 1,2, etc, the value means if the word appears in 1 or 2 documents. However, when given in float, eg 30%, it means it appears in 30% of the documents.

In [7]:
count_vectorizer = CountVectorizer(stop_words = stopwords, min_df=2 ,max_df=0.8)
count_vectorizer.fit(train['text'])

train_vectors = count_vectorizer.transform(train['text'])
test_vectors = count_vectorizer.transform(test['text'])

### 1.3.Custom Preprocesser

We can also preprocess the text by passing it as an argument to countvectorizer. The following options are avialable:

- strip_accents - This removes any accents from the text during the preprocessing step.
- lowercase -  which is default set as true but can be set to False if lowercasing isnot desired
- preprocessor - we can create our custom preprocessor and set this argument to that.



In [8]:
# Creating a custom preprocessor that lowercases, removes special characters, removes hyperlinks and punctuation

def custom_preprocessor(text):
    '''
    Make text lowercase, remove text in square brackets,remove links,remove special characters
    and remove words containing numbers.
    '''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) # remove special chars
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    return text

    

In [9]:
count_vectorizer = CountVectorizer(list(train['text']),preprocessor=custom_preprocessor)

train_vectors = count_vectorizer.fit_transform(list(train['text']))
test_vectors = count_vectorizer.transform(list(test['text']))
 

### 1.4. N-Grams and analyzer parameter

This paramneter specifies the upper and lower limit for the range of words/characters to be extracted from text. The following n-grams range stand for:
(1,1) - unigrams  eg 'United'
(1,2) - unigrams and bigrams eg - 'United', 'United States'
(2, 2)- only bigrams etc eg 'United States)


In [10]:
# World level unigrams and bigrams

count_vectorizer = CountVectorizer(list(train['text']),preprocessor=custom_preprocessor,ngram_range=(1,2))

train_vectors = count_vectorizer.fit_transform(list(train['text']))
test_vectors = count_vectorizer.transform(list(test['text']))

list(count_vectorizer.vocabulary_)[:10]

['our',
 'deeds',
 'are',
 'the',
 'reason',
 'of',
 'this',
 'earthquake',
 'may',
 'allah']

In [11]:
# character level bigrams


count_vectorizer = CountVectorizer(list(train['text']),preprocessor=custom_preprocessor,ngram_range=(2,2),
                                  analyzer='char_wb')

train_vectors = count_vectorizer.fit_transform(list(train['text']))
test_vectors = count_vectorizer.transform(list(test['text']))

print(list(count_vectorizer.vocabulary_)[:20])

[' o', 'ou', 'ur', 'r ', ' d', 'de', 'ee', 'ed', 'ds', 's ', ' a', 'ar', 're', 'e ', ' t', 'th', 'he', ' r', 'ea', 'as']


### Creating a Baseline Model using Countvectorizer

In [12]:

count_vectorizer = CountVectorizer(token_pattern=r'\w{1,}',
                   ngram_range=(1, 2), stop_words = stopwords,preprocessor=custom_preprocessor)
count_vectorizer .fit(train['text'])

train_vectors = count_vectorizer.transform(train['text'])
test_vectors = count_vectorizer.transform(test['text'])

In [13]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf, train_vectors, train["target"], cv=5, scoring="f1")
scores

array([0.59516908, 0.53061224, 0.61852167, 0.52475248, 0.70666667])

In [14]:
# Fitting a simple Logistic Regression on Counts
clf.fit(train_vectors, train["target"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# Submission
sample_submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
sample_submission["target"] = clf.predict(test_vectors)
sample_submission.to_csv("submission.csv", index=False)

This gets me a score of 0.80777 on the Public LB, which isn't bad with simple Logistic Regression model.

## 2.TF-IDF Vectorizer

![](https://imgur.com/J5lS7kX.png)

In the CountVectorizer, we use the counts of the words, in TFIDF we take the relative importance of that term in the entire corpus. TFIDF is composed of two words: TF and IDF. 
**TF** stands for the normalized  term frequency. Term Frequency is a scoring of the frequency of the word in the current document.`TF = (Number of times term t appears in a document)/(Number of terms in the document)`

**IDF** or Inverse Document Frequency: is a scoring of how rare the word is across documents. `IDF = 1+log(N/n)`, where N is the number of documents and n is the number of documents a term t has appeared in.TF-IDF weight is often used in information retrieval and text mining. This weight is a statistical measure used to evaluate how important a word is to a document in a collection or corpus

*Because the ratio of the id f log function is greater or equal to 1, the TF–IDF score is
always greater than or equal to zero. We interpret the score to mean that the closer the
TF–IDF score of a term is to 1, the more informative that term is to that document.
The closer the score is to zero, the less informative that term is.*
from : [Applied Text Analysis with Python](https://www.amazon.in/Applied-Text-Analysis-Python-Language-Aware/dp/9352137434/ref=asc_df_9352137434/?tag=googleshopdes-21&linkCode=df0&hvadid=396988721232&hvpos=1o1&hvnetw=g&hvrand=11704105753328600061&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9062140&hvtargid=pla-838697427991&psc=1&ext_vrnc=hi)



TFIDF can be generated at word, character or even N gram level. 

In [16]:
# word level
tfidf = TfidfVectorizer(analyzer='word',token_pattern=r'\w{1,}',max_features=5000)
train_tfidf = tfidf.fit_transform(train['text'])
test_tfidf = tfidf.transform(test["text"])

In [17]:
#ngram level
tfidf = TfidfVectorizer(analyzer='word',ngram_range=(2,3),token_pattern=r'\w{1,}',max_features=5000)
train_tfidf = tfidf.fit_transform(train['text'])
test_tfidf = tfidf.transform(test["text"])

In [18]:
# characters level
tfidf = TfidfVectorizer(analyzer='char',ngram_range=(2,3),token_pattern=r'\w{1,}',max_features=5000)
train_tfidf = tfidf.fit_transform(train['text'])
test_tfidf = tfidf.transform(test["text"])


### Creating a Baseline Model using TFIDF

In [19]:
tfidf_vectorizer = TfidfVectorizer( min_df=3,  max_features=None,analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = stopwords)

train_tfidf = tfidf.fit_transform(train['text'])
test_tfidf = tfidf.transform(test["text"])


In [20]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf, train_tfidf, train["target"], cv=5, scoring="f1")
scores

array([0.64711274, 0.61686747, 0.63149079, 0.62045061, 0.74009509])

In [21]:
# Fitting a simple Logistic Regression on TFIDF
clf.fit(train_tfidf, train["target"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## 2.Hashing Vectorizer

![](https://imgur.com/e3GRaHn.png)

Hashing Vectorizer is yet another technique for vectorizing a collection of text documents. So why do we need yet another technique when we already have so many already. Well, the reason is that,both CountVectorizer and TF-IDF result in storing the entire vocabulary dictionary in memory i.e the number of unique tokens.This could be challenging in scenarios when the token vocabulary becomes very large, to the order of millions.

Hashing Vectorizer is based on the Hash Function which are  are fundamental to computer science.A Hash Function fundamentally maps data of arbitrary sizes to data of a fixed size.Here is a great article which explains this concept very well and is a great read: [Introducing One of the Best Hacks in Machine Learning: the Hashing Trick](https://medium.com/value-stream-design/introducing-one-of-the-best-hacks-in-machine-learning-the-hashing-trick-bf6a9c8af18f). Let's see how can we implement it in scikit learn:



In [22]:
hash_vectorizer = HashingVectorizer(n_features=10000,norm=None,alternate_sign=False)
hash_vectorizer.fit(train['text'])



HashingVectorizer(alternate_sign=False, analyzer='word', binary=False,
                  decode_error='strict', dtype=<class 'numpy.float64'>,
                  encoding='utf-8', input='content', lowercase=True,
                  n_features=10000, ngram_range=(1, 1), norm=None,
                  preprocessor=None, stop_words=None, strip_accents=None,
                  token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None)

In [23]:
train_vectors = hash_vectorizer.transform(train['text'])
test_vectors = hash_vectorizer.transform(test['text'])

In [24]:
print(train_vectors[0])

  (0, 696)	1.0
  (0, 796)	1.0
  (0, 1116)	1.0
  (0, 1582)	1.0
  (0, 2299)	1.0
  (0, 5136)	1.0
  (0, 5183)	1.0
  (0, 5196)	1.0
  (0, 6722)	1.0
  (0, 8526)	1.0
  (0, 8958)	1.0
  (0, 9531)	1.0
  (0, 9851)	1.0


In [25]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf, train_vectors, train["target"], cv=5, scoring="f1")
scores

array([0.59946476, 0.54621149, 0.60032895, 0.57849829, 0.7026616 ])

In [26]:
# Fitting a simple Logistic Regression on TFIDF
clf.fit(train_vectors, train["target"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)