The following notebook reproduces code from Feature Engineering for Machine Learning by Alic Zheng and Amanda Casari.

In parallel, the same concepts are applied to a Twitter dataset that seeks to identify tweets suggesting a true natural disaster is occurring. See https://www.kaggle.com/c/nlp-getting-started.

## N-Grams

n-grams are sequences of n tokens. 1-grams (unigram) are just the frequency count of distinct words. 2-grams are unique 2 word pairings. While the code is taken from the book, I consolidated the code into functions for reuseability.

In [53]:
import pandas as pd
import csv
import spacy
from sklearn.feature_extraction import text
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection as modsel
import sklearn.preprocessing as preproc

# Online News Popularity Data Set - first 10,000
yelp_df = pd.read_csv('C://Users/Megan.Cusey/Documents/GitHub/DataScienceProjects/MDS 564 - Twitter NLP Text Analysis/Yelp Reviews - 10000.csv', nrows=10000)
print(yelp_df.shape)

twitter_df = pd.read_csv('C://Users/Megan.Cusey/Documents/GitHub/DataScienceProjects/MDS 564 - Twitter NLP Text Analysis/twitter_train.csv')
print(twitter_df.shape)

(10000, 11)
(7613, 5)


In [54]:
def bow_and_ngrams(df, text_column):
    # Creat feature transformations for unigrams, bigrams, and trigrams.
    # Default ignored single character words, but this examples explicitely includes them.
    bow_converter = CountVectorizer(token_pattern='(?u)\\b\\w+\\b')
    bigram_converter = CountVectorizer(ngram_range=(2,2), token_pattern='(?u)\\b\\w+\\b')
    trigram_converter = CountVectorizer(ngram_range=(3,3), token_pattern='(?u)\\b\\w+\\b')

    # Fit transformers and look at vocab size
    bow_converter.fit(df[text_column])
    words = bow_converter.get_feature_names()
    bigram_converter.fit(df[text_column])
    bigrams = bigram_converter.get_feature_names()
    trigram_converter.fit(df[text_column])
    trigrams = trigram_converter.get_feature_names()

    print("Lengths of BOW, Bigrams, and Trigrams:",
           "\n Words:",len(words),
           "\n Bigrams:", len(bigrams),
           "\n Trigrams:", len(trigrams),
           "\n\n")
    
    return words, bigrams, trigrams


def view_results(words, bigrams, trigrams):
    print("Sample of Words: \n", words[:10],"\n\n",
          "Sample of Bigrams: \n", bigrams[-10:],"\n\n",
          "Sample of Trigrams: \n", trigrams[:10],"\n\n")
    

In [55]:
# Yelp Words, Bigrams, and Trigrams
words, bigrams, trigrams = bow_and_ngrams(yelp_df,"text")

view_results(words, bigrams, trigrams)

Lengths of BOW, Bigrams, and Trigrams: 
 Words: 29221 
 Bigrams: 368937 
 Trigrams: 881609 


Sample of Words: 
 ['0', '00', '000', '007', '00a', '00am', '00pm', '01', '02', '03'] 

 Sample of Bigrams: 
 ['zuzu was', 'zuzus room', 'zweigel wine', 'zwiebel kräuter', 'zy world', 'zzed in', 'éclairs napoleons', 'école lenôtre', 'ém all', 'òc châm'] 

 Sample of Trigrams: 
 ['0 0 eye', '0 20 less', '0 39 oz', '0 39 pizza', '0 5 i', '0 50 to', '0 6 can', '0 75 oysters', '0 75 that', '0 75 to'] 




In [56]:
# Twitter NLP Words, Bigrams, and Trigrams
words, bigrams, trigrams = bow_and_ngrams(twitter_df,"text")

view_results(words, bigrams, trigrams)

Lengths of BOW, Bigrams, and Trigrams: 
 Words: 21678 
 Bigrams: 69982 
 Trigrams: 87447 


Sample of Words: 
 ['0', '00', '000', '0000', '007npen6lg', '00cy9vxeff', '00end', '00pm', '01', '02'] 

 Sample of Bigrams: 
 ['ûó oh', 'ûó organizers', 'ûó rt', 'ûó the', 'ûó wallybaiter', 'ûóher upper', 'ûókody vine', 'ûónegligence and', 'ûótech business', 'ûówe work'] 

 Sample of Trigrams: 
 ['0 11 ronnie', '0 45 to', '0 6 8km', '0 75 in', '0 9 northern', '0 amp more', '0 and blew', '0 balls 0', '0 bids û_', '0 but dude'] 




## Chunking & Part of Speech Tagging
Chunking forms sequences of words (tokens) based off of parts of speech.



In [57]:
def english_chunking(df, text_column):
    ## Preload English Language
    nlp = spacy.load('en_core_web_sm')
    
    ## Create pandas dataframe of spaCy nlp variables
    doc_df = df[text_column].apply(nlp)
    
    for doc in doc_df[4]:
        print([doc.text, doc.pos_, doc.tag_])

In [58]:
## Yelp Chunking & POS Tagging
english_chunking(yelp_df,"text")

['General', 'PROPN', 'NNP']
['Manager', 'PROPN', 'NNP']
['Scott', 'PROPN', 'NNP']
['Petello', 'PROPN', 'NNP']
['is', 'VERB', 'VBZ']
['a', 'DET', 'DT']
['good', 'ADJ', 'JJ']
['egg', 'NOUN', 'NN']
['!', 'PUNCT', '.']
['!', 'PUNCT', '.']
['!', 'PUNCT', '.']
['Not', 'ADV', 'RB']
['to', 'PART', 'TO']
['go', 'VERB', 'VB']
['into', 'ADP', 'IN']
['detail', 'NOUN', 'NN']
[',', 'PUNCT', ',']
['but', 'CCONJ', 'CC']
['let', 'VERB', 'VB']
['me', 'PRON', 'PRP']
['assure', 'VERB', 'VB']
['you', 'PRON', 'PRP']
['if', 'ADP', 'IN']
['you', 'PRON', 'PRP']
['have', 'VERB', 'VBP']
['any', 'DET', 'DT']
['issues', 'NOUN', 'NNS']
['(', 'PUNCT', '-LRB-']
['albeit', 'ADP', 'IN']
['rare', 'ADJ', 'JJ']
[')', 'PUNCT', '-RRB-']
['speak', 'VERB', 'VBP']
['with', 'ADP', 'IN']
['Scott', 'PROPN', 'NNP']
['and', 'CCONJ', 'CC']
['treat', 'VERB', 'VB']
['the', 'DET', 'DT']
['guy', 'NOUN', 'NN']
['with', 'ADP', 'IN']
['some', 'DET', 'DT']
['respect', 'NOUN', 'NN']
['as', 'ADP', 'IN']
['you', 'PRON', 'PRP']
['state', 'VERB'

In [59]:
## Twitter Chunking & POS Tagging
english_chunking(twitter_df,"text")

['Just', 'ADV', 'RB']
['got', 'VERB', 'VBD']
['sent', 'VERB', 'VBN']
['this', 'DET', 'DT']
['photo', 'NOUN', 'NN']
['from', 'ADP', 'IN']
['Ruby', 'PROPN', 'NNP']
['#', 'PROPN', 'NNP']
['Alaska', 'PROPN', 'NNP']
['as', 'ADP', 'IN']
['smoke', 'NOUN', 'NN']
['from', 'ADP', 'IN']
['#', 'NOUN', 'NN']
['wildfires', 'NOUN', 'NNS']
['pours', 'NOUN', 'NNS']
['into', 'ADP', 'IN']
['a', 'DET', 'DT']
['school', 'NOUN', 'NN']


## Term Frequency - Inverse Document Frequency

In [60]:
# The book used Yelp Academic Data Set which I couldn't find posted. It's different then the 10,000 review CSV data we have.
# In the text, it discusses an imbalance for two categories that they are attempting to use as the target for a classification 
# problem. The text follows the following steps to address the issue:
# 1. They identify the two categories
# 2. Subset each of them into new data frames
# 3. Randomly select an even number of observations from each subset
# 4. Combine the randomly selected observations into a single data frame.
# 5. Compute test/train datasets that have a balanced amount of observations from both categories.

# The # of records aren't that imbalanced here so I dont' think it's necessary to do. It doesn't make me wonder what
# the difference would be if we DID balance the TRUE and FALSE observations... perhaps I'll try that.

# In addition, it makes me wonder about the Caravan dataset we discussed the first week on the course where only 5% of
# the observations we were trying to identify existed in the data set... perhaps I'll revisit it as well.

print(len(twitter_df.loc[twitter_df['target'] == 1])+1)
print(len(twitter_df.loc[twitter_df['target'] == 0])+1)

# Split to train/test
train_df, test_df = modsel.train_test_split(twitter_df, train_size=.8, random_state = 123)
print(train_df.shape)
print(test_df.shape)

3272
4343
(6090, 5)
(1523, 5)


In [72]:
# Compare BOW, l2 normalization, and TF-IDF for linear classification

## BOW
bow_transform = text.CountVectorizer()
x_bow_train = bow_transform.fit_transform(train_df['text'])
x_bow_test = bow_transform.fit_transform(test_df['text'])
print(x_bow_train.shape)
print(x_bow_test.shape)
len(bow_transform.vocabulary_)

## TF-IDF
tfidf_transform = text.TfidfTransformer(norm=None)
x_tfidf_train = tfidf_transform.fit_transform(x_bow_train)
x_tfidf_test = tfidf_transform.fit_transform(x_bow_test)

## L2
x_l2_train = preproc.normalize(x_bow_train, axis = 0)
x_l2_test = preproc.normalize(x_bow_test, axis = 0)

y_train = train_df['target']
y_test = test_df['target']

from sklearn.linear_model import SGDClassifier
m = SGDClassifier().fit(x_bow_train, y_train)
s = m.score(x_bow_test, y_test)

(6090, 18558)
(1523, 7081)


ValueError: X has 7081 features per sample; expecting 18558

In [65]:
def simple_logistic_classifier(x_train, y_train, x_test, y_test, description):
    m=LogisticRegression().fit(x_train,y_train)
    s=m.score(x_test,y_test)
    print('Test score with ',description,' features: ', s)
    return m


def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, _C=1.0):
    ## Helper function to train a logistic classifier and score on test data
    m = LogisticRegression(C=_C).fit(X_tr, y_tr)
    s = m.score(X_test, y_test)
    print ('Test score with', description, 'features:', s)
    return m

m1 = simple_logistic_classify(x_bow_train, y_train, x_bow_test, y_test, 'BOW')
m2 = simple_logistic_classifier(x_tfidf_train, y_train, x_tfidf_test, y_test, 'TF-IDF')
m3 = simple_logistic_classifier(x_l2_train, y_train, x_l2_test, y_test, 'L2')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


ValueError: X has 7081 features per sample; expecting 18558