# Application of basic nlp functions within ```sklearn``` pipelines

## Setup and Imports

Import modules and our user-defined functions

In [9]:
import pandas as pd
import os
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion

from nlpfunctions.utils import *
from nlpfunctions.basicnlp import *

## Data

For the purpose of this notebook, we will import the labelled text data used in 'From Group to Individual Labels using Deep Features', Kotzias et al., KDD 2015 (available here).

In [1]:
#print(os.getcwd())
#print(os.listdir())

In [2]:
# Load the two labelled review datasets (paths are relative to the working directory).
imdb = pd.read_excel('Data/imdb.xlsx', header=0)
yelp = pd.read_excel('Data/yelp_labelled.xlsx', header=0)

# Tag every row with its origin so the two corpora stay distinguishable once stacked.
imdb['source'] = 'imdb'
yelp['source'] = 'yelp'

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it each
# input keeps its own 0-based labels, so a label lookup such as df.loc[0] would
# match one row from each source.
df = pd.concat([imdb, yelp], ignore_index=True)


Let's take a quick look at the data:

In [3]:
# Print the column labels, then the first five rows, as plain text.
for preview in (df.columns, df.head(5)):
    print(preview)


Index(['text', 'score', 'source'], dtype='object')
                                                text  score source
0  A very, very, very slow-moving, aimless movie ...      0   imdb
1  Not sure who was more lost - the flat characte...      0   imdb
2  Attempting artiness with black & white and cle...      0   imdb
3       Very little music or anything to speak of.        0   imdb
4  The best scene in the movie was when Gerardo i...      1   imdb


In [4]:
# Summary of the text column: a count larger than the number of unique
# values means some texts occur more than once.
text_summary = df['text'].describe()
print(text_summary)

# Cross-tabulate source (rows) against score (columns); shown as the cell's output.
pd.crosstab(index=df['source'], columns=df['score'])

count                   1743
unique                  1737
top       Not recommended.  
freq                       2
Name: text, dtype: object


score,0,1
source,Unnamed: 1_level_1,Unnamed: 2_level_1
imdb,361,385
yelp,499,500


In [5]:
# Show the repeated texts before removing them. A bare expression that is not the
# last statement of a cell is evaluated and then discarded, so print it explicitly.
print(df[df.duplicated('text')])

# Keep only the first occurrence of each distinct text.
df = df.drop_duplicates('text')

In [6]:
# Show the rows whose text is missing (one such row was found when this was explored).
# Printed explicitly: a bare expression that is not the last statement of a cell is
# evaluated and then discarded, so nothing would be displayed otherwise.
print(df[pd.isnull(df['text'])])

# Keep only the rows that actually contain text.
df = df[pd.notnull(df['text'])]

### Include text-preprocessing functions within sklearn ```CountVectorizer()```

#### Build ad-hoc pre-processor

Let's remove all sentences within each text that have a low subjectivity score

In [12]:
# Text-level pre-processing: sent_tokenise first, then remove_objective_sents
# called with 0.3 as its second argument (per the notebook text, the
# subjectivity cut-off -- confirm against nlpfunctions).
# NOTE(review): presumably combine_functions applies its arguments left to right.
my_preprocessor = combine_functions(
    sent_tokenise,
    lambda sentences: remove_objective_sents(sentences, 0.3),
)

#### Build ad-hoc word-tokenisation pipeline

Let's then lemmatise, mark negations, remove numeric digits and punctuation as part of our tokenisation pipeline. 

In [13]:
def _drop_extra_stopwords(tokens):
    """Call remove_stopwords with this notebook's additional stop-word list."""
    return remove_stopwords(
        tokens,
        extra_stopwords=['x', "'s", 'us', 'ca',
                         'many', 'much', 'one', 'put', 'also', 'get',
                         'would', 'could', 'like', 'go', 'lot', 'make'],
    )


def _strip_digits(nested_tokens):
    """Delete every run of digits from each item of each inner list."""
    return [[re.sub(r'\d+', '', token) for token in inner] for inner in nested_tokens]


def _drop_empty(tokens):
    """Discard falsy items (e.g. strings left empty by earlier steps)."""
    return [token for token in tokens if token]


# Word-level tokenisation pipeline; the order of the steps is significant.
# NOTE(review): presumably combine_functions applies its arguments left to right.
my_tokenizer_pipe = combine_functions(
    word_tokenise,
    to_lower,
    POS_tagging,
    lemmatise,
    fix_neg_auxiliary,
    _drop_extra_stopwords,
    _strip_digits,
    mark_neg,
    flattenIrregularListOfLists,  # from here on: one list of tokens per text/paragraph
    remove_punctuation,
    _drop_empty,                  # final result: the token list for one paragraph/text
)

#### Build ad-hoc ```CountVectorizer()```

We can now use our ad-hoc preprocessor and tokenizer within ```CountVectorizer```


In [14]:
# Options for the bag-of-words vectoriser, gathered in one place.
vectorizer_options = dict(
    analyzer="word",
    preprocessor=my_preprocessor,   # custom text-level pre-processing
    tokenizer=my_tokenizer_pipe,    # custom word-level tokenisation
    ngram_range=(1, 3),             # unigrams, bigrams and trigrams
    stop_words=None,                # no stop-word list is passed to the vectoriser itself
    max_features=10000,
    min_df=1,
)
my_vec = CountVectorizer(**vectorizer_options)

#### Build pipeline

We will use the ```Transformers``` ```ColumnSelector()``` and ```Series2ListOfStrings()``` to select the ```pandas.Series``` that contains the text data and transform it into a list of strings, which is the input format ```CountVectorizer()``` requires. 

In [15]:
#CountVectorizer().build_analyzer()(list2string(new_text.text))

# Steps run in the order listed: select the 'text' column, convert it with
# Series2ListOfStrings, then vectorise it with my_vec.
bag_of_words_steps = [
    ('selector', ColumnSelector(columns=['text'])),
    ('transformer', Series2ListOfStrings()),
    ('vec', my_vec),
]
pipe_bags_words = Pipeline(steps=bag_of_words_steps)


#### Let's apply it to our data

Let's take a look at the most frequent words first

In [16]:
# Approach taken from:
# https://www.kaggle.com/arthurtok/spooky-nlp-and-topic-modelling-tutorial

# Split every text on whitespace (one column per token position), stack all
# positions into a single Series and count how often each token occurs.
token_grid = df['text'].str.split(expand=True)
token_counts = token_grid.unstack().value_counts()

# Turn the counts into a two-column frame: the token and its frequency.
all_words = token_counts.to_frame().reset_index()
all_words = all_words.rename(columns={'index': 'word', 0: 'count'})

# The 50 most frequent tokens, transposed for a compact display
# (mostly uninformative function words).
#all_words[:50].plot.bar(x='word')
all_words.head(50).T


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
word,the,and,a,I,is,of,was,to,The,this,...,one,an,great,really,all,about,by,they,from,time
count,1012,779,618,534,473,468,461,441,354,287,...,74,71,69,68,66,62,61,59,59,59


In [17]:
# Check the container type of the text column and peek at its first five entries.
text_column = df['text']
print(type(text_column))
print(text_column.head(5))

<class 'pandas.core.series.Series'>
0    A very, very, very slow-moving, aimless movie ...
1    Not sure who was more lost - the flat characte...
2    Attempting artiness with black & white and cle...
3         Very little music or anything to speak of.  
4    The best scene in the movie was when Gerardo i...
Name: text, dtype: object


In [19]:
#pipe_bags_words.fit_transform(df)

#pd.DataFrame(pipe_bags_words.fit_transform(df).A, columns=my_vec.get_feature_names())

# on new data, using the vocabulary just learned
#pd.DataFrame(pipe_bags_words.transform(new_text).A, columns=my_vec.get_feature_names())
