In [1]:
from sklearn import datasets
from sklearn import preprocessing
from sklearn import feature_extraction
import numpy as np
import pandas as pd

In [2]:
# Show the installed pandas version followed by the minimum this notebook expects.
version_banner = (pd.__version__, "It better say at least '0.25' or greater")
print(*version_banner, sep="\n")

0.25.1
It better say at least '0.25' or greater


## Basic Natural Language Processing and  Bag of Words 

Bag-of-words modeling turns a text feature into a set of features, where each new feature is the count of one word in the original text.  In this example, instead of using the raw count we use the [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) value for each word.

In [3]:
# Load the 20 newsgroups text corpus (called with default arguments).
# The raw documents are read from `newsgrps.data` in the cells below.
newsgrps = datasets.fetch_20newsgroups()

In [4]:
feature_extraction.text.TfidfVectorizer?

In [5]:
# The arguments below trim the vocabulary in four ways:
#   stop_words="english" -> ignore very common words such as "and", "the", "a"
#   max_df=.9            -> drop words that appear in more than 90% of the documents
#   min_df=10            -> drop words that appear in fewer than 10 documents
#   max_features=1000    -> of what is left, keep only the 1000 most frequent words
#                           (ranked by term frequency across the corpus)
tfidf = feature_extraction.text.TfidfVectorizer(
    stop_words="english",
    max_df=.9,
    min_df=10,
    max_features=1000,
)

In [6]:
# Fit the vectorizer on the raw documents; the fitted estimator's repr is the cell output.
tfidf.fit(newsgrps.data)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=1000,
                min_df=10, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [7]:
# Turn every document into a row of TF-IDF weights, one column per vocabulary word.
data = tfidf.transform(newsgrps.data)

In [8]:
# choose 10 random words to see what they are
# A fixed seed makes the sample repeatable on every run, and replace=False
# guarantees 10 distinct words (the default samples with replacement).
rng = np.random.RandomState(42)
rng.choice(tfidf.get_feature_names(), 10, replace=False)

array(['goes', 'advice', 'mail', 'country', 'stanford', 'talk', 'al',
       'second', 'war', 'currently'], dtype='<U14')

Unfortunately or fortunately, depending on what we want to do next, the matrix returned is sparse.  We can either load it into a sparse dataframe, or convert it to a normal dense matrix and load it into a normal dataframe.  Sparse matrices are matrices that consist mainly of one value (usually 0), so only the non-zero elements are stored in memory.

In [9]:
# Display the matrix repr (shape, dtype, number of stored elements, storage format).
data 

<11314x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 500075 stored elements in Compressed Sparse Row format>

Stay sparse

In [10]:
# Wrap the scipy sparse matrix in a DataFrame with sparse columns, labelled by word.
# `DataFrame.sparse.from_spmatrix` is why this notebook needs pandas >= 0.25.
df = pd.DataFrame.sparse.from_spmatrix(data=data, columns=tfidf.get_feature_names())

In [11]:
# Fraction of cells that are actually stored (non-fill values); the rest are implicit zeros.
df.sparse.density

0.044199664132932644

Convert to dense

In [12]:
# toarray() returns a plain ndarray, whereas todense() returns the legacy
# np.matrix type that NumPy discourages for new code.
# NOTE: this materialises every zero, so memory grows to rows x columns floats.
df = pd.DataFrame(data=data.toarray(), columns=tfidf.get_feature_names())
df.head()

Unnamed: 0,00,000,01,02,03,04,0d,0t,10,100,...,write,writes,written,wrong,wrote,year,years,yes,york,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.116948,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.061846,0.0,0.0,0.134138,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.054946,0.0,0.0,0.0,0.0,0.0,0.120663,0.0,0.0


## One Hot Encoding for categorical features

This is useful when you have categorical data and the categories are not obviously numerically related to each other. One Hot Encoding works by assigning a binary feature to each possible category which means it will convert a column of `n` categories to `n` boolean columns.

In [13]:
# Small toy frame: two categorical columns to encode and one numeric column
# that should pass through untouched.
df = pd.DataFrame(
    dict(
        # 3 unique values
        fave_color=["red", "green", "blue", "green", "red", "red"],
        # 4 unique values
        fave_class=["cs5644", "bw101", "mit5555", "cs5644", "etc", "cs5644"],
        # not a category
        some_num=[1, 1.5, 3, 4, 13, 12],
    )
)
df

Unnamed: 0,fave_color,fave_class,some_num
0,red,cs5644,1.0
1,green,bw101,1.5
2,blue,mit5555,3.0
3,green,cs5644,4.0
4,red,etc,13.0
5,red,cs5644,12.0


In [14]:
ohe = preprocessing.OneHotEncoder(
    dtype=int,                # emit integer 0/1 indicators
    sparse=False,             # return a dense numpy array rather than a sparse matrix
    handle_unknown="ignore",  # categories unseen during fit encode as all zeros instead of raising
)

In [15]:
# Learn the categories of both categorical columns and encode them in one step.
categorical_cols = ["fave_color", "fave_class"]
data = ohe.fit_transform(df[categorical_cols])
data

array([[0, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 1, 0],
       [0, 0, 1, 0, 1, 0, 0]])

In [16]:
# Label each indicator column with the name the encoder generated for it
# (x0_* for the first input column, x1_* for the second).
encoded_names = ohe.get_feature_names()
cats = pd.DataFrame(data, columns=encoded_names)
cats

Unnamed: 0,x0_blue,x0_green,x0_red,x1_bw101,x1_cs5644,x1_etc,x1_mit5555
0,0,0,1,0,1,0,0
1,0,1,0,1,0,0,0
2,1,0,0,0,0,0,1
3,0,1,0,0,1,0,0
4,0,0,1,0,0,1,0
5,0,0,1,0,1,0,0


In [17]:
#add back the some_num column

new_df = pd.concat([cats, df["some_num"]], axis=1)
new_df.head()

Unnamed: 0,x0_blue,x0_green,x0_red,x1_bw101,x1_cs5644,x1_etc,x1_mit5555,some_num
0,0,0,1,0,1,0,0,1.0
1,0,1,0,1,0,0,0,1.5
2,1,0,0,0,0,0,1,3.0
3,0,1,0,0,1,0,0,4.0
4,0,0,1,0,0,1,0,13.0


In [18]:
#rename the xn junk back to something nicer
new_df.columns = [c.replace("x0_", "color=").replace("x1_", "class=") for c in new_df.columns]
new_df

Unnamed: 0,color=blue,color=green,color=red,class=bw101,class=cs5644,class=etc,class=mit5555,some_num
0,0,0,1,0,1,0,0,1.0
1,0,1,0,1,0,0,0,1.5
2,1,0,0,0,0,0,1,3.0
3,0,1,0,0,1,0,0,4.0
4,0,0,1,0,0,1,0,13.0
5,0,0,1,0,1,0,0,12.0


## Advanced methods

Sklearn was originally designed with numpy arrays in mind (which is why it always converts dataframes to numpy arrays).  Pandas came on the scene at a later date, and while there has been work to make the two work together, there's still a lot to be done.  The normal procedure is to use dataframes to prepare your data and then end up with numpy arrays for your results, intermediate and otherwise.  Below is some code that lets you keep track of column names so you can explore your data a little better.  You won't be graded on this, but it might be beneficial to your projects.  You might also want to look into `sklearn-pandas`.

In [21]:
from sklearn import pipeline
from sklearn import base
from sklearn import compose

# Performs an identity transform on a dataframe and keeps track of the column 
# names which isn't currently supported
class DFPassThrough(base.BaseEstimator, base.TransformerMixin):
    """Identity transformer that remembers the column names it was fitted on.

    `transform` returns its input unchanged; the only state kept is the list
    of column names seen by `fit`, exposed through `get_feature_names`.
    """

    def fit(self, X, y=None):
        """Record the column names of `X`.

        Parameters
        ----------
        X : object with a `.columns` attribute (e.g. a pandas DataFrame)
        y : ignored
            Accepted so that `fit(X, y)` / `fit_transform(X, y)` also work
            when a target is supplied.

        Returns
        -------
        self
        """
        self.features_ = list(X.columns)
        return self

    def transform(self, X):
        """Return `X` unchanged."""
        return X

    def get_feature_names(self):
        """Return the column names recorded by `fit`."""
        return self.features_

In [22]:
# Each entry is (name, transformer, columns). The name becomes the prefix of
# the generated feature names, so the empty name keeps some_num nearly as-is.
column_steps = [
    ("color", ohe, ["fave_color"]),
    ("class", ohe, ["fave_class"]),
    ("", DFPassThrough(), ["some_num"]),
]
ct = compose.ColumnTransformer(transformers=column_steps)

In [23]:
# fit() returns the fitted transformer, so the feature names can be read right away.
ct.fit(df).get_feature_names()

['color__x0_blue',
 'color__x0_green',
 'color__x0_red',
 'class__x0_bw101',
 'class__x0_cs5644',
 'class__x0_etc',
 'class__x0_mit5555',
 '__some_num']

In [24]:
# Note: this isn't perfect. Every column comes back as a float because
# sklearn still works with numpy arrays internally.
transformed = ct.transform(df)
pd.DataFrame(transformed, columns=ct.get_feature_names())

Unnamed: 0,color__x0_blue,color__x0_green,color__x0_red,class__x0_bw101,class__x0_cs5644,class__x0_etc,class__x0_mit5555,__some_num
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.5
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,4.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,13.0
5,0.0,0.0,1.0,0.0,1.0,0.0,0.0,12.0
