## Why text is bad for you

In [None]:
import pandas as pd

import re

# Show up to 300 characters per cell when rendering DataFrames, so long text
# values stay readable. The fully-qualified option name is used because pandas
# resolves shorthand names by pattern matching, which can become ambiguous if
# a later release adds another option containing "max_colwidth".
pd.set_option('display.max_colwidth', 300)

In [None]:
# Load the gzipped wiki CSV as UTF-8 text. No column is promoted to the index,
# so rows keep the default integer index.
df = pd.read_csv(
    '../data/wiki/wiki.csv.gz',
    index_col=None,
    encoding='utf8',
)

In [None]:
# Replace each row's text with its opening sentence: lazily capture everything
# from the start of the string up to (not including) the first period, without
# crossing a newline. Rows where the pattern does not match become NaN.
df['text'] = df['text'].str.extract(
    r'^(.*?)\.',
    expand=False,
)

In [None]:
# Preview the first rows to check the shortened text column.
df.head()

## So what can we do?

In [None]:
import nltk
#nltk.download('all')

In [None]:
from nltk.tokenize import SpaceTokenizer
# Shared tokenizer instance; tokenize_only() below reads this module-level name.
tokenizer = SpaceTokenizer()

In [None]:
# Quick demo of the tokenizer on a short sentence.
tokenizer.tokenize('He takes long walks')

In [None]:
from nltk.stem.snowball import EnglishStemmer
# Shared stemmer instance; tokenize_and_stem() below reads this module-level name.
stemmer = EnglishStemmer()

In [None]:
# Quick demo of the stemmer on a capitalised plural.
stemmer.stem('Walks')

In [None]:
def tokenize_only(text):
    """Split *text* with the module-level ``tokenizer`` and keep word-like tokens.

    A token survives only if it contains at least one ASCII letter, which
    discards purely numeric tokens, bare punctuation and empty strings.

    Returns the surviving tokens as a list, in their original order.
    """
    has_letter = re.compile('[a-zA-Z]').search
    return [tok for tok in tokenizer.tokenize(text) if has_letter(tok)]

def tokenize_and_stem(text):
    """Tokenize *text* with ``tokenize_only`` and stem every kept token.

    Uses the module-level ``stemmer``.

    Returns a list of stems, one per kept token, in the original order.
    A concrete list is returned instead of a ``map`` object: on Python 3
    ``map`` is a lazy, single-use iterator, so callers that take ``len()``,
    index the result, or iterate it twice would otherwise fail or silently
    see nothing on the second pass.
    """
    return [stemmer.stem(token) for token in tokenize_only(text)]

## OK... but how does that help me?

In [None]:
# Keep only the rows whose name is one of a few hand-picked entries, so the
# TF-IDF example below stays small enough to read.
sample = df.loc[
    df['name'].isin(['4chan', '8chan', 'Aerosmith', 'Alabama', 'Texas'])
]

In [None]:
# Display the selected rows.
sample

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer that tokenizes with tokenize_and_stem, drops scikit-learn's
# built-in English stop words, and ignores terms that appear in fewer than
# two documents.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,
    stop_words='english',
    min_df=2,
)

In [None]:
# Learn the vocabulary and IDF weights from the sample text and return the
# TF-IDF document-term matrix in one step.
vec_text = tfidf_vectorizer.fit_transform(sample.text)

In [None]:
# Show the matrix summary; the cells below call .toarray() to see the values.
vec_text

In [None]:
# Convert to a dense array and wrap it in a DataFrame so every document row
# and term column is visible (columns are just positional numbers here).
pd.DataFrame(vec_text.toarray())

In [None]:
# Vocabulary terms in matrix column order. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() (added in 1.0) when it
# exists and fall back to the old method on older releases.
(
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)

In [None]:
# Inverse-document-frequency weights learned by the fitted vectorizer.
tfidf_vectorizer.idf_

In [None]:
# Label the dense TF-IDF matrix with its vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() (added in 1.0) when it exists and fall back to the
# old method on older releases; either result is accepted as column labels.
feature_names = (
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)
vec_df = pd.DataFrame(vec_text.toarray(), columns=feature_names)

In [None]:
# Renumber the sample rows from 0 so they line up with vec_df's default
# integer index, then attach the TF-IDF columns side by side.
sample.reset_index(drop=True).join(vec_df)