# Data Transformation

In this notebook we demonstrate how to encode features into machine-readable representation.

In [30]:
import numpy as np
import pandas as pd
from collections import Counter
from processing import text as text_prepro

In [20]:
# pre-pro pipeline
pipeline = [
    text_prepro.to_lower,
    text_prepro.transliterate,
    text_prepro.remove_tags,
    text_prepro.tokenize_url,
    text_prepro.alphanum
]

In [29]:
text = u'Hello @foobar, VISIT my [site](http://foo.bar)'
print text
for i, pipe in enumerate(pipeline, 1):
    text = pipe(text)
    print u'[{}]: {}'.format(i, text)
print

Hello @foobar, VISIT my [site](http://foo.bar)
[1]: hello @foobar, visit my [site](http://foo.bar)
[2]: hello @foobar, visit my [site](http://foo.bar)
[3]: hello foobar, visit my [site](http://foo.bar)
[4]: hello foobar, visit my [site](__URL__)
[5]: hello foobar visit my site __URL__



In [41]:
# read datasets
toxic = pd.read_csv('datasets/toxic_comments.csv', encoding='utf-8')
non_toxic = pd.read_csv('datasets/non_toxic_comments.csv', encoding='utf-8')

In [84]:
# list of raw comments
comments = np.concatenate([non_toxic['comment'].unique(), toxic['comment'].unique()])

In [85]:
comments.shape

(159436,)

In [86]:
comments[:3]

array([ u"This: :One can make an analogy in mathematical terms by envisioning the distribution of opinions in a population as a Gaussian curve. We would then say that the consensus would be a statement that represents the range of opinions within perhaps three standard deviations of the mean opinion.  sounds arbitrary and ad hoc.  Does it really belong in n encyclopedia article?  I don't see that it adds anything useful.  The paragraph that follows seems much more useful.  Are there any political theorists out there who can clarify the issues?  It seems to me that this is an issue that Locke, Rousseau, de Toqueville, and others must have debated...  SR ",
       u"`  :Clarification for you  (and Zundark's right, i should have checked the Wikipedia bugs page first).   This is a ``bug`` in the code that makes wikipedia work  it just means that there is a line of code that may have an error as small as an extra space. It's analogous (in a VERY simplified way) to trying to make something b

In [121]:
# extract word counts
word_counts = Counter()
two_grams = Counter()
for comment in comments:
    for pipe in pipeline:
        comment = pipe(comment)
    word_counts.update(comment.split())
    two_grams.update(comment[i:i+2] for i in xrange(0, len(text), 2))

In [122]:
print len(word_counts)
print len(two_grams)

180929
1704


In [47]:
# top 10 words
word_counts.most_common(10)

[(u'the', 498552),
 (u'to', 298646),
 (u'i', 241659),
 (u'of', 225486),
 (u'and', 225296),
 (u'you', 219647),
 (u'a', 216811),
 (u'is', 177325),
 (u'that', 161825),
 (u'it', 149322)]

In [48]:
# bottom 50 words
word_counts.most_common()[-50:]

[(u'ilkali', 1),
 (u'huanguan', 1),
 (u'leaded', 1),
 (u'pe\u026al\u0268n', 1),
 (u'hmmpff', 1),
 (u'qoyunli', 1),
 (u'thoroughfare', 1),
 (u'fradaulent', 1),
 (u'proberly', 1),
 (u'pocketbook', 1),
 (u'mahakavyas', 1),
 (u'fudd', 1),
 (u'cryokinesis', 1),
 (u'wonk', 1),
 (u'sipopo', 1),
 (u'belembay', 1),
 (u'knisfo', 1),
 (u'onclelosse', 1),
 (u'pertecting', 1),
 (u'antivermins', 1),
 (u'warrig', 1),
 (u'ajna', 1),
 (u'talkapge', 1),
 (u'ipage', 1),
 (u'nepotising', 1),
 (u'cataphract', 1),
 (u'rattner2', 1),
 (u'bratwurst', 1),
 (u'publicationthe', 1),
 (u'clarityafflicting', 1),
 (u'ornella', 1),
 (u'cronyn', 1),
 (u'australianist', 1),
 (u'chromate', 1),
 (u'ehlers', 1),
 (u'spanko', 1),
 (u'thurst', 1),
 (u'gnawing', 1),
 (u'bennies', 1),
 (u'spanky', 1),
 (u'as_of', 1),
 (u'branco', 1),
 (u'\u65b0\u64b0\u59d3\u6c0f\u9332', 1),
 (u'accoutns', 1),
 (u'queensborough', 1),
 (u'commagene', 1),
 (u'psone', 1),
 (u'classsssssssss', 1),
 (u'morihiro', 1),
 (u'downstep', 1)]

In [51]:
# select words with more than 1 occurrence
select = {k: v for k, v in word_counts.iteritems() if v > 1}

In [52]:
len(select)

87334

In [54]:
sorted_words = sorted(select.iteritems(), key=lambda (k, v): (v, k), reverse=True)
word_indexes = {k: i for i, (k, _) in enumerate(sorted_words)}

In [55]:
word_indexes['is']

7

In [57]:
word_indexes['damn']

1173

In [71]:
def bag_of_words(text):
    for pipe in pipeline:
        text = pipe(text)
    vector = np.zeros((len(word_indexes),), dtype=np.float32)
    for w in text.split():
        ind = word_indexes.get(w)
        if ind is not None:
            vector[ind] = 1.
    return vector

In [87]:
vector = bag_of_words(comments[10])

In [88]:
vector.shape

(87334,)

In [89]:
comments[10]

u"`  Can anyone provide any justification for the spelling ``Middle Earth`` used throughout Wikipedia? Where in Tolkien's work is this spelling used? Everywhere I've looked he spells it ``Middle-earth`` (hyphenated, with a lowercase e). , Friday, April 12, 2002  :Since no one provided any justification, I've moved the main page. But we still need to fix the spelling in almost every Tolkien-related article. , Sunday, April 14, 2002`"

In [91]:
vector[word_indexes['used']]

1.0

In [92]:
vector.sum()

55.0

In [112]:
# n-grams
import string
import itertools
letters = u''.join(set((string.letters + string.digits).lower() + ' '))
two_grams = [u"".join(p) for p in set(itertools.permutations(letters, 2))]

In [115]:
len(two_grams)

1332