# Feature Engineering: Transformations

### Read in text

In [None]:
import pandas as pd

# Load the tab-separated SMS corpus, then label its two columns.
# NOTE(review): read_csv infers a header from the first row by default; if the
# TSV has no header line, its first record is consumed as the header (and then
# overwritten below) instead of being kept as data -- confirm against the file.
data = pd.read_csv("../Support/SMSSpamCollection.tsv", delimiter='\t')
data.columns = ['label', 'body_text']

### Create the two new features

In [None]:
import string

def count_punct(text):
    """Return the percentage of non-space characters in *text* that are punctuation.

    Punctuation is any character in ``string.punctuation``. Spaces are
    excluded from the denominator. The ratio is rounded to three decimal
    places before being scaled to a percentage.

    Args:
        text: The message body to inspect.

    Returns:
        A float percentage (0 to 100). Returns 0.0 when *text* contains no
        non-space characters, since there is nothing to measure.
    """
    non_space_chars = len(text) - text.count(" ")
    if non_space_chars == 0:
        # Empty or all-space message: avoid ZeroDivisionError.
        return 0.0
    count = sum(1 for char in text if char in string.punctuation)
    return round(count / non_space_chars, 3) * 100

# body_len: number of characters in the message, not counting spaces.
data['body_len'] = data['body_text'].apply(lambda msg: len(msg) - msg.count(" "))
# punct%: value returned by count_punct for each message.
data['punct%'] = data['body_text'].apply(count_punct)
data.head()

### Plot the two new features

In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
# 40 evenly spaced bin edges from 0 to 200 (inclusive).
bins = np.linspace(start=0, stop=200, num=40)

# Histogram of message lengths over those edges.
plt.hist(data['body_len'], bins=bins)
plt.title("body length distribution")
plt.show()

In [None]:
# 40 evenly spaced bin edges from 0 to 50 (inclusive).
bins = np.linspace(start=0, stop=50, num=40)

# Histogram of punctuation percentages over those edges.
plt.hist(data['punct%'], bins=bins)
plt.title("punctuation % distribution")
plt.show()

### Transform the punctuation % feature

### Box-Cox Power Transformation

**Base Form**: $$ y^x $$

| X    | Base Form           |           Transformation               |
|------|--------------------------|--------------------------|
| -2   | $$ y ^ {-2} $$           | $$ \frac{1}{y^2} $$      |
| -1   | $$ y ^ {-1} $$           | $$ \frac{1}{y} $$        |
| -0.5 | $$ y ^ {\frac{-1}{2}} $$ | $$ \frac{1}{\sqrt{y}} $$ |
| 0    | $$ y^{0} $$              | $$ \log(y) $$            |
| 0.5  | $$ y ^ {\frac{1}{2}}  $$ | $$ \sqrt{y} $$           |
| 1    | $$ y^{1} $$              | $$ y $$                  |
| 2    | $$ y^{2} $$              | $$ y^2 $$                |


**Process**
1. Determine what range of exponents to test
2. Apply each transformation to each value of your chosen feature
3. Use some criterion to determine which of the transformations yields the best distribution

In [None]:
# Plot punct% raised to 1/1, 1/2, ..., 1/5, one histogram per exponent,
# so the resulting distributions can be compared visually.
for i in range(1, 6):
    exponent = 1 / i
    plt.hist(data['punct%'] ** exponent, bins=40)
    plt.title("transformation: 1/{}".format(i))
    plt.show()