# Data Wrangling

## Some Pandas Basics
A DataFrame is best thought of as a collection of Series (one per column) that share a common index.

In [307]:
import pandas as pd
import numpy as np
import scipy.io.wavfile as wavfile
import re
from sklearn.feature_extraction.text import CountVectorizer
from scipy import misc

```python
# Typical I/O round trip: load DataFrames from several sources, then write them back.
from sqlalchemy import create_engine

# In-memory SQLite database used by the SQL examples
engine = create_engine('sqlite:///:memory:')

# Read selected columns of a table from a SQL database
sql_df = pd.read_sql_table('my_table', con=engine, columns=['ColA', 'ColB'])

# Read from Excel, JSON and CSV files
xls_df = pd.read_excel('my_dataset.xlsx', sheet_name='Sheet1', na_values=['NA', '?'])
json_df = pd.read_json('my_dataset.json', orient='columns')
csv_df = pd.read_csv('my_dataset.csv', sep=',')

# read_html loads all HTML tables of the page into a list of DataFrames;
# keep the first one
html_tables = pd.read_html('http://page.com/with/table.html')
html_df = html_tables[0]

# Write the DataFrames back out
sql_df.to_sql('table', con=engine)
xls_df.to_excel('dataset.xlsx')
json_df.to_json('dataset.json')
csv_df.to_csv('dataset.csv')
```

In [308]:
# Create a DataFrame from a dict: each key becomes a column.
# Scalars (A, B, F) are broadcast to the length of the array-like columns.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [309]:
# Create a DataFrame from random data (10 rows x 4 columns).
# Seed NumPy's global random generator first so that every
# "Restart Kernel -> Run All" produces the same numbers; the seed also makes
# the later np.random calls in this notebook reproducible.
np.random.seed(42)
my_df = pd.DataFrame(np.random.randn(10, 4))
my_df

Unnamed: 0,0,1,2,3
0,3.429912,-0.066049,0.670934,0.010867
1,-0.07583,-0.441421,-1.327857,-1.657295
2,0.075238,-0.665197,0.833174,-1.225016
3,0.684801,2.708749,-1.745054,0.521101
4,-0.175017,1.00426,-0.295602,-0.830704
5,-1.104486,-1.939933,1.772453,1.425173
6,-1.247001,-0.03202,0.070697,-1.222431
7,-1.767824,-2.027048,1.036384,0.110421
8,1.075074,-0.821947,1.196328,-0.888075
9,-0.190531,-0.595926,-2.567793,0.819289


In [310]:
# View the column labels (a default integer range until columns are named)
my_df.columns

RangeIndex(start=0, stop=4, step=1)

In [311]:
# Rename all columns at once by assigning a list with one label per column.
# This modifies my_df in place.
my_df.columns = ['c1', 'c2', 'c3', 'c4']
my_df.columns

Index(['c1', 'c2', 'c3', 'c4'], dtype='object')

In [312]:
# Preview the first 3 rows
my_df.head(3)

Unnamed: 0,c1,c2,c3,c4
0,3.429912,-0.066049,0.670934,0.010867
1,-0.07583,-0.441421,-1.327857,-1.657295
2,0.075238,-0.665197,0.833174,-1.225016


In [313]:
# Preview the last 3 rows
my_df.tail(3)

Unnamed: 0,c1,c2,c3,c4
7,-1.767824,-2.027048,1.036384,0.110421
8,1.075074,-0.821947,1.196328,-0.888075
9,-0.190531,-0.595926,-2.567793,0.819289


In [314]:
# Summary statistics per column: count, mean, std, min, quartiles and max
my_df.describe()

Unnamed: 0,c1,c2,c3,c4
count,10.0,10.0,10.0,10.0
mean,0.070434,-0.287653,-0.035634,-0.293667
std,1.46507,1.376959,1.424893,1.018942
min,-1.767824,-2.027048,-2.567793,-1.657295
25%,-0.875997,-0.782759,-1.069793,-1.138842
50%,-0.125424,-0.518674,0.370816,-0.409918
75%,0.532411,-0.040527,0.985581,0.418431
max,3.429912,2.708749,1.772453,1.425173


In [315]:
# View the row index (row labels)
my_df.index

RangeIndex(start=0, stop=10, step=1)

In [316]:
# View the dtype of every column.
# pandas reports columns holding Python strings as dtype 'object'.
my_df.dtypes

c1    float64
c2    float64
c3    float64
c4    float64
dtype: object

In [317]:
# Transpose the DataFrame: columns become rows and rows become columns
my_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
c1,3.429912,-0.07583,0.075238,0.684801,-0.175017,-1.104486,-1.247001,-1.767824,1.075074,-0.190531
c2,-0.066049,-0.441421,-0.665197,2.708749,1.00426,-1.939933,-0.03202,-2.027048,-0.821947,-0.595926
c3,0.670934,-1.327857,0.833174,-1.745054,-0.295602,1.772453,0.070697,1.036384,1.196328,-2.567793
c4,0.010867,-1.657295,-1.225016,0.521101,-0.830704,1.425173,-1.222431,0.110421,-0.888075,0.819289


In [377]:
# Distinct values of column c1, returned as a NumPy array
my_df.c1.unique()

array([ 3.42991185, -0.07583045,  0.0752379 ,  0.68480138, -0.17501685,
       -1.10448624, -1.24700124, -1.76782405,  1.07507379, -0.19053102])

In [318]:
# Sort the rows by column c1 (ascending); returns a new frame, my_df is unchanged
my_df.sort_values(by='c1')

Unnamed: 0,c1,c2,c3,c4
7,-1.767824,-2.027048,1.036384,0.110421
6,-1.247001,-0.03202,0.070697,-1.222431
5,-1.104486,-1.939933,1.772453,1.425173
9,-0.190531,-0.595926,-2.567793,0.819289
4,-0.175017,1.00426,-0.295602,-0.830704
1,-0.07583,-0.441421,-1.327857,-1.657295
2,0.075238,-0.665197,0.833174,-1.225016
3,0.684801,2.708749,-1.745054,0.521101
8,1.075074,-0.821947,1.196328,-0.888075
0,3.429912,-0.066049,0.670934,0.010867


In [319]:
# Slicing and dicing: the form of the selector decides the returned type.
#
# A single column label / position returns a Series:
#   my_df.c1, my_df['c1'], my_df.loc[:, 'c1'], my_df.iloc[:, 0]
# A list of labels / positions returns a DataFrame:
#   my_df[['c1']], my_df.loc[:, ['c1']], my_df.iloc[:, [0]]
#
# loc is inclusive of the end value of a range, iloc is exclusive of it.

# Two equivalent ways to select the first two rows
first_two_rows = my_df[0:2]
first_two_rows_by_position = my_df.iloc[0:2, :]

# Print the type returned by each column selector, in the order listed above
column_selections = (
    my_df.c1,
    my_df['c1'],
    my_df[['c1']],
    my_df.loc[:, 'c1'],
    my_df.loc[:, ['c1']],
    my_df.iloc[:, 0],
    my_df.iloc[:, [0]],
)
for selection in column_selections:
    print(type(selection))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [320]:
# Boolean indexing: build one boolean mask per condition, then combine the
# masks element-wise with bitwise operators (&, |, ~).
c1_above_one = my_df['c1'] > 1
c2_below_one = my_df['c2'] < 1
bool_df = my_df[c1_above_one & c2_below_one]
bool_df

Unnamed: 0,c1,c2,c3,c4
0,3.429912,-0.066049,0.670934,0.010867
8,1.075074,-0.821947,1.196328,-0.888075


In [321]:
# Writing to a slice: select the rows with a mask and name the target column.
# Write data one column at a time, because each column has a single dtype.
rows_c1_below_one = my_df['c1'] < 1
my_df.loc[rows_c1_below_one, ['c4']] = 1
my_df

Unnamed: 0,c1,c2,c3,c4
0,3.429912,-0.066049,0.670934,0.010867
1,-0.07583,-0.441421,-1.327857,1.0
2,0.075238,-0.665197,0.833174,1.0
3,0.684801,2.708749,-1.745054,1.0
4,-0.175017,1.00426,-0.295602,1.0
5,-1.104486,-1.939933,1.772453,1.0
6,-1.247001,-0.03202,0.070697,1.0
7,-1.767824,-2.027048,1.036384,1.0
8,1.075074,-0.821947,1.196328,-0.888075
9,-0.190531,-0.595926,-2.567793,1.0


In [322]:
# Filtering with isin: work on a copy so my_df itself stays untouched.
my_df2 = my_df.copy()
my_df2['E'] = ['one', 'one','two','three','four','three','five','six','seven','eight']

# Blank out (set to NaN) every row whose E value is in the given list
rows_two_or_four = my_df2['E'].isin(['two','four'])
my_df2[rows_two_or_four] = np.nan
my_df2

Unnamed: 0,c1,c2,c3,c4,E
0,3.429912,-0.066049,0.670934,0.010867,one
1,-0.07583,-0.441421,-1.327857,1.0,one
2,,,,,
3,0.684801,2.708749,-1.745054,1.0,three
4,,,,,
5,-1.104486,-1.939933,1.772453,1.0,three
6,-1.247001,-0.03202,0.070697,1.0,five
7,-1.767824,-2.027048,1.036384,1.0,six
8,1.075074,-0.821947,1.196328,-0.888075,seven
9,-0.190531,-0.595926,-2.567793,1.0,eight


In [323]:
# Concatenate DataFrames: cut a frame into three row blocks and glue them
# back together (row labels are kept because ignore_index is False).
df3 = pd.DataFrame(np.random.randn(10, 4))
split_points = [0, 3, 7, 10]
pieces = [df3[start:stop] for start, stop in zip(split_points, split_points[1:])]
# Passing axis=1 would concatenate along columns instead of rows
pd.concat(pieces, ignore_index=False)

Unnamed: 0,0,1,2,3
0,0.059708,-1.060619,1.018158,0.615169
1,-0.451842,-1.301358,1.386228,-0.09149
2,-2.774137,0.423235,-0.969686,2.166529
3,0.135551,0.233122,0.757039,-0.542566
4,-0.11263,-1.540928,1.458573,1.23922
5,-0.27495,1.699435,-1.157547,-1.010814
6,0.594049,0.563982,0.63363,0.280327
7,0.109958,2.412108,0.770092,0.736919
8,-0.568858,0.823141,-0.963356,2.026693
9,-1.029066,-2.315624,1.329659,0.246801


In [324]:
# Append rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add rows. The Series is first turned into a one-row frame with
# to_frame().T, which keeps the Series name (3) as the new row's index label.
# Whole DataFrames can be concatenated row-wise in the same way.
df4 = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
s4 = df4.iloc[3]
df4_appended = pd.concat([df4, s4.to_frame().T], ignore_index=False)
df4_appended

Unnamed: 0,A,B,C,D
0,0.933439,0.204878,0.622217,0.549325
1,-0.098756,0.821258,1.444383,1.02363
2,1.563211,1.624338,-0.358735,-1.700873
3,-1.748375,0.603679,0.074664,-1.43847
4,2.976859,1.109203,1.114726,0.501893
5,-0.08519,0.318864,0.955607,-1.279175
6,-0.897894,1.305317,1.326528,-1.338326
7,-1.52607,-0.693856,0.422019,-1.26477
3,-1.748375,0.603679,0.074664,-1.43847


In [325]:
# Joining: combine two frames on the values of their shared 'key' column
left = pd.DataFrame({
    'key': ['foo', 'bar'],
    'lval': [1, 2],
})
right = pd.DataFrame({
    'key': ['foo', 'bar'],
    'rval': [4, 5],
})
left.merge(right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


## Unique Values

In [380]:
# Show df2.
# NOTE(review): the recorded output below (with the 'Eup' column and the
# upper-cased 'TRAIN' values) was captured after the string-method cells
# further down had run; on a top-to-bottom run df2 still has its original
# six columns at this point.
df2

Unnamed: 0,A,B,C,D,E,F,Eup
0,1.0,2013-01-02,1.0,3,test,foo,TEST
1,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN
2,1.0,2013-01-02,1.0,3,test,foo,TEST
3,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN


In [379]:
# Distinct values of column D
df2.D.unique()

array(['3'], dtype=object)

In [395]:
# Number of occurrences of each distinct value in column D
df2.D.value_counts()

3    4
Name: D, dtype: int64

In [394]:
# Distinct values of column E
df2.E.unique()

array(['test', 'TRAIN'], dtype=object)

In [396]:
# Number of occurrences of each distinct value in column E
df2.E.value_counts()

test     2
TRAIN    2
Name: E, dtype: int64

## Drop duplicates

In [326]:
# Drop rows that are identical in every column, keeping the first occurrence:
# the two all-NaN rows (2 and 4) collapse into row 2. Returns a new frame.
my_df2.drop_duplicates()

Unnamed: 0,c1,c2,c3,c4,E
0,3.429912,-0.066049,0.670934,0.010867,one
1,-0.07583,-0.441421,-1.327857,1.0,one
2,,,,,
3,0.684801,2.708749,-1.745054,1.0,three
5,-1.104486,-1.939933,1.772453,1.0,three
6,-1.247001,-0.03202,0.070697,1.0,five
7,-1.767824,-2.027048,1.036384,1.0,six
8,1.075074,-0.821947,1.196328,-0.888075,seven
9,-0.190531,-0.595926,-2.567793,1.0,eight


In [374]:
# Only compare columns c1 and E when looking for duplicate rows;
# a new frame is returned and my_df2 itself is left unchanged.
my_df2.drop_duplicates(subset=['c1', 'E'])

Unnamed: 0,c1,c2,c3,c4,E
0,3.429912,-0.066049,0.670934,0.010867,one
1,-0.07583,-0.441421,-1.327857,1.0,one
2,,,,,
3,0.684801,2.708749,-1.745054,1.0,three
5,-1.104486,-1.939933,1.772453,1.0,three
6,-1.247001,-0.03202,0.070697,1.0,five
7,-1.767824,-2.027048,1.036384,1.0,six
8,1.075074,-0.821947,1.196328,-0.888075,seven
9,-0.190531,-0.595926,-2.567793,1.0,eight


## Detecting and replacing missing values 

In [327]:
# Count the missing values in column c2: flag each missing cell, then sum
# the flags (True counts as 1).
my_df2[['c2']].isna().sum()

c2    2
dtype: int64

In [328]:
# Boolean frame of the same shape: True wherever a value is missing
my_df2.isnull()

Unnamed: 0,c1,c2,c3,c4,E
0,False,False,False,False,False
1,False,False,False,False,False
2,True,True,True,True,True
3,False,False,False,False,False
4,True,True,True,True,True
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [329]:
# Drop every row that contains at least one missing value (returns a new frame)
my_df2.dropna()

Unnamed: 0,c1,c2,c3,c4,E
0,3.429912,-0.066049,0.670934,0.010867,one
1,-0.07583,-0.441421,-1.327857,1.0,one
3,0.684801,2.708749,-1.745054,1.0,three
5,-1.104486,-1.939933,1.772453,1.0,three
6,-1.247001,-0.03202,0.070697,1.0,five
7,-1.767824,-2.027048,1.036384,1.0,six
8,1.075074,-0.821947,1.196328,-0.888075,seven
9,-0.190531,-0.595926,-2.567793,1.0,eight


In [330]:
# Fill missing data with a constant; the same value (3) is used in every
# column, including the string column E
my_df2.fillna(value=3)

Unnamed: 0,c1,c2,c3,c4,E
0,3.429912,-0.066049,0.670934,0.010867,one
1,-0.07583,-0.441421,-1.327857,1.0,one
2,3.0,3.0,3.0,3.0,3
3,0.684801,2.708749,-1.745054,1.0,three
4,3.0,3.0,3.0,3.0,3
5,-1.104486,-1.939933,1.772453,1.0,three
6,-1.247001,-0.03202,0.070697,1.0,five
7,-1.767824,-2.027048,1.036384,1.0,six
8,1.075074,-0.821947,1.196328,-0.888075,seven
9,-0.190531,-0.595926,-2.567793,1.0,eight


In [375]:
# Fill missing values with the mean of their column.
# numeric_only=True restricts the means to the numeric columns: since
# pandas 2.0, mean() no longer silently skips the string column E and raises
# a TypeError instead. E has no mean, so its missing values stay NaN.
my_df2.fillna(my_df2.mean(axis=0, numeric_only=True))

Unnamed: 0,c1,c2,c3,c4,E
0,3.429912,-0.066049,0.670934,0.010867,one
1,-0.07583,-0.441421,-1.327857,1.0,one
2,0.100514,-0.401949,-0.111739,0.640349,
3,0.684801,2.708749,-1.745054,1.0,three
4,0.100514,-0.401949,-0.111739,0.640349,
5,-1.104486,-1.939933,1.772453,1.0,three
6,-1.247001,-0.03202,0.070697,1.0,five
7,-1.767824,-2.027048,1.036384,1.0,six
8,1.075074,-0.821947,1.196328,-0.888075,seven
9,-0.190531,-0.595926,-2.567793,1.0,eight


In [331]:
# Forward fill: replace each missing value with the last valid value above it
my_df2.ffill()

Unnamed: 0,c1,c2,c3,c4,E
0,3.429912,-0.066049,0.670934,0.010867,one
1,-0.07583,-0.441421,-1.327857,1.0,one
2,-0.07583,-0.441421,-1.327857,1.0,one
3,0.684801,2.708749,-1.745054,1.0,three
4,0.684801,2.708749,-1.745054,1.0,three
5,-1.104486,-1.939933,1.772453,1.0,three
6,-1.247001,-0.03202,0.070697,1.0,five
7,-1.767824,-2.027048,1.036384,1.0,six
8,1.075074,-0.821947,1.196328,-0.888075,seven
9,-0.190531,-0.595926,-2.567793,1.0,eight


In [332]:
# Backward fill: replace each missing value with the next valid value below it
my_df2.bfill()

Unnamed: 0,c1,c2,c3,c4,E
0,3.429912,-0.066049,0.670934,0.010867,one
1,-0.07583,-0.441421,-1.327857,1.0,one
2,0.684801,2.708749,-1.745054,1.0,three
3,0.684801,2.708749,-1.745054,1.0,three
4,-1.104486,-1.939933,1.772453,1.0,three
5,-1.104486,-1.939933,1.772453,1.0,three
6,-1.247001,-0.03202,0.070697,1.0,five
7,-1.767824,-2.027048,1.036384,1.0,six
8,1.075074,-0.821947,1.196328,-0.888075,seven
9,-0.190531,-0.595926,-2.567793,1.0,eight


In [376]:
# Linear interpolation: fill each numeric gap from its neighbouring values.
# In the recorded output the string column E is left unfilled.
# NOTE(review): interpolating a frame that contains an object column may warn
# or fail on newer pandas versions — confirm on the target version.
my_df2.interpolate(method='linear')

Unnamed: 0,c1,c2,c3,c4,E
0,3.429912,-0.066049,0.670934,0.010867,one
1,-0.07583,-0.441421,-1.327857,1.0,one
2,0.304485,1.133664,-1.536456,1.0,
3,0.684801,2.708749,-1.745054,1.0,three
4,-0.209842,0.384408,0.013699,1.0,
5,-1.104486,-1.939933,1.772453,1.0,three
6,-1.247001,-0.03202,0.070697,1.0,five
7,-1.767824,-2.027048,1.036384,1.0,six
8,1.075074,-0.821947,1.196328,-0.888075,seven
9,-0.190531,-0.595926,-2.567793,1.0,eight


## Detecting outliers

In [334]:
#use boolean indexing

## Data Conversions
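Here are some important pandas data conversions (the `errors` argument controls what happens to values that cannot be converted):
* `pd.to_datetime(data, errors='coerce')` — unparseable values become `NaT`
* `pd.to_numeric(data, errors='raise')` — unparseable values raise an exception
* `pd.to_timedelta(data, errors='ignore')` — unparseable input is returned unchanged
* `pd.to_pickle(data, path)` — serializes an object to a file (persistence rather than a type conversion)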

In [335]:
# Column dtypes before any conversion (E is still categorical here)
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [336]:
# Convert E from category to plain Python objects (strings), then re-check
# the column dtypes
df2['E'] = df2['E'].astype(object)
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E            object
F            object
dtype: object

### Pandas String methods

In [337]:
# The .str accessor applies string methods element-wise:
# store an upper-cased copy of E in a new column
df2['Eup'] = df2['E'].str.upper()
df2

Unnamed: 0,A,B,C,D,E,F,Eup
0,1.0,2013-01-02,1.0,3,test,foo,TEST
1,1.0,2013-01-02,1.0,3,train,foo,TRAIN
2,1.0,2013-01-02,1.0,3,test,foo,TEST
3,1.0,2013-01-02,1.0,3,train,foo,TRAIN


In [338]:
# Boolean mask that is True for the rows whose E value is 'train'
train_select = df2['E'].isin(['train'])
train_select

0    False
1     True
2    False
3     True
Name: E, dtype: bool

In [339]:
# Upper-case E in place, but only for the rows selected by the mask
df2.loc[train_select, 'E'] = df2.loc[train_select, 'E'].str.upper()
df2

Unnamed: 0,A,B,C,D,E,F,Eup
0,1.0,2013-01-02,1.0,3,test,foo,TEST
1,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN
2,1.0,2013-01-02,1.0,3,test,foo,TEST
3,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN


In [340]:
# Inject a non-numeric value into D to set up the clean-up recipe that follows.
# NOTE(review): D was created as int32, so writing a string forces the column
# to a different dtype; newer pandas versions reportedly warn about (and may
# reject) such incompatible writes — confirm on the target version.
df2.loc[0,'D'] = '3a'
df2

Unnamed: 0,A,B,C,D,E,F,Eup
0,1.0,2013-01-02,1.0,3a,test,foo,TEST
1,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN
2,1.0,2013-01-02,1.0,3,test,foo,TEST
3,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN


In [341]:
# Handy recipe: strip every non-digit character from a column that should be
# numeric. Each value is converted to str first, so the result is a column of
# digit-only strings.
non_digit = re.compile('[^0-9]')
df2.loc[:, 'D'] = df2['D'].apply(lambda raw: non_digit.sub('', str(raw)))
df2

Unnamed: 0,A,B,C,D,E,F,Eup
0,1.0,2013-01-02,1.0,3,test,foo,TEST
1,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN
2,1.0,2013-01-02,1.0,3,test,foo,TEST
3,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN


## Categoricals

In [342]:
# Small example frame: six ids, each with a raw letter grade
df3 = pd.DataFrame({
    "id": list(range(1, 7)),
    "raw_grade": list('abbaae'),
})

In [343]:
# Convert the raw grades to a categorical column; the categories are the
# distinct values found in the data (a, b, e)
df3["grade"] = df3["raw_grade"].astype("category")
df3["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

In [344]:
# Give the categories meaningful names. The new names are matched to the
# existing categories by position (a, b, e -> very good, good, very bad).
# Assigning to `.cat.categories` is no longer supported (removed in
# pandas 2.0); rename_categories returns a renamed copy instead.
df3["grade"] = df3["grade"].cat.rename_categories(["very good", "good", "very bad"])
df3["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad]

In [345]:
# Redefine the full set of categories and their order; categories with no
# rows ("bad", "medium") are added without changing the existing values
df3["grade"] = df3["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df3["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]

In [346]:
# Sorting a categorical column follows the category order, not alphabetical order
df3.sort_values(by='grade')

Unnamed: 0,id,raw_grade,grade
5,6,e,very bad
1,2,b,good
2,3,b,good
0,1,a,very good
3,4,a,very good
4,5,a,very good


In [347]:
# Count the rows per grade. observed=False keeps the categories that have no
# rows ("bad", "medium") in the result with a count of 0; stating it
# explicitly avoids depending on the version-specific default for
# categorical groupers.
df3.groupby('grade', observed=False).size()

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

### Ordered categories

In [348]:
# Ordered categories: describe the allowed values and their order with a
# CategoricalDtype and cast to it. (Passing `categories=` / `ordered=`
# directly to astype('category', ...) is no longer supported by pandas.)
# Values that are not among the categories ('mad') become NaN.
ordered_satisfaction = ['very bad', 'bad','neutral','good','very good']
satis_df = pd.DataFrame({"satisfactions":['mad','good','neutral','very good','bad']})
satisfaction_dtype = pd.CategoricalDtype(categories=ordered_satisfaction, ordered=True)
satis_df['satisfactions2'] = satis_df['satisfactions'].astype(satisfaction_dtype)
satis_df

Unnamed: 0,satisfactions,satisfactions2
0,mad,
1,good,good
2,neutral,neutral
3,very good,very good
4,bad,bad


In [349]:
# Confirm the dtypes: the original column is object, the converted one is category
satis_df.dtypes

satisfactions       object
satisfactions2    category
dtype: object

### Category codes

In [350]:
# Integer code of each value = position of its category in the category list;
# -1 marks a missing value (here the unmatched 'mad')
satis_df['satisfactions2'].cat.codes

0   -1
1    3
2    2
3    4
4    1
dtype: int8

### Converting to categorical indicators (one hot encoding)

In [351]:
# One-hot encode a plain list: one indicator column per distinct value
pd.get_dummies(ordered_satisfaction)

Unnamed: 0,bad,good,neutral,very bad,very good
0,0,0,0,1,0
1,1,0,0,0,0
2,0,0,1,0,0
3,0,1,0,0,0
4,0,0,0,0,1


In [352]:
# One-hot encode only the 'satisfactions' column of the frame: it is replaced
# by indicator columns prefixed with the column name, the other column is kept
satis_dummy_df = pd.get_dummies(satis_df, columns=['satisfactions'])
satis_dummy_df

Unnamed: 0,satisfactions2,satisfactions_bad,satisfactions_good,satisfactions_mad,satisfactions_neutral,satisfactions_very good
0,,0,0,1,0,0
1,good,0,1,0,0,0
2,neutral,0,0,0,1,0
3,very good,0,0,0,0,1
4,bad,1,0,0,0,0


## Text to Features
### Bag of Words

In [353]:
# Toy corpus for the bag-of-words example: three short documents
corpus = [
    "Think of your machine learning models as if they were children who have absolutely no knowledge",
    "except what you train them with;",
    "what information would they need to know to make the right decisions?",
]

In [354]:
# Bag of words: learn the vocabulary of the corpus and turn each document
# into a row of token counts (one row per document, one column per token).
# The result is a sparse matrix.
bow = CountVectorizer()
X = bow.fit_transform(corpus)
X

<3x31 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [355]:
# vocabulary_ holds the token -> column index mapping. Only the last
# expression of a cell is displayed, so it is evaluated but not shown here.
bow.vocabulary_
# Tokens in column order. get_feature_names() was removed from scikit-learn;
# get_feature_names_out() replaces it and returns an array, so convert it to
# a plain list for display.
bow.get_feature_names_out().tolist()

['absolutely',
 'as',
 'children',
 'decisions',
 'except',
 'have',
 'if',
 'information',
 'know',
 'knowledge',
 'learning',
 'machine',
 'make',
 'models',
 'need',
 'no',
 'of',
 'right',
 'the',
 'them',
 'they',
 'think',
 'to',
 'train',
 'were',
 'what',
 'who',
 'with',
 'would',
 'you',
 'your']

In [356]:
# Only the last expression of a cell is displayed, so stop_words_ is
# evaluated but not shown.
bow.stop_words_
# Stop-word list configured on the vectorizer; the recorded run displayed
# nothing for this call.
bow.get_stop_words()

In [357]:
# Two ways to densify the sparse count matrix. Only the last expression is
# displayed (todense(), shown as a numpy matrix); the toarray() result is
# discarded.
X.toarray()
X.todense()

matrix([[1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,
         0, 0, 1, 0, 1, 0, 0, 0, 1],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         0, 1, 0, 1, 0, 1, 0, 1, 0],
        [0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
         2, 0, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)

In [358]:
# Map each row of counts back to the tokens that occur in that document
bow.inverse_transform(X)

[array(['knowledge', 'no', 'absolutely', 'have', 'who', 'children', 'were',
        'they', 'if', 'as', 'models', 'learning', 'machine', 'your', 'of',
        'think'],
       dtype='<U11'), array(['with', 'them', 'train', 'you', 'what', 'except'],
       dtype='<U11'), array(['decisions', 'right', 'the', 'make', 'know', 'to', 'need', 'would',
        'information', 'what', 'they'],
       dtype='<U11')]

## Images to Features

In [359]:
# Load the logo as an RGB array.
# NOTE(review): scipy.misc.imread is not available in recent SciPy releases;
# running this on a current environment needs another image reader
# (e.g. imageio) — not swapped here to avoid adding a dependency.
img_rgb = misc.imread(name='CapLogo.png',flatten=False,mode='RGB')
# Inspect the array that was just loaded (`img` is never defined in this
# notebook, so the variable to check is img_rgb).
type(img_rgb)

numpy.ndarray

In [360]:
# Dimensions of the loaded RGB image array
img_rgb.shape

(115, 300, 3)

In [361]:
# Element type of the RGB array
img_rgb.dtype

dtype('uint8')

In [362]:
# Downsample the image if it is too big: keep every second element along the
# first two axes. Neighbouring pixels are heavily correlated, so little
# information is lost.
# NOTE: this rebinds img_rgb, so re-running the cell halves the image again.
img_rgb = img_rgb[::2, ::2]
img_rgb.shape

(58, 150, 3)

In [363]:
# Load the same file again, this time flattened (flatten=True) into a single
# grayscale layer
img_gray = misc.imread(name='CapLogo.png',flatten=True,mode='RGB')
type(img_gray)

numpy.ndarray

In [364]:
# The grayscale array is two-dimensional (no colour axis)
img_gray.shape

(115, 300)

In [365]:
# Element type of the grayscale array
img_gray.dtype

dtype('float32')

In [366]:
# Scale the pixel values by 1/255 so they lie between 0 and 1, then flatten
# the image into a one-dimensional feature vector
img_gray_scaled = img_gray / 255.
img_gray = img_gray_scaled.reshape(-1)
img_gray

array([ 1.,  1.,  1., ...,  1.,  1.,  1.], dtype=float32)

In [367]:
# Length of the flattened feature vector (one entry per pixel)
img_gray.shape

(34500,)

## Audio to Features
**Make sure the files have the _same sample rate_ when comparing WAV files!**

In [368]:
# Read a WAV file: returns the sample rate and the array of samples
sample_rate, audio_data = wavfile.read('test.wav')
sample_rate

44100

In [369]:
# The raw samples as a NumPy array
audio_data

array([     0,   1221,   2440, ..., -31277, -30892, -30463], dtype=int16)

In [370]:
# The sample rate is returned as a plain Python int
type(sample_rate)

int

In [371]:
# Number of samples in the recording (one-dimensional array)
audio_data.shape

(22050,)

## Normalizing Data Sets

In [372]:
#use sklearn preprocessing of numpy arrays
#so remember to convert to numpy arrays!

## Feature Engineering