# Data Wrangling

## Some Pandas Basics
A Dataframe can be best seen as a collection of Series.

In [11]:
import pandas as pd
import numpy as np
import scipy.io.wavfile as wavfile
import re
from sklearn.feature_extraction.text import CountVectorizer
from scipy import misc
from sklearn.datasets import load_iris
from sklearn import preprocessing

```python
#create DF from a file
from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:')

#read data from a SQL Database
sql_df   = pd.read_sql_table('my_table', engine, columns=['ColA', 'ColB'])

xls_df   = pd.read_excel('my_dataset.xlsx', 'Sheet1', na_values=['NA', '?'])
json_df  = pd.read_json('my_dataset.json', orient='columns')
csv_df   = pd.read_csv('my_dataset.csv', sep=',')

#loads all HTML tables into a list of DataFrames
html_df = pd.read_html('http://page.com/with/table.html')[0]

#writing dataframe back to disk
sql_df.to_sql('table', engine)
xls_df.to_excel('dataset.xlsx')
json_df.to_json('dataset.json')
csv_df.to_csv('dataset.csv')
```

In [12]:
#create df from dicts
df2 = pd.DataFrame({ 'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo' })
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [13]:
#create DF from Random data
my_df = pd.DataFrame(np.random.randn(10,4))
my_df

Unnamed: 0,0,1,2,3
0,0.736554,0.269159,0.253452,0.501988
1,-1.103899,1.334553,-0.178264,-0.17834
2,1.052203,0.996838,-1.271246,-0.488833
3,-0.756077,-0.018653,-0.516593,-1.174101
4,1.828283,-0.902417,0.670401,-0.937503
5,0.388019,0.649875,-0.246614,0.236178
6,0.508946,0.431149,-0.595974,-0.867091
7,0.501378,-0.324754,0.7712,0.457972
8,-1.013292,0.752553,0.309467,-1.006771
9,-0.90484,0.46975,1.391168,0.241953


In [14]:
#View columns
my_df.columns

RangeIndex(start=0, stop=4, step=1)

In [15]:
#rename columns
my_df.columns = ['c1', 'c2', 'c3', 'c4']
my_df.columns

Index(['c1', 'c2', 'c3', 'c4'], dtype='object')

In [16]:
#head of data
my_df.head(3)

Unnamed: 0,c1,c2,c3,c4
0,0.736554,0.269159,0.253452,0.501988
1,-1.103899,1.334553,-0.178264,-0.17834
2,1.052203,0.996838,-1.271246,-0.488833


In [17]:
#tail of data
my_df.tail(3)

Unnamed: 0,c1,c2,c3,c4
7,0.501378,-0.324754,0.7712,0.457972
8,-1.013292,0.752553,0.309467,-1.006771
9,-0.90484,0.46975,1.391168,0.241953


In [18]:
#summary statistics
my_df.describe()

Unnamed: 0,c1,c2,c3,c4
count,10.0,10.0,10.0,10.0
mean,0.123727,0.365805,0.0587,-0.321455
std,1.007521,0.651871,0.77716,0.652886
min,-1.103899,-0.902417,-1.271246,-1.174101
25%,-0.86765,0.0533,-0.449098,-0.9199
50%,0.444699,0.450449,0.037594,-0.333586
75%,0.679652,0.726883,0.580168,0.24051
max,1.828283,1.334553,1.391168,0.501988


In [19]:
#view indices
my_df.index

RangeIndex(start=0, stop=10, step=1)

In [20]:
#view df types
#objects are strings
my_df.dtypes

c1    float64
c2    float64
c3    float64
c4    float64
dtype: object

In [21]:
#transpose dataframe
my_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
c1,0.736554,-1.103899,1.052203,-0.756077,1.828283,0.388019,0.508946,0.501378,-1.013292,-0.90484
c2,0.269159,1.334553,0.996838,-0.018653,-0.902417,0.649875,0.431149,-0.324754,0.752553,0.46975
c3,0.253452,-0.178264,-1.271246,-0.516593,0.670401,-0.246614,-0.595974,0.7712,0.309467,1.391168
c4,0.501988,-0.17834,-0.488833,-1.174101,-0.937503,0.236178,-0.867091,0.457972,-1.006771,0.241953


In [22]:
my_df.c1.unique()

array([ 0.73655357, -1.10389918,  1.05220268, -0.75607731,  1.82828301,
        0.3880193 ,  0.5089456 ,  0.50137846, -1.0132917 , -0.90484043])

In [23]:
my_df.sort_values(by='c1')

Unnamed: 0,c1,c2,c3,c4
1,-1.103899,1.334553,-0.178264,-0.17834
8,-1.013292,0.752553,0.309467,-1.006771
9,-0.90484,0.46975,1.391168,0.241953
3,-0.756077,-0.018653,-0.516593,-1.174101
5,0.388019,0.649875,-0.246614,0.236178
7,0.501378,-0.324754,0.7712,0.457972
6,0.508946,0.431149,-0.595974,-0.867091
0,0.736554,0.269159,0.253452,0.501988
2,1.052203,0.996838,-1.271246,-0.488833
4,1.828283,-0.902417,0.670401,-0.937503


In [24]:
#slicing and dicing

#returns series
my_df.c1
#returns series
my_df['c1']

#returns dataframe
my_df[['c1']]

#loc is inclusive of end value range
#returns series
my_df.loc[:, 'c1']
#returns dataframe
my_df.loc[:, ['c1']]

#iloc is exclusive of end value range
#returns series
my_df.iloc[:, 0]
#returns dataframe
my_df.iloc[:, [0]]

my_df[0:2]
my_df.iloc[0:2, :]

print(type(my_df.c1))
print(type(my_df['c1']))
print(type(my_df[['c1']]))
print(type(my_df.loc[:, 'c1']))
print(type(my_df.loc[:, ['c1']]))
print(type(my_df.iloc[:, 0]))
print(type(my_df.iloc[:, [0]]))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [25]:
#Boolean indexing
#Can be further combined using bitwise operators
bool_df = my_df[(my_df.c1 > 1) & (my_df.c2 < 1)]
bool_df

Unnamed: 0,c1,c2,c3,c4
2,1.052203,0.996838,-1.271246,-0.488833
4,1.828283,-0.902417,0.670401,-0.937503


In [26]:
#writing to a slice
#be sure to write data on a per column basis because of homogeneous column datatypes
my_df.loc[my_df.c1 < 1, ['c4']] = 1
my_df

Unnamed: 0,c1,c2,c3,c4
0,0.736554,0.269159,0.253452,1.0
1,-1.103899,1.334553,-0.178264,1.0
2,1.052203,0.996838,-1.271246,-0.488833
3,-0.756077,-0.018653,-0.516593,1.0
4,1.828283,-0.902417,0.670401,-0.937503
5,0.388019,0.649875,-0.246614,1.0
6,0.508946,0.431149,-0.595974,1.0
7,0.501378,-0.324754,0.7712,1.0
8,-1.013292,0.752553,0.309467,1.0
9,-0.90484,0.46975,1.391168,1.0


In [27]:
#using isin method for filtering
my_df2 = my_df.copy()
my_df2['E'] = ['one', 'one','two','three','four','three','five','six','seven','eight']
my_df2[my_df2['E'].isin(['two','four'])] = np.nan
my_df2

Unnamed: 0,c1,c2,c3,c4,E
0,0.736554,0.269159,0.253452,1.0,one
1,-1.103899,1.334553,-0.178264,1.0,one
2,,,,,
3,-0.756077,-0.018653,-0.516593,1.0,three
4,,,,,
5,0.388019,0.649875,-0.246614,1.0,three
6,0.508946,0.431149,-0.595974,1.0,five
7,0.501378,-0.324754,0.7712,1.0,six
8,-1.013292,0.752553,0.309467,1.0,seven
9,-0.90484,0.46975,1.391168,1.0,eight


In [28]:
#concat dataframes
df3 = pd.DataFrame(np.random.randn(10, 4))
pieces = [df3[:3], df3[3:7], df3[7:]]
#you can also concat on axis=1 which is columns instead of rows
pd.concat(pieces, ignore_index=False)

Unnamed: 0,0,1,2,3
0,-1.006502,0.721945,-1.088461,1.263342
1,1.02514,0.565729,-1.10789,0.327143
2,0.341378,-0.467268,0.27901,-1.088711
3,0.450944,1.697789,0.282701,0.161627
4,0.886552,-1.162972,0.66799,0.071254
5,0.663216,0.218618,-0.077011,1.184447
6,-0.744467,-1.270784,-0.705477,0.272665
7,-1.088776,1.387077,-0.413942,-0.469358
8,0.209674,-0.474754,-0.924084,1.006825
9,-1.20319,0.628853,-0.181042,-0.899254


In [29]:
#Append rows
df4 = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
s4 = df4.iloc[3]
df4.append(s4, ignore_index=False)
#you can also append dataframe to conconcat them on rows

Unnamed: 0,A,B,C,D
0,0.779678,1.534846,0.222188,0.506277
1,-0.948113,-0.522177,-0.054997,0.880036
2,-0.615853,0.031428,1.397151,0.398182
3,0.945864,0.40114,0.252321,-0.794253
4,0.29571,-1.923325,-0.299958,-1.517742
5,-0.261086,1.493882,-0.251637,0.856025
6,0.978454,-0.282706,-0.680783,0.811201
7,-1.163855,0.982866,0.196933,0.517124
3,0.945864,0.40114,0.252321,-0.794253


In [30]:
#joining
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


### Generate Ranges with Numpy
Useful for hyperparameter tuning

In [78]:
C_l = np.arange(0.05,2,0.05)
C_l

array([ 0.05,  0.1 ,  0.15,  0.2 ,  0.25,  0.3 ,  0.35,  0.4 ,  0.45,
        0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,
        0.95,  1.  ,  1.05,  1.1 ,  1.15,  1.2 ,  1.25,  1.3 ,  1.35,
        1.4 ,  1.45,  1.5 ,  1.55,  1.6 ,  1.65,  1.7 ,  1.75,  1.8 ,
        1.85,  1.9 ,  1.95])

In [80]:
gamma_l = np.arange(0.001,0.1,0.001)
gamma_l

array([ 0.001,  0.002,  0.003,  0.004,  0.005,  0.006,  0.007,  0.008,
        0.009,  0.01 ,  0.011,  0.012,  0.013,  0.014,  0.015,  0.016,
        0.017,  0.018,  0.019,  0.02 ,  0.021,  0.022,  0.023,  0.024,
        0.025,  0.026,  0.027,  0.028,  0.029,  0.03 ,  0.031,  0.032,
        0.033,  0.034,  0.035,  0.036,  0.037,  0.038,  0.039,  0.04 ,
        0.041,  0.042,  0.043,  0.044,  0.045,  0.046,  0.047,  0.048,
        0.049,  0.05 ,  0.051,  0.052,  0.053,  0.054,  0.055,  0.056,
        0.057,  0.058,  0.059,  0.06 ,  0.061,  0.062,  0.063,  0.064,
        0.065,  0.066,  0.067,  0.068,  0.069,  0.07 ,  0.071,  0.072,
        0.073,  0.074,  0.075,  0.076,  0.077,  0.078,  0.079,  0.08 ,
        0.081,  0.082,  0.083,  0.084,  0.085,  0.086,  0.087,  0.088,
        0.089,  0.09 ,  0.091,  0.092,  0.093,  0.094,  0.095,  0.096,
        0.097,  0.098,  0.099])

## Unique Values

In [31]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [32]:
df2.D.unique()

array([3])

In [33]:
df2.D.value_counts()

3    4
Name: D, dtype: int64

In [34]:
df2.E.unique()

[test, train]
Categories (2, object): [test, train]

In [35]:
df2.E.value_counts()

train    2
test     2
Name: E, dtype: int64

In [75]:
df2.apply(pd.Series.nunique)

A      1
B      1
C      1
D      1
E      2
F      1
Eup    2
dtype: int64

In [37]:
def cnt_unique_vals(df):
  col_uni_val={}
  for i in df.columns:
    col_uni_val[i] = len(df[i].unique())
  #Import pprint to display dic nicely:
  import pprint
  pprint.pprint(col_uni_val)

In [38]:
cnt_unique_vals(df2)

{'A': 1, 'B': 1, 'C': 1, 'D': 1, 'E': 2, 'F': 1}


## Drop duplicates

In [39]:
my_df2.drop_duplicates()

Unnamed: 0,c1,c2,c3,c4,E
0,0.736554,0.269159,0.253452,1.0,one
1,-1.103899,1.334553,-0.178264,1.0,one
2,,,,,
3,-0.756077,-0.018653,-0.516593,1.0,three
5,0.388019,0.649875,-0.246614,1.0,three
6,0.508946,0.431149,-0.595974,1.0,five
7,0.501378,-0.324754,0.7712,1.0,six
8,-1.013292,0.752553,0.309467,1.0,seven
9,-0.90484,0.46975,1.391168,1.0,eight


In [40]:
my_df2.drop_duplicates(subset=['c1','E'], inplace=False)

Unnamed: 0,c1,c2,c3,c4,E
0,0.736554,0.269159,0.253452,1.0,one
1,-1.103899,1.334553,-0.178264,1.0,one
2,,,,,
3,-0.756077,-0.018653,-0.516593,1.0,three
5,0.388019,0.649875,-0.246614,1.0,three
6,0.508946,0.431149,-0.595974,1.0,five
7,0.501378,-0.324754,0.7712,1.0,six
8,-1.013292,0.752553,0.309467,1.0,seven
9,-0.90484,0.46975,1.391168,1.0,eight


## Detecting and replacing missing values 

In [41]:
#count missing values
my_df2[['c2']].isnull().sum()

c2    2
dtype: int64

In [42]:
#get boolean values where dataframe is null
my_df2.isnull()

Unnamed: 0,c1,c2,c3,c4,E
0,False,False,False,False,False
1,False,False,False,False,False
2,True,True,True,True,True
3,False,False,False,False,False
4,True,True,True,True,True
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [43]:
#drop missing data
my_df2.dropna()

Unnamed: 0,c1,c2,c3,c4,E
0,0.736554,0.269159,0.253452,1.0,one
1,-1.103899,1.334553,-0.178264,1.0,one
3,-0.756077,-0.018653,-0.516593,1.0,three
5,0.388019,0.649875,-0.246614,1.0,three
6,0.508946,0.431149,-0.595974,1.0,five
7,0.501378,-0.324754,0.7712,1.0,six
8,-1.013292,0.752553,0.309467,1.0,seven
9,-0.90484,0.46975,1.391168,1.0,eight


In [44]:
#filling missing datab
my_df2.fillna(value=3)

Unnamed: 0,c1,c2,c3,c4,E
0,0.736554,0.269159,0.253452,1.0,one
1,-1.103899,1.334553,-0.178264,1.0,one
2,3.0,3.0,3.0,3.0,3
3,-0.756077,-0.018653,-0.516593,1.0,three
4,3.0,3.0,3.0,3.0,3
5,0.388019,0.649875,-0.246614,1.0,three
6,0.508946,0.431149,-0.595974,1.0,five
7,0.501378,-0.324754,0.7712,1.0,six
8,-1.013292,0.752553,0.309467,1.0,seven
9,-0.90484,0.46975,1.391168,1.0,eight


In [45]:
my_df2.fillna(my_df2.mean(axis=0))

Unnamed: 0,c1,c2,c3,c4,E
0,0.736554,0.269159,0.253452,1.0,one
1,-1.103899,1.334553,-0.178264,1.0,one
2,-0.205401,0.445454,0.14848,1.0,
3,-0.756077,-0.018653,-0.516593,1.0,three
4,-0.205401,0.445454,0.14848,1.0,
5,0.388019,0.649875,-0.246614,1.0,three
6,0.508946,0.431149,-0.595974,1.0,five
7,0.501378,-0.324754,0.7712,1.0,six
8,-1.013292,0.752553,0.309467,1.0,seven
9,-0.90484,0.46975,1.391168,1.0,eight


In [46]:
#forward fill
my_df2.ffill()

Unnamed: 0,c1,c2,c3,c4,E
0,0.736554,0.269159,0.253452,1.0,one
1,-1.103899,1.334553,-0.178264,1.0,one
2,-1.103899,1.334553,-0.178264,1.0,one
3,-0.756077,-0.018653,-0.516593,1.0,three
4,-0.756077,-0.018653,-0.516593,1.0,three
5,0.388019,0.649875,-0.246614,1.0,three
6,0.508946,0.431149,-0.595974,1.0,five
7,0.501378,-0.324754,0.7712,1.0,six
8,-1.013292,0.752553,0.309467,1.0,seven
9,-0.90484,0.46975,1.391168,1.0,eight


In [47]:
#backward fill
my_df2.bfill()

Unnamed: 0,c1,c2,c3,c4,E
0,0.736554,0.269159,0.253452,1.0,one
1,-1.103899,1.334553,-0.178264,1.0,one
2,-0.756077,-0.018653,-0.516593,1.0,three
3,-0.756077,-0.018653,-0.516593,1.0,three
4,0.388019,0.649875,-0.246614,1.0,three
5,0.388019,0.649875,-0.246614,1.0,three
6,0.508946,0.431149,-0.595974,1.0,five
7,0.501378,-0.324754,0.7712,1.0,six
8,-1.013292,0.752553,0.309467,1.0,seven
9,-0.90484,0.46975,1.391168,1.0,eight


In [48]:
#interpolate
my_df2.interpolate(method='linear')

Unnamed: 0,c1,c2,c3,c4,E
0,0.736554,0.269159,0.253452,1.0,one
1,-1.103899,1.334553,-0.178264,1.0,one
2,-0.929988,0.65795,-0.347429,1.0,
3,-0.756077,-0.018653,-0.516593,1.0,three
4,-0.184029,0.315611,-0.381603,1.0,
5,0.388019,0.649875,-0.246614,1.0,three
6,0.508946,0.431149,-0.595974,1.0,five
7,0.501378,-0.324754,0.7712,1.0,six
8,-1.013292,0.752553,0.309467,1.0,seven
9,-0.90484,0.46975,1.391168,1.0,eight


## Detecting outliers

In [49]:
#use boolean indexing

## Data Conversions
Here are some important pandas data conversions:
* pd.to_datetime(data, errors='coerce')
* pd.to_numeric(data, errors='raise')
* pd.to_timedelta(data, errors='ignore')
* pd.to_pickle(data)

In [50]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [51]:
df2['E'] = df2['E'].astype(dtype='object')
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E            object
F            object
dtype: object

### Pandas String methods

In [52]:
df2['Eup'] = df2.E.str.upper()
df2

Unnamed: 0,A,B,C,D,E,F,Eup
0,1.0,2013-01-02,1.0,3,test,foo,TEST
1,1.0,2013-01-02,1.0,3,train,foo,TRAIN
2,1.0,2013-01-02,1.0,3,test,foo,TEST
3,1.0,2013-01-02,1.0,3,train,foo,TRAIN


In [53]:
train_select = df2.E.isin(['train'])
train_select

0    False
1     True
2    False
3     True
Name: E, dtype: bool

In [54]:
df2.loc[train_select,'E'] = df2.loc[train_select,'E'].apply(lambda x : x.upper())
df2

Unnamed: 0,A,B,C,D,E,F,Eup
0,1.0,2013-01-02,1.0,3,test,foo,TEST
1,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN
2,1.0,2013-01-02,1.0,3,test,foo,TEST
3,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN


In [55]:
df2.loc[0,'D'] = '3a'
df2

Unnamed: 0,A,B,C,D,E,F,Eup
0,1.0,2013-01-02,1.0,3a,test,foo,TEST
1,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN
2,1.0,2013-01-02,1.0,3,test,foo,TEST
3,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN


In [56]:
#handy recipe to replace non numbers in a numerical with empty string
df2.loc[:,'D'] = df2.D.apply(lambda x : re.sub('[^0-9]','',str(x)))
df2

Unnamed: 0,A,B,C,D,E,F,Eup
0,1.0,2013-01-02,1.0,3,test,foo,TEST
1,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN
2,1.0,2013-01-02,1.0,3,test,foo,TEST
3,1.0,2013-01-02,1.0,3,TRAIN,foo,TRAIN


## Categoricals

In [57]:
df3 = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})

In [58]:
df3["grade"] = df3["raw_grade"].astype("category")
df3["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

In [59]:
df3["grade"].cat.categories = ["very good", "good", "very bad"]
df3["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad]

In [60]:
df3["grade"] = df3["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df3["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]

In [61]:
df3.sort_values(by='grade')

Unnamed: 0,id,raw_grade,grade
5,6,e,very bad
1,2,b,good
2,3,b,good
0,1,a,very good
3,4,a,very good
4,5,a,very good


In [62]:
df3.groupby('grade').size()

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

### Ordered categories

In [63]:
ordered_satisfaction = ['very bad', 'bad','neutral','good','very good']
satis_df = pd.DataFrame({"satisfactions":['mad','good','neutral','very good','bad']})
satis_df['satisfactions2'] = satis_df['satisfactions'].astype('category', ordered=True, categories=ordered_satisfaction)
satis_df

Unnamed: 0,satisfactions,satisfactions2
0,mad,
1,good,good
2,neutral,neutral
3,very good,very good
4,bad,bad


In [64]:
satis_df.dtypes

satisfactions       object
satisfactions2    category
dtype: object

### Category codes

In [65]:
satis_df['satisfactions2'].cat.codes

0   -1
1    3
2    2
3    4
4    1
dtype: int8

### Converting to categorical indicators (one hot encoding)

In [66]:
pd.get_dummies(ordered_satisfaction)

Unnamed: 0,bad,good,neutral,very bad,very good
0,0,0,0,1,0
1,1,0,0,0,0
2,0,0,1,0,0
3,0,1,0,0,0
4,0,0,0,0,1


In [67]:
satis_dummy_df = pd.get_dummies(satis_df, columns=['satisfactions'])
satis_dummy_df

Unnamed: 0,satisfactions2,satisfactions_bad,satisfactions_good,satisfactions_mad,satisfactions_neutral,satisfactions_very good
0,,0,0,1,0,0
1,good,0,1,0,0,0
2,neutral,0,0,0,1,0
3,very good,0,0,0,0,1
4,bad,1,0,0,0,0


## Text to Features
### Bag of Words

In [68]:
corpus = ["Think of your machine learning models as if they were children who have absolutely no knowledge",
          "except what you train them with;",
          "what information would they need to know to make the right decisions?"]

In [69]:
bow = CountVectorizer()
X = bow.fit_transform(corpus)
X

<3x31 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [70]:
bow.vocabulary_
bow.get_feature_names()

['absolutely',
 'as',
 'children',
 'decisions',
 'except',
 'have',
 'if',
 'information',
 'know',
 'knowledge',
 'learning',
 'machine',
 'make',
 'models',
 'need',
 'no',
 'of',
 'right',
 'the',
 'them',
 'they',
 'think',
 'to',
 'train',
 'were',
 'what',
 'who',
 'with',
 'would',
 'you',
 'your']

In [71]:
bow.stop_words_
bow.get_stop_words()

In [72]:
X.toarray()
X.todense()

matrix([[1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,
         0, 0, 1, 0, 1, 0, 0, 0, 1],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         0, 1, 0, 1, 0, 1, 0, 1, 0],
        [0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
         2, 0, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)

In [73]:
bow.inverse_transform(X)

[array(['knowledge', 'no', 'absolutely', 'have', 'who', 'children', 'were',
        'they', 'if', 'as', 'models', 'learning', 'machine', 'your', 'of',
        'think'],
       dtype='<U11'), array(['with', 'them', 'train', 'you', 'what', 'except'],
       dtype='<U11'), array(['decisions', 'right', 'the', 'make', 'know', 'to', 'need', 'would',
        'information', 'what', 'they'],
       dtype='<U11')]

## Images to Features

In [74]:
img_rgb = misc.imread(name='CapLogo.png',flatten=False,mode='RGB')
type(img)

NameError: name 'img' is not defined

In [None]:
img_rgb.shape

In [None]:
img_rgb.dtype

In [None]:
#resample image if it is too big
#every 2 pixels close to each other are heavily correlated
img_rgb = img_rgb[::2, ::2]
img_rgb.shape

In [None]:
img_gray = misc.imread(name='CapLogo.png',flatten=True,mode='RGB')
type(img_gray)

In [None]:
img_gray.shape

In [None]:
img_gray.dtype

In [None]:
#normalize values between 0 and 1
img_gray = (img_gray/255.).reshape(-1,)
img_gray

In [None]:
img_gray.shape

## Audio to Features
**Be sure to have _same sample rates_ when comparing wav files!!!**

In [None]:
sample_rate, audio_data = wavfile.read('test.wav')
sample_rate

In [None]:
audio_data

In [None]:
type(sample_rate)

In [None]:
audio_data.shape

## Normalizing Data Sets

- use sklearn preprocessing of numpy arrays
- so remember to convert to numpy arrays!

http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing

In [None]:
def scaleFeaturesDF(df):
    # Feature scaling is a type of transformation that only changes the
    # scale, but not number of features. Because of this, we can still
    # use the original dataset's column names... so long as we keep in
    # mind that the _units_ have been altered:

    scaled = preprocessing.StandardScaler().fit_transform(df)
    scaled = pd.DataFrame(scaled, columns=df.columns)
    
    print("New Variances:\n", scaled.var())
    print("New Describe:\n", scaled.describe())
    return scaled

In [None]:
from sklearn import preprocessing
#X = preprocessing.StandardScaler().fit_transform(df)
#X = preprocessing.MinMaxScaler().fit_transform(df)
#X = preprocessing.MaxAbsScaler().fit_transform(df)
#X = preprocessing.Normalizer().fit_transform(df)
#T = df # No Change

## Feature Engineering

### Principal Component Analysis (PCA)
You can use ```python X_rec = pca.inverse_transform(X_proj)``` to go back and forth the original feature space and the new feature space project onto principal components. A handy [link](https://onlinecourses.science.psu.edu/stat505/node/54) to interpret PCA.

In [None]:
def do_PCA(np_arr, n_components, svd_solver):
    from sklearn.decomposition import PCA
    pca = PCA(n_components=n_components, svd_solver=svd_solver)
    pca.fit(np_arr)
    reduced_np_arr = pca.transform(np_arr)
    return reduced_np_arr

### Isomap

In [None]:
from sklearn import manifold
iso = manifold.Isomap(n_neighbors=4, n_components=2)
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
iso.fit(df)

In [None]:
manifold = iso.transform(df)
print(df.shape)
print(manifold.shape)

In [None]:
manifold[0:3,:]

In [None]:
df.iloc[0:3,:]

### Train Split Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
data   = [0,1,2,3,4, 5,6,7,8,9]  # input dataframe samples
labels = [0,0,0,0,0, 1,1,1,1,1]  # the function we're training is " >4 "

In [None]:
data_train, data_test, label_train, label_test = train_test_split(data, labels, test_size=0.5, random_state=7)

In [None]:
data_train

In [None]:
label_train

In [None]:
data_test

In [None]:
label_test