# Data Wrangling

## Some Pandas Basics
A Dataframe can be best seen as a collection of Series.

In [190]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import misc
import scipy.io.wavfile as wavfile

```python
#create DF from a file
from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:')

#read data from a SQL Database
sql_df   = pd.read_sql_table('my_table', engine, columns=['ColA', 'ColB'])

xls_df   = pd.read_excel('my_dataset.xlsx', 'Sheet1', na_values=['NA', '?'])
json_df  = pd.read_json('my_dataset.json', orient='columns')
csv_df   = pd.read_csv('my_dataset.csv', sep=',')

#loads all HTML tables into a list of DataFrames
html_df = pd.read_html('http://page.com/with/table.html')[0]

#writing dataframe back to disk
sql_df.to_sql('table', engine)
xls_df.to_excel('dataset.xlsx')
json_df.to_json('dataset.json')
csv_df.to_csv('dataset.csv')
```

In [92]:
#create df from dicts
df2 = pd.DataFrame({ 'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo' })
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [93]:
#create DF from Random data
my_df = pd.DataFrame(np.random.randn(10,4))
my_df

Unnamed: 0,0,1,2,3
0,-1.353916,-0.956077,1.91878,0.318696
1,0.946857,-1.589318,1.863635,1.136637
2,-0.156633,-2.278528,0.430331,1.109732
3,0.79643,-0.751505,-0.068205,0.549385
4,0.610282,-0.160709,0.352771,1.405675
5,-0.244197,1.042836,-1.829334,0.322587
6,0.485236,0.601111,1.497022,-0.133032
7,0.074969,0.10606,0.193874,0.529798
8,0.996205,-2.159927,0.110629,0.415107
9,-1.328051,-0.044505,-1.018048,-0.379998


In [94]:
#View columns
my_df.columns

RangeIndex(start=0, stop=4, step=1)

In [95]:
#rename columns
my_df.columns = ['c1', 'c2', 'c3', 'c4']
my_df.columns

Index(['c1', 'c2', 'c3', 'c4'], dtype='object')

In [96]:
#head of data
my_df.head(3)

Unnamed: 0,c1,c2,c3,c4
0,-1.353916,-0.956077,1.91878,0.318696
1,0.946857,-1.589318,1.863635,1.136637
2,-0.156633,-2.278528,0.430331,1.109732


In [97]:
#tail of data
my_df.tail(3)

Unnamed: 0,c1,c2,c3,c4
7,0.074969,0.10606,0.193874,0.529798
8,0.996205,-2.159927,0.110629,0.415107
9,-1.328051,-0.044505,-1.018048,-0.379998


In [98]:
#summary statistics
my_df.describe()

Unnamed: 0,c1,c2,c3,c4
count,10.0,10.0,10.0,10.0
mean,0.082718,-0.619056,0.345146,0.527459
std,0.865982,1.13124,1.199614,0.561723
min,-1.353916,-2.278528,-1.829334,-0.379998
25%,-0.222306,-1.431007,-0.023496,0.319669
50%,0.280102,-0.456107,0.273322,0.472453
75%,0.749893,0.068419,1.230349,0.969645
max,0.996205,1.042836,1.91878,1.405675


In [99]:
#view indices
my_df.index

RangeIndex(start=0, stop=10, step=1)

In [100]:
#view df types
#objects are strings
my_df.dtypes

c1    float64
c2    float64
c3    float64
c4    float64
dtype: object

In [101]:
#transpose dataframe
my_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
c1,-1.353916,0.946857,-0.156633,0.79643,0.610282,-0.244197,0.485236,0.074969,0.996205,-1.328051
c2,-0.956077,-1.589318,-2.278528,-0.751505,-0.160709,1.042836,0.601111,0.10606,-2.159927,-0.044505
c3,1.91878,1.863635,0.430331,-0.068205,0.352771,-1.829334,1.497022,0.193874,0.110629,-1.018048
c4,0.318696,1.136637,1.109732,0.549385,1.405675,0.322587,-0.133032,0.529798,0.415107,-0.379998


In [102]:
my_df.sort_values(by='c1')

Unnamed: 0,c1,c2,c3,c4
0,-1.353916,-0.956077,1.91878,0.318696
9,-1.328051,-0.044505,-1.018048,-0.379998
5,-0.244197,1.042836,-1.829334,0.322587
2,-0.156633,-2.278528,0.430331,1.109732
7,0.074969,0.10606,0.193874,0.529798
6,0.485236,0.601111,1.497022,-0.133032
4,0.610282,-0.160709,0.352771,1.405675
3,0.79643,-0.751505,-0.068205,0.549385
1,0.946857,-1.589318,1.863635,1.136637
8,0.996205,-2.159927,0.110629,0.415107


In [103]:
#slicing and dicing

#returns series
my_df.c1
#returns series
my_df['c1']

#returns dataframe
my_df[['c1']]

#loc is inclusive of end value range
#returns series
my_df.loc[:, 'c1']
#returns dataframe
my_df.loc[:, ['c1']]

#iloc is exclusive of end value range
#returns series
my_df.iloc[:, 0]
#returns dataframe
my_df.iloc[:, [0]]

my_df[0:2]
my_df.iloc[0:2, :]

print(type(my_df.c1))
print(type(my_df['c1']))
print(type(my_df[['c1']]))
print(type(my_df.loc[:, 'c1']))
print(type(my_df.loc[:, ['c1']]))
print(type(my_df.iloc[:, 0]))
print(type(my_df.iloc[:, [0]]))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [104]:
#Boolean indexing
#Can be further combined using bitwise operators
bool_df = my_df[(my_df.c1 > 1) & (my_df.c2 < 1)]
bool_df

Unnamed: 0,c1,c2,c3,c4


In [105]:
#writing to a slice
#be sure to write data on a per column basis because of homogeneous column datatypes
my_df.loc[my_df.c1 < 1, ['c4']] = 1
my_df

Unnamed: 0,c1,c2,c3,c4
0,-1.353916,-0.956077,1.91878,1.0
1,0.946857,-1.589318,1.863635,1.0
2,-0.156633,-2.278528,0.430331,1.0
3,0.79643,-0.751505,-0.068205,1.0
4,0.610282,-0.160709,0.352771,1.0
5,-0.244197,1.042836,-1.829334,1.0
6,0.485236,0.601111,1.497022,1.0
7,0.074969,0.10606,0.193874,1.0
8,0.996205,-2.159927,0.110629,1.0
9,-1.328051,-0.044505,-1.018048,1.0


In [106]:
#using isin method for filtering
my_df2 = my_df.copy()
my_df2['E'] = ['one', 'one','two','three','four','three','five','six','seven','eight']
my_df2[my_df2['E'].isin(['two','four'])] = np.nan
my_df2

Unnamed: 0,c1,c2,c3,c4,E
0,-1.353916,-0.956077,1.91878,1.0,one
1,0.946857,-1.589318,1.863635,1.0,one
2,,,,,
3,0.79643,-0.751505,-0.068205,1.0,three
4,,,,,
5,-0.244197,1.042836,-1.829334,1.0,three
6,0.485236,0.601111,1.497022,1.0,five
7,0.074969,0.10606,0.193874,1.0,six
8,0.996205,-2.159927,0.110629,1.0,seven
9,-1.328051,-0.044505,-1.018048,1.0,eight


In [107]:
#concat dataframes
df3 = pd.DataFrame(np.random.randn(10, 4))
pieces = [df3[:3], df3[3:7], df3[7:]]
#you can also concat on axis=1 which is columns instead of rows
pd.concat(pieces, ignore_index=False)

Unnamed: 0,0,1,2,3
0,-0.374462,-0.36337,-0.194131,0.211513
1,0.194068,1.163646,1.1043,0.886834
2,-1.161996,-1.170085,0.954889,1.667097
3,1.099197,0.340988,0.707699,-0.199392
4,-0.222015,-0.382282,2.113857,0.889638
5,-0.230541,-0.085252,1.320642,0.441078
6,-0.684889,1.834033,0.259828,-0.430723
7,0.00277,-1.774921,0.536611,0.604318
8,0.246562,0.287036,0.286153,1.261558
9,-1.020751,-2.004812,0.422095,-1.044094


In [108]:
#Append rows
df4 = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
s4 = df4.iloc[3]
df4.append(s4, ignore_index=False)
#you can also append dataframe to conconcat them on rows

Unnamed: 0,A,B,C,D
0,-0.243195,-0.278707,-1.130894,-1.738669
1,-0.458775,-0.487703,0.315612,0.442408
2,2.047365,-0.714572,-0.637538,-1.764341
3,1.015748,-0.097178,0.263644,0.44196
4,-1.65205,0.344966,0.12124,0.488373
5,0.649349,-0.530617,0.514445,-0.790895
6,0.229526,1.397136,0.767916,-1.304029
7,0.21299,0.028431,1.043233,-0.011836
3,1.015748,-0.097178,0.263644,0.44196


In [109]:
#joining
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


## Drop duplicates

In [110]:
my_df2.drop_duplicates()

Unnamed: 0,c1,c2,c3,c4,E
0,-1.353916,-0.956077,1.91878,1.0,one
1,0.946857,-1.589318,1.863635,1.0,one
2,,,,,
3,0.79643,-0.751505,-0.068205,1.0,three
5,-0.244197,1.042836,-1.829334,1.0,three
6,0.485236,0.601111,1.497022,1.0,five
7,0.074969,0.10606,0.193874,1.0,six
8,0.996205,-2.159927,0.110629,1.0,seven
9,-1.328051,-0.044505,-1.018048,1.0,eight


## Detecting and replacing missing values 

In [111]:
#get boolean values where dataframe is null
my_df2.isnull()

Unnamed: 0,c1,c2,c3,c4,E
0,False,False,False,False,False
1,False,False,False,False,False
2,True,True,True,True,True
3,False,False,False,False,False
4,True,True,True,True,True
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [112]:
#drop missing data
my_df2.dropna()

Unnamed: 0,c1,c2,c3,c4,E
0,-1.353916,-0.956077,1.91878,1.0,one
1,0.946857,-1.589318,1.863635,1.0,one
3,0.79643,-0.751505,-0.068205,1.0,three
5,-0.244197,1.042836,-1.829334,1.0,three
6,0.485236,0.601111,1.497022,1.0,five
7,0.074969,0.10606,0.193874,1.0,six
8,0.996205,-2.159927,0.110629,1.0,seven
9,-1.328051,-0.044505,-1.018048,1.0,eight


In [113]:
#filling missing datab
my_df2.fillna(value=3)

Unnamed: 0,c1,c2,c3,c4,E
0,-1.353916,-0.956077,1.91878,1.0,one
1,0.946857,-1.589318,1.863635,1.0,one
2,3.0,3.0,3.0,3.0,3
3,0.79643,-0.751505,-0.068205,1.0,three
4,3.0,3.0,3.0,3.0,3
5,-0.244197,1.042836,-1.829334,1.0,three
6,0.485236,0.601111,1.497022,1.0,five
7,0.074969,0.10606,0.193874,1.0,six
8,0.996205,-2.159927,0.110629,1.0,seven
9,-1.328051,-0.044505,-1.018048,1.0,eight


In [114]:
#forward fill
my_df2.ffill()

Unnamed: 0,c1,c2,c3,c4,E
0,-1.353916,-0.956077,1.91878,1.0,one
1,0.946857,-1.589318,1.863635,1.0,one
2,0.946857,-1.589318,1.863635,1.0,one
3,0.79643,-0.751505,-0.068205,1.0,three
4,0.79643,-0.751505,-0.068205,1.0,three
5,-0.244197,1.042836,-1.829334,1.0,three
6,0.485236,0.601111,1.497022,1.0,five
7,0.074969,0.10606,0.193874,1.0,six
8,0.996205,-2.159927,0.110629,1.0,seven
9,-1.328051,-0.044505,-1.018048,1.0,eight


In [115]:
#backward fill
my_df2.bfill()

Unnamed: 0,c1,c2,c3,c4,E
0,-1.353916,-0.956077,1.91878,1.0,one
1,0.946857,-1.589318,1.863635,1.0,one
2,0.79643,-0.751505,-0.068205,1.0,three
3,0.79643,-0.751505,-0.068205,1.0,three
4,-0.244197,1.042836,-1.829334,1.0,three
5,-0.244197,1.042836,-1.829334,1.0,three
6,0.485236,0.601111,1.497022,1.0,five
7,0.074969,0.10606,0.193874,1.0,six
8,0.996205,-2.159927,0.110629,1.0,seven
9,-1.328051,-0.044505,-1.018048,1.0,eight


In [116]:
#interpolate
my_df2.interpolate()

Unnamed: 0,c1,c2,c3,c4,E
0,-1.353916,-0.956077,1.91878,1.0,one
1,0.946857,-1.589318,1.863635,1.0,one
2,0.871644,-1.170411,0.897715,1.0,
3,0.79643,-0.751505,-0.068205,1.0,three
4,0.276117,0.145665,-0.948769,1.0,
5,-0.244197,1.042836,-1.829334,1.0,three
6,0.485236,0.601111,1.497022,1.0,five
7,0.074969,0.10606,0.193874,1.0,six
8,0.996205,-2.159927,0.110629,1.0,seven
9,-1.328051,-0.044505,-1.018048,1.0,eight


## Detecting outliers

In [117]:
#use boolean indexing

## Data Conversions

In [118]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [119]:
df2['D'] = df2['D'].astype(dtype='object')
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D            object
E          category
F            object
dtype: object

## Categoricals

In [120]:
df3 = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})

In [121]:
df3["grade"] = df3["raw_grade"].astype("category")
df3["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

In [122]:
df3["grade"].cat.categories = ["very good", "good", "very bad"]
df3["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad]

In [123]:
df3["grade"] = df3["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df3["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]

In [124]:
df3.sort_values(by='grade')

Unnamed: 0,id,raw_grade,grade
5,6,e,very bad
1,2,b,good
2,3,b,good
0,1,a,very good
3,4,a,very good
4,5,a,very good


In [125]:
df3.groupby('grade').size()

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

### Ordered categories

In [126]:
ordered_satisfaction = ['very bad', 'bad','neutral','good','very good']
satis_df = pd.DataFrame({"satisfactions":['mad','good','neutral','very good','bad']})
satis_df['satisfactions2'] = satis_df['satisfactions'].astype('category', ordered=True, categories=ordered_satisfaction)
satis_df

Unnamed: 0,satisfactions,satisfactions2
0,mad,
1,good,good
2,neutral,neutral
3,very good,very good
4,bad,bad


In [127]:
satis_df.dtypes

satisfactions       object
satisfactions2    category
dtype: object

### Category codes

In [128]:
satis_df['satisfactions2'].cat.codes

0   -1
1    3
2    2
3    4
4    1
dtype: int8

### Converting to categorical indicators (one hot encoding)

In [129]:
pd.get_dummies(ordered_satisfaction)

Unnamed: 0,bad,good,neutral,very bad,very good
0,0,0,0,1,0
1,1,0,0,0,0
2,0,0,1,0,0
3,0,1,0,0,0
4,0,0,0,0,1


In [130]:
satis_dummy_df = pd.get_dummies(satis_df, columns=['satisfactions'])
satis_dummy_df

Unnamed: 0,satisfactions2,satisfactions_bad,satisfactions_good,satisfactions_mad,satisfactions_neutral,satisfactions_very good
0,,0,0,1,0,0
1,good,0,1,0,0,0
2,neutral,0,0,0,1,0
3,very good,0,0,0,0,1
4,bad,1,0,0,0,0


## Text to Features
### Bag of Words

In [131]:
corpus = ["Think of your machine learning models as if they were children who have absolutely no knowledge",
          "except what you train them with;",
          "what information would they need to know to make the right decisions?"]

In [132]:
bow = CountVectorizer()
X = bow.fit_transform(corpus)
X

<3x31 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [133]:
bow.vocabulary_
bow.get_feature_names()

['absolutely',
 'as',
 'children',
 'decisions',
 'except',
 'have',
 'if',
 'information',
 'know',
 'knowledge',
 'learning',
 'machine',
 'make',
 'models',
 'need',
 'no',
 'of',
 'right',
 'the',
 'them',
 'they',
 'think',
 'to',
 'train',
 'were',
 'what',
 'who',
 'with',
 'would',
 'you',
 'your']

In [134]:
bow.stop_words_
bow.get_stop_words()

In [135]:
X.toarray()
X.todense()

matrix([[1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,
         0, 0, 1, 0, 1, 0, 0, 0, 1],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         0, 1, 0, 1, 0, 1, 0, 1, 0],
        [0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
         2, 0, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)

In [136]:
bow.inverse_transform(X)

[array(['knowledge', 'no', 'absolutely', 'have', 'who', 'children', 'were',
        'they', 'if', 'as', 'models', 'learning', 'machine', 'your', 'of',
        'think'],
       dtype='<U11'), array(['with', 'them', 'train', 'you', 'what', 'except'],
       dtype='<U11'), array(['decisions', 'right', 'the', 'make', 'know', 'to', 'need', 'would',
        'information', 'what', 'they'],
       dtype='<U11')]

## Images to Features

In [158]:
img_rgb = misc.imread(name='CapLogo.png',flatten=False,mode='RGB')
type(img)

numpy.ndarray

In [159]:
img_rgb.shape

(115, 300, 3)

In [160]:
img_rgb.dtype

dtype('uint8')

In [198]:
#resample image if it is too big
#every 2 pixels close to each other are heavily correlated
img_rgb = img_rgb[::2, ::2]
img_rgb.shape

(29, 75, 3)

In [181]:
img_gray = misc.imread(name='CapLogo.png',flatten=True,mode='RGB')
type(img_gray)

numpy.ndarray

In [182]:
img_gray.shape

(115, 300)

In [183]:
img_gray.dtype

dtype('float32')

In [188]:
#normalize values between 0 and 1
img_gray = (img_gray/255.).reshape(-1,)
img_gray

array([  6.03086363e-08,   6.03086363e-08,   6.03086363e-08, ...,
         6.03086363e-08,   6.03086363e-08,   6.03086363e-08], dtype=float32)

In [189]:
img_gray.shape

(34500,)

## Audio to Features
**Be sure to have _same sample rates_ when comparing wav files!!!**

In [192]:
sample_rate, audio_data = wavfile.read('test.wav')
sample_rate

44100

In [193]:
audio_data

array([     0,   1221,   2440, ..., -31277, -30892, -30463], dtype=int16)

In [195]:
type(sample_rate)

int

In [197]:
audio_data.shape

(22050,)

## Normalizing Data Sets

In [137]:
#use sklearn preprocessing of numpy arrays
#so remember to convert to numpy arrays!

## Feature Engineering