# Data Manipulation with Pandas

In [261]:
import pandas as pd
import numpy as np
%matplotlib inline

## Learning specifics of how DataFrames are built

In [86]:
area_dict = {'California': 163696, 'Texas': 268597, 'New York': 54556, 'Florida': 65755, 'Illinois': 57915}  # Land masses of each state in miles
pop_dict = {'California': 39.54, 'Texas': 28.3, 'New York': 19.85, 'Florida': 20.98, 'Illinois': 12.8}

In [87]:
area = pd.Series(area_dict)
pop = pd.Series(pop_dict)

In [88]:
states = pd.DataFrame({'Area': area, 'Population': pop})
states

Unnamed: 0,Area,Population
California,163696,39.54
Florida,65755,20.98
Illinois,57915,12.8
New York,54556,19.85
Texas,268597,28.3


In [89]:
pop.values

array([ 39.54,  20.98,  12.8 ,  19.85,  28.3 ])

In [90]:
pd.DataFrame(pop, columns=['population'])

Unnamed: 0,population
California,39.54
Florida,20.98
Illinois,12.8
New York,19.85
Texas,28.3


In [101]:
print(d)
print(pd.DataFrame(d))

pd.DataFrame({'a': {0: 0, 1: 1, 2: 2}, 'b': {0: 0, 1: 2, 2: 4}})

[{'a': 0, 'b': 0}, {'a': 1, 'b': 2}, {'a': 2, 'b': 4}]
   a  b
0  0  0
1  1  2
2  2  4


Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [137]:
[area_dict.values(), pop_dict.values()]

[dict_values([163696, 268597, 54556, 65755, 57915]),
 dict_values([39.54, 28.3, 19.85, 20.98, 12.8])]

In [None]:
pd.DataFrame([area.values(), pop.values()])

In [102]:
pd.DataFrame({'Area': area_dict, 'Population': pop_dict})

Unnamed: 0,Area,Population
California,163696,39.54
Florida,65755,20.98
Illinois,57915,12.8
New York,54556,19.85
Texas,268597,28.3


In [133]:
conc = np.concatenate((area.values[:, np.newaxis], pop.values[:, np.newaxis]), axis=1)
conc

array([[  1.63696000e+05,   3.95400000e+01],
       [  6.57550000e+04,   2.09800000e+01],
       [  5.79150000e+04,   1.28000000e+01],
       [  5.45560000e+04,   1.98500000e+01],
       [  2.68597000e+05,   2.83000000e+01]])

In [259]:
df = pd.DataFrame(conc, columns=['area', 'population'], index=[area.keys()])
df

Unnamed: 0,area,population
California,163696.0,39.54
Florida,65755.0,20.98
Illinois,57915.0,12.8
New York,54556.0,19.85
Texas,268597.0,28.3


In [147]:
ind = pd.Index([2, 3, 5, 7, 11])
(ind.size, ind.shape, ind.ndim, ind.dtype)


(5, (5,), 1, dtype('int64'))

## Get good at selecting / indexing

### With Series

In [170]:
n = np.arange(35).reshape(5, 7)
n

array([[ 0,  1,  2,  3,  4,  5,  6],
       [ 7,  8,  9, 10, 11, 12, 13],
       [14, 15, 16, 17, 18, 19, 20],
       [21, 22, 23, 24, 25, 26, 27],
       [28, 29, 30, 31, 32, 33, 34]])

In [205]:
df = pd.DataFrame(n)
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [233]:
s = pd.Series(np.arange(0, 1.25, .25), index=['a', 'b', 'c', 'd', 'e'])
s

a    0.00
b    0.25
c    0.50
d    0.75
e    1.00
dtype: float64

In [252]:
s[(s > .3) & (s < .8)]

c    0.50
d    0.75
dtype: float64

### With DataFrames

In [268]:
df.append(pd.DataFrame([[71362, 7.40]], columns=['area', 'population']))
df.index = df.index[:-1].append(pd.Index(['Washington']))
df

Unnamed: 0,area,population
California,163696.0,39.54
Florida,65755.0,20.98
Illinois,57915.0,12.8
New York,54556.0,19.85
Washington,268597.0,28.3


#### Can access Series columns two ways

In [281]:
df['area']  # Use this for assignment e.g. df['area'] = z to avoid weird bugs

California    163696.0
Florida        65755.0
Illinois       57915.0
New York       54556.0
Washington    268597.0
Name: area, dtype: float64

In [282]:
df.area

California    163696.0
Florida        65755.0
Illinois       57915.0
New York       54556.0
Washington    268597.0
Name: area, dtype: float64

In [284]:
df['area'] is df.area  # they're the same object

True

#### Adding new column

In [286]:
df['population'] * (10**6)

California    39540000.0
Florida       20980000.0
Illinois      12800000.0
New York      19850000.0
Washington    28300000.0
Name: population, dtype: float64

In [298]:
df['density'] = (df['population'] * (10**6)) / df['area']
df

Unnamed: 0,area,population,density
California,163696.0,39.54,241.545303
Florida,65755.0,20.98,319.063189
Illinois,57915.0,12.8,221.013554
New York,54556.0,19.85,363.846323
Washington,268597.0,28.3,105.362309


In [300]:
df.T

Unnamed: 0,California,Florida,Illinois,New York,Washington
area,163696.0,65755.0,57915.0,54556.0,268597.0
population,39.54,20.98,12.8,19.85,28.3
density,241.545303,319.063189,221.013554,363.846323,105.362309


In [305]:
df = df.rename(columns={'population': 'population (mil)'})
df

Unnamed: 0,area,population (mil),density
California,163696.0,39.54,241.545303
Florida,65755.0,20.98,319.063189
Illinois,57915.0,12.8,221.013554
New York,54556.0,19.85,363.846323
Washington,268597.0,28.3,105.362309


### Three main ways to access data

In [309]:
df.iloc[0]  # Accesses a row

area                163696.000000
population (mil)        39.540000
density                241.545303
Name: California, dtype: float64

In [310]:
df.loc['California']  # Accesses a row

area                163696.000000
population (mil)        39.540000
density                241.545303
Name: California, dtype: float64

In [315]:
df[0:3]  # Accesses rows

Unnamed: 0,area,population (mil),density
California,163696.0,39.54,241.545303
Florida,65755.0,20.98,319.063189
Illinois,57915.0,12.8,221.013554


In [329]:
df.index  # Access row index

Index(['California', 'Florida', 'Illinois', 'New York', 'Washington'], dtype='object')

In [334]:
df.index[0]  # Access specific string in row index

'California'

In [312]:
df['area']  # Accesses a column

California    163696.0
Florida        65755.0
Illinois       57915.0
New York       54556.0
Washington    268597.0
Name: area, dtype: float64

In [316]:
df.iloc[:, 0:2]  # Access columns

Unnamed: 0,area,population (mil)
California,163696.0,39.54
Florida,65755.0,20.98
Illinois,57915.0,12.8
New York,54556.0,19.85
Washington,268597.0,28.3


In [330]:
df.columns  # Access column index

Index(['area', 'population (mil)', 'density'], dtype='object')

In [333]:
df.columns[0]  # Access specific string

'area'

In [323]:
df.loc['California': 'Illinois', 'area': 'population (mil)']  # Subset both rows and columns by name

Unnamed: 0,area,population (mil)
California,163696.0,39.54
Florida,65755.0,20.98
Illinois,57915.0,12.8


In [324]:
df.iloc[0:3, 0:2]  # Subset rows and columns by index

Unnamed: 0,area,population (mil)
California,163696.0,39.54
Florida,65755.0,20.98
Illinois,57915.0,12.8


In [343]:
df.loc[df.population > 20]

Unnamed: 0,area,population,density
California,163696.0,39.54,241.545303
Florida,65755.0,20.98,319.063189
Washington,268597.0,28.3,105.362309


In [346]:
df.loc[['Washington', 'California']]

Unnamed: 0,area,population,density
Washington,268597.0,28.3,105.362309
California,163696.0,39.54,241.545303


In [353]:
df.loc[df.density > 110, ['area', 'population']]  # Boolean + fancy indexing

Unnamed: 0,area,population
California,163696.0,39.54
Florida,65755.0,20.98
Illinois,57915.0,12.8
New York,54556.0,19.85


In [357]:
df

Unnamed: 0,area,population,density
California,163696.0,39.54,241.545303
Florida,65755.0,20.98,319.063189
Illinois,57915.0,12.8,221.013554
New York,54556.0,19.85,363.846323
Washington,268597.0,28.3,105.362309


#### Indexing refers to columns, slicing refers to rows

In [385]:
df['area']  # Get area column

California    163696.0
Florida        65755.0
Illinois       57915.0
New York       54556.0
Washington    268597.0
Name: area, dtype: float64

In [387]:
df['California': 'New York']  # Get set of rows

Unnamed: 0,area,population,density
California,163696.0,39.54,241.545303
Florida,65755.0,20.98,319.063189
Illinois,57915.0,12.8,221.013554
New York,54556.0,19.85,363.846323


In [388]:
df[0:4]

Unnamed: 0,area,population,density
California,163696.0,39.54,241.545303
Florida,65755.0,20.98,319.063189
Illinois,57915.0,12.8,221.013554
New York,54556.0,19.85,363.846323
