# Pandas - Operations

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small demo frame: two integer columns (col2 contains the value 444 twice)
# and one string column.
df = pd.DataFrame(
    dict(
        col1=[1, 2, 3, 4],
        col2=[444, 555, 666, 444],
        col3=["abc", "def", "ghi", "xyz"],
    )
)

In [3]:
# preview the first rows (head shows at most 5 rows by default)
df.head(n=5)

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


## Finding Unique Values in a DataFrame

Unique values are the distinct values that occur in a dataset: each value is listed once, no matter how many times it appears.

In [4]:
# list every distinct value found in column 2 (each value reported once)
df.loc[:, "col2"].unique()

array([444, 555, 666])

In [6]:
# count how many distinct values column 2 contains
df.loc[:, "col2"].nunique()

3

In [7]:
# frequency of each distinct value in column 2
df.loc[:, "col2"].value_counts()

444    2
555    1
666    1
Name: col2, dtype: int64

In [8]:
# keep the frequency table around so it can be inspected in the next cell
value_counts_col2 = df.loc[:, "col2"].value_counts()

In [13]:
# value_counts() hands back a pandas Series
type(value_counts_col2)

pandas.core.series.Series

## Selection in a DataFrame

In [15]:
# conditional selection: keep only the rows whose col1 value is greater than 2
df.loc[df["col1"] > 2]

Unnamed: 0,col1,col2,col3
2,3,666,ghi
3,4,444,xyz


In [17]:
# combining conditions: both masks must be True for a row to be kept
# (element-wise AND of "col1 > 2" and "col2 == 444")
df.loc[df["col1"].gt(2) & df["col2"].eq(444)]

Unnamed: 0,col1,col2,col3
3,4,444,xyz


## Applying a custom function

In [19]:
# example function
def times2(x):
    """Return ``x`` multiplied by 2 (``x * 2``)."""
    doubled = x * 2
    return doubled

In [21]:
# applying a custom function to a column:
# apply calls times2 on each element of col1 and collects the results
df.loc[:, "col1"].apply(times2)

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

In [22]:
# applying built-in functions works the same way:
# len is called on each element of col3
df.loc[:, "col3"].apply(len)

0    3
1    3
2    3
3    3
Name: col3, dtype: int64

In [24]:
# apply is extremely powerful when combined with lambda expressions
df.loc[:, "col2"].apply(lambda value: value * 2)

0     888
1    1110
2    1332
3     888
Name: col2, dtype: int64

## Dropping values in a DataFrame

In [26]:
# dropping a column: the axis must point at the columns, otherwise drop
# looks for the label in the row index. The result is a new frame.
df.drop(labels="col1", axis="columns")

Unnamed: 0,col2,col3
0,444,abc
1,555,def
2,666,ghi
3,444,xyz


In [27]:
# pass inplace=True to modify df itself instead of getting a new frame back
df.drop(labels="col1", axis="columns", inplace=True)

In [28]:
# display df again: after the in-place drop, col1 is no longer part of it
df

Unnamed: 0,col2,col3
0,444,abc
1,555,def
2,666,ghi
3,444,xyz


## Getting Information about the DataFrame

In [29]:
# the column labels of the DataFrame
df.columns

Index(['col2', 'col3'], dtype='object')

In [30]:
# the row labels (index) of the DataFrame
df.index

RangeIndex(start=0, stop=4, step=1)

## Sorting and Ordering a DataFrame

In [32]:
# order the rows by the values in column 2 (ascending);
# the original index labels travel with their rows
df.sort_values("col2", ascending=True)

Unnamed: 0,col2,col3
0,444,abc
3,444,xyz
1,555,def
2,666,ghi


## Finding Null Values in a DataFrame

In [33]:
# boolean frame marking missing values (isna and isnull are aliases)
df.isna()

Unnamed: 0,col2,col3
0,False,False
1,False,False
2,False,False
3,False,False


## Pivot Table Method

In [34]:
# Toy data for the pivot table demo: three string key columns (A, B, C)
# and one numeric value column (D).
data = dict(
    A=["foo"] * 3 + ["bar"] * 3,
    B=["one", "one", "two", "two", "one", "one"],
    C=["x", "y"] * 3,
    D=[1, 3, 2, 5, 4, 1],
)
df = pd.DataFrame(data=data)

In [35]:
# display the frame that the pivot table will be built from
df

Unnamed: 0,A,B,C,D
0,foo,one,x,1
1,foo,one,y,3
2,foo,two,x,2
3,bar,two,y,5
4,bar,one,x,4
5,bar,one,y,1


In [37]:
# Build an Excel-style pivot table, i.e. reshape the frame around a
# multi-level row index:
#   - A and B together become the multi-level index
#   - each distinct value of C becomes a column
#   - the values of D fill the cells under the C columns (pivot_table
#     aggregates with the mean by default); A/B/C combinations that do
#     not occur in the data are shown as NaN
df.pivot_table(index=["A", "B"], columns=["C"], values="D")

Unnamed: 0_level_0,C,x,y
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,1.0
bar,two,,5.0
foo,one,1.0,3.0
foo,two,2.0,
