# Import libraries

In [0]:
import numpy as np
import pandas as pd

# Data structures: 

## Overview
**Series**:	1D labeled homogeneous array, size immutable.

**DataFrame**: 2D labeled, size-mutable tabular structure with potentially heterogeneously typed columns.

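**Panel**: 3D labeled, size-mutable array. Note: Panel was deprecated in pandas 0.20 and removed in pandas 0.25, so the Panel section below only runs on older pandas versions.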

## pandas vs numpy
https://www.youtube.com/watch?v=CLoNO-XxNXU

## Note
DataFrame is the most widely used of these structures, so this tutorial focuses on it.

# Series

## Create a Series
`pandas.Series(data, index, dtype, copy)`


### By passing a numpy array

In [0]:
# Build a float Series from a NumPy array with an explicit integer index.
# numpy is imported once, in the import cell at the top of the notebook.
data = np.arange(3) * 3
index = np.arange(3)
# Keyword arguments make it obvious which value is the index and which the dtype.
s = pd.Series(data, index=index, dtype=float)
print(s)

0    0.0
1    3.0
2    6.0
dtype: float64


In [0]:
# A Series can hold arbitrary objects: here every element is a whole array,
# which is why the resulting dtype is `object`.
index = np.arange(3)
row = np.arange(9) * 2.0
# Repeat the array once per label so the data length matches the index
# length; a one-element list with a longer index is rejected by recent
# pandas releases instead of being broadcast.
data = [row] * len(index)
s = pd.Series(data, index=index)
print(s)

0    [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]
1    [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]
2    [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]
dtype: object


### By passing a dictionary

In [0]:
# Each dictionary key becomes an index label and each value is stored as a
# single element, so the Series ends up with dtype=object.
# The mapping is deliberately not named `dict`: that would shadow the
# built-in type for the rest of the kernel session.
values_by_label = {
    'A': 1,
    'B': pd.Series(np.arange(3).astype(float)),
    'C': np.array([3] * 3, dtype='int32'),
    'D': pd.Categorical(['maths', 'physics', 'chemistry']),
    'E': 'loop',
}
s = pd.Series(values_by_label)
print(s)

A                                                    1
B            0    0.0
1    1.0
2    2.0
dtype: float64
C                                            [3, 3, 3]
D    [maths, physics, chemistry]
Categories (3, obj...
E                                                 loop
dtype: object


### By passing a scalar value

In [0]:
# A scalar is repeated once for every label of the supplied index.
labels = [0, 1, 2]
s = pd.Series(100, index=labels)
print(s)

0    100
1    100
2    100
dtype: int64


## Access data

### With position

In [0]:
# Positional access. With an integer index, plain `s[1]` is a *label* lookup;
# `.iloc` is used throughout so every access below is unambiguously positional.
# (The labels 0..8 coincide with the positions, so the values are the same.)
s = pd.Series(np.arange(9) * 2, index=np.arange(9))
print(s)
print('\n at 1:\n', s.iloc[1])
print('\n first 3:\n', s.iloc[:3])
print('\n last 3:\n', s.iloc[-3:])
# The stop position is exclusive: 3:5 selects positions 3 and 4.
print('\n from 3 to 4:\n', s.iloc[3:5])

0     0
1     2
2     4
3     6
4     8
5    10
6    12
7    14
8    16
dtype: int64

 at 1:
 2

 first 3:
 0    0
1    2
2    4
dtype: int64

 last 3:
 6    12
7    14
8    16
dtype: int64

 from 3 to 4:
 3    6
4    8
dtype: int64


### With label

In [0]:
# Label-based access: a single label returns a scalar, a list of labels
# returns a sub-Series.
labels = list('abcde')
s = pd.Series(np.arange(5), index=labels)
print(s)
print('\n index c: \n', s['c'])
wanted = ['a', 'b', 'c']
print('\n index a, b, c: \n', s[wanted])

a    0
b    1
c    2
d    3
e    4
dtype: int64

 index c: 
 2

 index a, b, c: 
 a    0
b    1
c    2
dtype: int64


# DataFrame


## Create a DataFrame
`pandas.DataFrame(data, index, columns, dtype, copy)`

### By passing numpy arrays

In [0]:
# One-dimensional input becomes a single column named by `columns`.
columns = ['A']
index = np.arange(3)
data = 2 * np.arange(3)

df = pd.DataFrame(data=data, index=index, columns=columns)
print(df)

   A
0  0
1  2
2  4


In [0]:
# A 3x4 array maps to three rows and four named columns.
n_rows, n_cols = 3, 4
data = np.arange(n_rows * n_cols).reshape(n_rows, n_cols)
index = np.arange(n_rows)
columns = list('ABCD')

df = pd.DataFrame(data=data, index=index, columns=columns)
print(df)


   A  B   C   D
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


### By passing a dictionary

In [0]:
# Each key becomes a column. The scalar under 'A' is broadcast to every row;
# the array-like values all have length 3, which fixes the number of rows.
# The mapping is not named `dict`, so the built-in type stays usable.
column_data = {
    'A': 1,
    'B': pd.Series(np.arange(3).astype(float)),
    'C': np.array([3] * 3, dtype='int32'),
    'D': pd.Categorical(['maths', 'physics', 'chemistry']),
    'E': ['go', 'went', 'gone'],
}
df = pd.DataFrame(column_data)
print(df)

   A    B  C          D     E
0  1  0.0  3      maths    go
1  1  1.0  3    physics  went
2  1  2.0  3  chemistry  gone


### By passing lists

In [0]:
# A flat list yields one column with the default label 0.
data = list(range(1, 6))
df = pd.DataFrame(data)
print(df)

   0
0  1
1  2
2  3
3  4
4  5


In [0]:
# Every inner list is one row; `columns` names the positions within a row.
data = [
    ['Maths', 4.5],
    ['Physics', 4.0],
    ['Chemistry', 4.3],
]
column_names = ['Subject', 'Mark']
df = pd.DataFrame(data, columns=column_names)
print(df)

     Subject  Mark
0      Maths   4.5
1    Physics   4.0
2  Chemistry   4.3


## Select columns

In [0]:
# Sample frame for the column-selection demo. The mapping is not named
# `dict`, so the built-in type is not shadowed.
column_data = {
    'A': 1,
    'B': pd.Series(np.arange(3).astype(float)),
    'C': np.array([3] * 3, dtype='int32'),
    'D': pd.Categorical(['maths', 'physics', 'chemistry']),
    'E': ['go', 'went', 'gone'],
}
df = pd.DataFrame(column_data)
print(df)
# A single column name returns a Series ...
print('\n at B:\n', df['B'])
# ... while a list of names returns a DataFrame.
print('\n at D & E:\n', df[['D', 'E']])

   A    B  C          D     E
0  1  0.0  3      maths    go
1  1  1.0  3    physics  went
2  1  2.0  3  chemistry  gone

 at B:
 0    0.0
1    1.0
2    2.0
Name: B, dtype: float64

 at D & E:
            D     E
0      maths    go
1    physics  went
2  chemistry  gone


## Add a column

In [0]:
# Sample frame for the add-column demo. The mapping is not named `dict`,
# so the built-in type is not shadowed.
column_data = {
    'A': 1,
    'B': pd.Series(np.arange(3).astype(float)),
    'C': np.array([3] * 3, dtype='int32'),
    'D': pd.Categorical(['maths', 'physics', 'chemistry']),
    'E': ['go', 'went', 'gone'],
}
df = pd.DataFrame(column_data)
print(df)

# Assigning to a new key appends a column; the list needs one value per row.
df['F'] = [5, 7, 9]
print('\n New: \n', df)

   A    B  C          D     E
0  1  0.0  3      maths    go
1  1  1.0  3    physics  went
2  1  2.0  3  chemistry  gone

 New: 
    A    B  C          D     E  F
0  1  0.0  3      maths    go  5
1  1  1.0  3    physics  went  7
2  1  2.0  3  chemistry  gone  9


## Delete column

In [0]:
# Sample frame for the delete-column demo. The mapping is not named `dict`,
# so the built-in type is not shadowed.
column_data = {
    'A': 1,
    'B': pd.Series(np.arange(3).astype(float)),
    'C': np.array([3] * 3, dtype='int32'),
    'D': pd.Categorical(['maths', 'physics', 'chemistry']),
    'E': ['go', 'went', 'gone'],
}
df = pd.DataFrame(column_data)
print(df)

# `del` removes the column in place.
del df['C']
print('\n delete C\n', df)

# `pop` also removes the column in place; its return value (the removed
# column) is intentionally discarded here.
df.pop('D')
print('\n pop D\n', df)

   A    B  C          D     E
0  1  0.0  3      maths    go
1  1  1.0  3    physics  went
2  1  2.0  3  chemistry  gone

 delete C
    A    B          D     E
0  1  0.0      maths    go
1  1  1.0    physics  went
2  1  2.0  chemistry  gone

 pop D
    A    B     E
0  1  0.0    go
1  1  1.0  went
2  1  2.0  gone


## Select rows

In [0]:
# Sample frame for the row-selection demo. The mapping is not named `dict`,
# so the built-in type is not shadowed.
# NOTE: 'B' is a Series carrying its own 0..2 index. The frame is built with
# string labels, so 'B' is aligned by label, nothing matches, and the whole
# column is NaN in the printed result.
column_data = {
    'A': 1,
    'B': pd.Series(np.arange(3).astype(float)),
    'C': np.array([3] * 3, dtype='int32'),
    'D': pd.Categorical(['maths', 'physics', 'chemistry']),
    'E': ['go', 'went', 'gone'],
}
df = pd.DataFrame(column_data, index=['label0', 'label1', 'label2'])
print(df)

# .loc selects by label: first a whole row, then a single cell.
print('\n at label1\n', df.loc['label1'])

print('\n at label2 of D\n', df.loc['label2', 'D'])

# .iloc selects by integer position.
print('\n at 2\n', df.iloc[2])

# Positional row slice, spelled with .iloc rather than the ambiguous df[0:2].
print('\n at 0 & 1\n', df.iloc[0:2])

        A   B  C          D     E
label0  1 NaN  3      maths    go
label1  1 NaN  3    physics  went
label2  1 NaN  3  chemistry  gone

 at label1
 A          1
B        NaN
C          3
D    physics
E       went
Name: label1, dtype: object

 at label2 of D
 chemistry

 at 2
 A            1
B          NaN
C            3
D    chemistry
E         gone
Name: label2, dtype: object

 at 0 & 1
         A   B  C        D     E
label0  1 NaN  3    maths    go
label1  1 NaN  3  physics  went


## Add rows

In [0]:
# Sample frame for the add-rows demo. The mapping is not named `dict`,
# so the built-in type is not shadowed.
# NOTE: 'B' carries its own 0..2 index, which does not match the string
# labels below, so the first three rows of 'B' are NaN.
column_data = {
    'A': 1,
    'B': pd.Series(np.arange(3).astype(float)),
    'C': np.array([3] * 3, dtype='int32'),
    'D': pd.Categorical(['maths', 'physics', 'chemistry']),
    'E': ['go', 'went', 'gone'],
}
df = pd.DataFrame(column_data, index=['a', 'b', 'c'])

# One extra row of random floats; it keeps its default label 0.
dfx = pd.DataFrame(np.random.rand(5).reshape(1, 5), columns=['A', 'B', 'C', 'D', 'E'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames row-wise and gives the same result.
df = pd.concat([df, dfx])
print(df)

          A         B         C          D         E
a  1.000000       NaN  3.000000      maths        go
b  1.000000       NaN  3.000000    physics      went
c  1.000000       NaN  3.000000  chemistry      gone
0  0.339898  0.815616  0.355668  0.0611391  0.454416


## Delete rows

In [0]:
# Sample frame for the delete-rows demo. The mapping is not named `dict`,
# so the built-in type is not shadowed.
# NOTE: 'B' carries its own 0..2 index, which does not match the string
# labels below, so the 'B' column is NaN in the printed frames.
column_data = {
    'A': 1,
    'B': pd.Series(np.arange(3).astype(float)),
    'C': np.array([3] * 3, dtype='int32'),
    'D': pd.Categorical(['maths', 'physics', 'chemistry']),
    'E': ['go', 'went', 'gone'],
}
df = pd.DataFrame(column_data, index=['a', 'b', 'c'])
print(df)
# drop() returns a new frame without the listed row labels.
df = df.drop(['a', 'b'])
print('\n new\n', df)

   A   B  C          D     E
a  1 NaN  3      maths    go
b  1 NaN  3    physics  went
c  1 NaN  3  chemistry  gone

 new
    A   B  C          D     E
c  1 NaN  3  chemistry  gone


# Panel

## Create Panel
pandas.Panel(data, items, major_axis, minor_axis, dtype, copy)

### By passing 3D ndarray

In [0]:
# Wrap a random 2x3x4 ndarray in a Panel. No labels are given, so the printed
# summary shows default integer ranges on the items, major and minor axes.
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in 0.25 —
# confirm the pandas version this notebook is meant to run on.
data = np.random.rand(2,3,4)
pn = pd.Panel(data)
print(pn)

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 3 (major_axis) x 4 (minor_axis)
Items axis: 0 to 1
Major_axis axis: 0 to 2
Minor_axis axis: 0 to 3


### By passing dictionary of DataFrame objects

In [0]:
# Build a Panel from a dict of DataFrames: the keys 'A' and 'B' become the
# items axis. The frames have different shapes (3x4 and 2x3); the printed
# dimensions are 2 x 3 x 4, i.e. the result spans the larger of the two.
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in 0.25 —
# confirm the pandas version this notebook is meant to run on.
data = {'A': pd.DataFrame(np.arange(12).reshape(3,4)),
        'B': pd.DataFrame(np.random.rand(2,3))}
pn = pd.Panel(data)
print(pn)

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 3 (major_axis) x 4 (minor_axis)
Items axis: A to B
Major_axis axis: 0 to 2
Minor_axis axis: 0 to 3


## Select data from Panel

In [0]:
# Four differently shaped frames combined into one Panel; cells that a frame
# does not cover show up as NaN in the selections printed below.
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in 0.25 —
# confirm the pandas version this notebook is meant to run on.
data = {'A': pd.DataFrame(np.arange(12).reshape(3,4).astype(float)),
        'B': pd.DataFrame(np.random.rand(2,3)),
        'C': pd.DataFrame(np.random.rand(3,3)),
        'D': pd.DataFrame(np.random.rand(1,4))}
pn = pd.Panel(data)
print(pn)
# A single item key prints as a DataFrame.
print('\n item A\n', pn['A'])
# A list of item keys prints as a (smaller) Panel.
print('\n item B & C\n', pn[['B', 'C']])
# Cross-section at major-axis label 1: one column per item in the output.
print('\n major_axis 1\n', pn.major_xs(1))
# Cross-section at minor-axis label 3: one column per item in the output.
print('\n minor_axis 3\n', pn.minor_xs(3))

<class 'pandas.core.panel.Panel'>
Dimensions: 4 (items) x 3 (major_axis) x 4 (minor_axis)
Items axis: A to D
Major_axis axis: 0 to 2
Minor_axis axis: 0 to 3

 item A
      0    1     2     3
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0

 item B & C
 <class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 3 (major_axis) x 4 (minor_axis)
Items axis: B to C
Major_axis axis: 0 to 2
Minor_axis axis: 0 to 3

 major_axis 1
      A         B         C   D
0  4.0  0.591257  0.126056 NaN
1  5.0  0.772519  0.633301 NaN
2  6.0  0.002772  0.150727 NaN
3  7.0       NaN       NaN NaN

 minor_axis 3
       A   B   C         D
0   3.0 NaN NaN  0.381286
1   7.0 NaN NaN       NaN
2  11.0 NaN NaN       NaN
