# Getting Started with pandas

In [1]:
import numpy as np

In [2]:
import pandas as pd
from icecream import ic

## Introduction to Pandas Data Structures

### Series
A Series is a one-dimensional array-like object containing a sequence of values (of
similar types to NumPy types) and an associated array of data labels, called its index.
The simplest Series is formed from only an array of data:

In [3]:
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

0    4
1    7
2   -5
3    3
dtype: int64

In [None]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

array([ 4,  7, -5,  3], dtype=int64)

In [None]:
obj.index

RangeIndex(start=0, stop=4, step=1)

RangeIndex(start=0, stop=4, step=1)

In [None]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

d    4
b    7
a   -5
c    3
dtype: int64

In [None]:
obj2['a']

-5

-5

In [None]:
obj2[1]

7

7

Using NumPy functions or NumPy-like operations, such as filtering with a boolean
array, scalar multiplication, or applying math functions, will preserve the index-value
link:

In [None]:
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

d    4
b    7
c    3
dtype: int64

In [None]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [None]:
F1=pd.Series([1,2,3],index=["a","b","c"])
F2=pd.Series([2,3,4],index=["a","b","d"])

In [None]:
F3=F1+F2
F3

a    3.0
b    5.0
c    NaN
d    NaN
dtype: float64

a    3.0
b    5.0
c    NaN
d    NaN
dtype: float64

In [None]:
F4=F3.isnull()
F4.count()

4

4

Another way to think about a Series is as a fixed-length, ordered dict, as it is a map‐
ping of index values to data values.

In [None]:
'b' in obj2

True

True

### Creating Series from Dictionary

In [None]:
sdata = {'Ohio': 35000, 
         'Texas': 71000, 
         'Oregon': 16000, 
         'Utah': 5000,
        'India': np.nan}
obj3 = pd.Series(sdata)
obj3

Ohio      35000.0
Texas     71000.0
Oregon    16000.0
Utah       5000.0
India         NaN
dtype: float64

Ohio      35000.0
Texas     71000.0
Oregon    16000.0
Utah       5000.0
India         NaN
dtype: float64

In [None]:
obj3.index

Index(['Ohio', 'Texas', 'Oregon', 'Utah', 'India'], dtype='object')

Index(['Ohio', 'Texas', 'Oregon', 'Utah', 'India'], dtype='object')

In [None]:
obj3.values

array([35000., 71000., 16000.,  5000.,    nan])

array([35000., 71000., 16000.,  5000.,    nan])

In [None]:
sindex=['Oregon', 'Utah','India','Ohio', 'Texas']   #Index Sequence Changed, Observe the mssing value
obj4=pd.Series(sdata,index=sindex)
obj4

Oregon    16000.0
Utah       5000.0
India         NaN
Ohio      35000.0
Texas     71000.0
dtype: float64

Oregon    16000.0
Utah       5000.0
India         NaN
Ohio      35000.0
Texas     71000.0
dtype: float64

### Creating Series from Lists

In [None]:
tdata=[12,14,16]
tind=["a","b","c"]
obj5=pd.Series(tdata,index=tind)
obj5

a    12
b    14
c    16
dtype: int64

a    12
b    14
c    16
dtype: int64

In [None]:
obj6=pd.Series({"a":23,"b":34,"c":45,"d":34})
obj6

a    23
b    34
c    45
d    34
dtype: int64

a    23
b    34
c    45
d    34
dtype: int64

In [None]:
obj5+obj6

a    35.0
b    48.0
c    61.0
d     NaN
dtype: float64

a    35.0
b    48.0
c    61.0
d     NaN
dtype: float64

In [None]:
obj6-obj5

a    11.0
b    20.0
c    29.0
d     NaN
dtype: float64

a    11.0
b    20.0
c    29.0
d     NaN
dtype: float64

## DataFrame
A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.). The DataFrame has both a row and column index; it can be thought of
as a dict of Series all sharing the same index. 

In [None]:
# Constructing Data Frame using Dictionary

data = {'State': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'Year': [2000, 2001, 2002, 2001, 2002, 2003],
        'Pop': [1.5, 1.7, np.nan, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,State,Year,Pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


Unnamed: 0,State,Year,Pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [None]:
# head method selects only the first five rows:
frame.head()

Unnamed: 0,State,Year,Pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,
3,Nevada,2001,2.4
4,Nevada,2002,2.9


Unnamed: 0,State,Year,Pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [None]:
# Rearrangement of Columns
pd.DataFrame(data, columns=['State', 'Pop','Year'])

Unnamed: 0,State,Pop,Year
0,Ohio,1.5,2000
1,Ohio,1.7,2001
2,Ohio,,2002
3,Nevada,2.4,2001
4,Nevada,2.9,2002
5,Nevada,3.2,2003


Unnamed: 0,State,Pop,Year
0,Ohio,1.5,2000
1,Ohio,1.7,2001
2,Ohio,,2002
3,Nevada,2.4,2001
4,Nevada,2.9,2002
5,Nevada,3.2,2003


In [None]:
# If you pass a column that isn’t contained in the dict, it will appear with missing values in the result:
frame4=pd.DataFrame(data, columns=['Year', 'State', 'Pop',"Extra"])

In [None]:
frame4["Extra"]=pd.Series([1,2,3,4,5,6],index=[2,3,1,0,4,5])
frame4

Unnamed: 0,Year,State,Pop,Extra
0,2000,Ohio,1.5,4
1,2001,Ohio,1.7,3
2,2002,Ohio,,1
3,2001,Nevada,2.4,2
4,2002,Nevada,2.9,5
5,2003,Nevada,3.2,6


Unnamed: 0,Year,State,Pop,Extra
0,2000,Ohio,1.5,4
1,2001,Ohio,1.7,3
2,2002,Ohio,,1
3,2001,Nevada,2.4,2
4,2002,Nevada,2.9,5
5,2003,Nevada,3.2,6


In [None]:
frame4.columns

Index(['Year', 'State', 'Pop', 'Extra'], dtype='object')

Index(['Year', 'State', 'Pop', 'Extra'], dtype='object')

In [None]:
frame4.index=["a","b","c","d","e","f"]
frame4

Unnamed: 0,Year,State,Pop,Extra
a,2000,Ohio,1.5,4
b,2001,Ohio,1.7,3
c,2002,Ohio,,1
d,2001,Nevada,2.4,2
e,2002,Nevada,2.9,5
f,2003,Nevada,3.2,6


Unnamed: 0,Year,State,Pop,Extra
a,2000,Ohio,1.5,4
b,2001,Ohio,1.7,3
c,2002,Ohio,,1
d,2001,Nevada,2.4,2
e,2002,Nevada,2.9,5
f,2003,Nevada,3.2,6


In [None]:
# index assignment
frame2=pd.DataFrame(data, columns=['Year', 'State', 'Pop'],index=["a","b","c","d","e","f"])
frame2

Unnamed: 0,Year,State,Pop
a,2000,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,
d,2001,Nevada,2.4
e,2002,Nevada,2.9
f,2003,Nevada,3.2


Unnamed: 0,Year,State,Pop
a,2000,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,
d,2001,Nevada,2.4
e,2002,Nevada,2.9
f,2003,Nevada,3.2


In [None]:
frame2.iloc[2,1]

'Ohio'

'Ohio'

In [None]:
# Accessing Particular Column
frame2['State']

a      Ohio
b      Ohio
c      Ohio
d    Nevada
e    Nevada
f    Nevada
Name: State, dtype: object

a      Ohio
b      Ohio
c      Ohio
d    Nevada
e    Nevada
f    Nevada
Name: State, dtype: object

In [None]:
frame2

Unnamed: 0,Year,State,Pop
a,2000,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,
d,2001,Nevada,2.4
e,2002,Nevada,2.9
f,2003,Nevada,3.2


Unnamed: 0,Year,State,Pop
a,2000,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,
d,2001,Nevada,2.4
e,2002,Nevada,2.9
f,2003,Nevada,3.2


In [None]:
# Accessing Particular Column using dot operator
frame2.State

a      Ohio
b      Ohio
c      Ohio
d    Nevada
e    Nevada
f    Nevada
Name: State, dtype: object

a      Ohio
b      Ohio
c      Ohio
d    Nevada
e    Nevada
f    Nevada
Name: State, dtype: object

In [None]:
set(frame2['state'])

KeyError: 'state'

KeyError: 'state'

In [None]:
frame3=pd.DataFrame(data, columns=['Year', 'State', 'Pop',"Extra"])
frame3["Extra"]

In [None]:
# When you are assigning lists or arrays to a column, the value’s length must match the length of the DataFrame. 
frame3["Extra"]=np.random.randn(6)
frame3

In [None]:
# If you assign a Series, its labels will be realigned exactly to the DataFrame’s index, inserting missing values in any holes:
val = pd.Series([-1.2, -1.5, -1.7], index=[0,2,4])
frame3["Extra"]=val
frame3

In [None]:
del frame3['Extra']
frame3

In [None]:
# nested dict of dicts
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame4=pd.DataFrame(pop)

# Outer dict keys as the columns and the inner keys as the row indices

frame4

In [None]:
# transpose the DataFrame
frame4.T

In [None]:
frame4

In [None]:
frame4.index=[2001, 2002, 2003]

In [None]:
frame4

Reindexing
An important method on pandas objects is reindex, which means to create a new
object with the data conformed to a new index.

In [None]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

In [None]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

In [None]:
obj3.reindex(range(6), method='ffill')

In [None]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
frame

In [None]:
np.arange(9).reshape((3, 3))

In [None]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

In [None]:
data=np.arange(16).reshape(4,4)

In [None]:
data[1]

In [None]:
frame6=pd.DataFrame(data,index=list(data[:,1]))
frame6

In [None]:
data