In [1]:
#NumPy provides efficient storage and manipulation of dense typed arrays
#typically used in numerical computing tasks
#Pandas builds on the Numpy array structure.
#The three fundamental Pandas data structures are 
# series, dataframe and index

#To get started with Pandas, you need to import the package

import pandas as pd  #pd is the common convention for representing Pandas

print(pd.__version__)  #print the version of Pandas

1.2.4


In [3]:
#Type the package name (or its alias, if you created one) followed by '.'
#and then press <Tab> to get a list of classes, methods, and attributes.
#For example, type:   pd.<Tab>
#A bare "pd." is not valid Python, so it must not be run as a cell
#(doing so raises a SyntaxError). The same list of names can be obtained
#programmatically with dir(); names starting with '_' are filtered out.
pd_public_names = [name for name in dir(pd) if not name.startswith('_')]
#Show only the first few names rather than dumping the whole list
pd_public_names[:10]

In [33]:
#Series
#A Series is a one-dimensional array of indexed data that can
#hold any data type
#A Series can be created from a list or an array
#Here the list holds the angles 0, 30, ..., 180 (steps of 30)
S1 = pd.Series(list(range(0, 181, 30)))
S1

0      0
1     30
2     60
3     90
4    120
5    150
6    180
dtype: int64

In [34]:
type(S1)

pandas.core.series.Series

In [35]:
S11 = pd.Series([0,30,60,90,'Angle 120','Angle 150','Angle 180'])
S11

0            0
1           30
2           60
3           90
4    Angle 120
5    Angle 150
6    Angle 180
dtype: object

In [36]:
type(S11)

pandas.core.series.Series

In [37]:
#Data can be accessed by the associated index via a square bracket
S1[0]

0

In [38]:
S1[3]

90

In [39]:
#A Series exposes its index and its values through the index and values attributes
S1.index

RangeIndex(start=0, stop=7, step=1)

In [40]:
S1.values

array([  0,  30,  60,  90, 120, 150, 180], dtype=int64)

In [41]:
#Series accepts the slice notation
#From the beginning of the series up to but not including 3
S1[:3]

0     0
1    30
2    60
dtype: int64

In [42]:
#Starting with index of 2 through the end of the series
S1[2:]

2     60
3     90
4    120
5    150
6    180
dtype: int64

In [43]:
#A Pandas Series can have an explicitly defined index associated with its values
#The index labels need not be integers, as they are in NumPy arrays

S2 = pd.Series(
    data=list(range(0, 181, 30)),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g'],
)
S2

a      0
b     30
c     60
d     90
e    120
f    150
g    180
dtype: int64

In [44]:
S2['a']

0

In [45]:
S2['c']

60

In [46]:
#A Series is similar to a Python dictionary in which the index is the key
#Unlike a dictionary, a Pandas Series supports slicing
S2[:'d']

a     0
b    30
c    60
d    90
dtype: int64

In [47]:
S2['b':]

b     30
c     60
d     90
e    120
f    150
g    180
dtype: int64

In [48]:
#Slice by label from 'b' through 'e'
#NOTE: only the last expression of a cell is echoed, so this slice is
#evaluated but not displayed; the output shown is that of type(S2)
S2['b':'e']
type(S2)

pandas.core.series.Series

In [49]:
#Constructing Series objects
#In general pd.Series(data,index)
#where data can be one of many entities and 
#index is an optional argument

In [50]:
#Data can be a list or Numpy array in which case the
#index defaults to an integer sequence
pd.Series([2,4,6,8])

0    2
1    4
2    6
3    8
dtype: int64

In [51]:
#Data can be a scalar which is repeated to fill the specified index
pd.Series(3.1416,index=[100,200,300])

100    3.1416
200    3.1416
300    3.1416
dtype: float64

In [52]:
#Data can be a dictionary, in which case the index defaults to the
#dictionary keys
S3=pd.Series({2:'a',1:'b',3:'c'})
#NOTE: this bare S3 is not the last expression of the cell, so it is not displayed
S3
#Look up the value stored under the key (index label) 3
S3[3]

'c'

In [53]:
#The index can be set explicitly if a different result set is preferred
#Series is only populated with the explicitly identified keys
pd.Series({2:'a',1:'b',3:'c'},index=[1,3])

1    b
3    c
dtype: object

In [54]:
#The next fundamental structure is the DataFrame
#DataFrame is like a two-dimensional array with flexible row indices
#and flexible column names.
#Unlike a NumPy array, a DataFrame allows for heterogeneous data types
#and missing data

#One of the most preferred tools for data scientists to do data
#manipulation and analysis


In [55]:
#Constructing DataFrame Objects
#There are a variety of ways to create a DataFrame
import pandas as pd

In [56]:
#A single-column DataFrame can be constructed from a single Series object
#First build that Series: the median age for each country
#(when the data is a dictionary, its keys become the index)
medianAge = pd.Series({'Germany': 46, 'France': 42, 'Italy': 47, 'Spain': 45})
print(medianAge)

Germany    46
France     42
Italy      47
Spain      45
dtype: int64


In [57]:
#Creating the DataFrame
df1 = pd.DataFrame(medianAge,columns=['Median Age'])
print(df1)

         Median Age
Germany          46
France           42
Italy            47
Spain            45


In [58]:
#In general, a DataFrame is a collection of Series objects

In [59]:
#A DataFrame can be created from a list of dictionaries
#(any list of dictionaries will do)
#Build some sample records with a list comprehension:
#each record holds a value 'V' and its square 'V2'
data = [{'V': n, 'V2': n * n} for n in range(5)]
print(data)

[{'V': 0, 'V2': 0}, {'V': 1, 'V2': 1}, {'V': 2, 'V2': 4}, {'V': 3, 'V2': 9}, {'V': 4, 'V2': 16}]


In [60]:
df2 = pd.DataFrame(data)
df2

Unnamed: 0,V,V2
0,0,0
1,1,1
2,2,4
3,3,9
4,4,16


In [201]:
#The records do not all need the same keys: when a key is missing from a
#record, Pandas fills the corresponding cell with NaN
#NaN means Not a Number
#data0 is the complete set of records
data0 = [{'V': v, 'V2': v ** 2} for v in range(5)]
#data1 is a copy of data0 whose third record (position 2) has no 'V' key
data1 = [dict(record) for record in data0]
del data1[2]['V']

In [62]:
pd.DataFrame(data1)

Unnamed: 0,V,V2
0,0.0,0
1,1.0,1
2,,4
3,3.0,9
4,4.0,16


In [63]:
#DataFrame can be constructed from a dictionary of Series objects
medianAge

Germany    46
France     42
Italy      47
Spain      45
dtype: int64

In [64]:
#Build a second Series, the population of each country,
#labelled with the same four countries
population = pd.Series({
    'Germany': 83783942,
    'France': 65273511,
    'Italy': 60461826,
    'Spain': 46754778,
})

In [65]:
population

Germany    83783942
France     65273511
Italy      60461826
Spain      46754778
dtype: int64

In [66]:
df3 = pd.DataFrame({'population':population,'medianAge':medianAge})
df3

Unnamed: 0,population,medianAge
Germany,83783942,46
France,65273511,42
Italy,60461826,47
Spain,46754778,45


In [67]:
#A DataFrame can also be built from a two-dimensional NumPy array
import numpy as np
#Draw a 3x2 array of uniform random floats in [0, 1)
#No seed is set in this cell, so the values differ from run to run
#unless a seed was set earlier in the session
A1 = np.random.random_sample(size=(3, 2))
A1

array([[0.08757746, 0.96886036],
       [0.86185158, 0.89816143],
       [0.35769568, 0.56635207]])

In [68]:
#Create a DataFrame using the specified column and index names
import pandas as pd
pd.DataFrame(A1,columns=['C1','C2'],index=['a','b','c'])


Unnamed: 0,C1,C2
a,0.087577,0.96886
b,0.861852,0.898161
c,0.357696,0.566352


In [69]:
#if we omit the columns or index, integers will be used
#Let's omit the columns
pd.DataFrame(A1,index=['a','b','c'])


Unnamed: 0,0,1
a,0.087577,0.96886
b,0.861852,0.898161
c,0.357696,0.566352


In [70]:
#Let's omit the columns and index
pd.DataFrame(A1)

Unnamed: 0,0,1
0,0.087577,0.96886
1,0.861852,0.898161
2,0.357696,0.566352


In [71]:
#The third fundamental structure is the Index, which we have already seen
#Both the Series and DataFrame contain an explicit index
#Index object can be thought of as an immutable array or as an ordered set
#Index object may also contain repeated values.


In [72]:
#Let's use the NumPy array from an earlier example
A1

array([[0.08757746, 0.96886036],
       [0.86185158, 0.89816143],
       [0.35769568, 0.56635207]])

In [73]:
#Let's create an Index object with a repeating value
I1 = pd.Index(['a','b','b'])
pd.DataFrame(A1,columns=['C1','C2'],index=I1)

Unnamed: 0,C1,C2
a,0.087577,0.96886
b,0.861852,0.898161
b,0.357696,0.566352


In [74]:
type(I1)

pandas.core.indexes.base.Index

In [75]:
#How would you use repeated values

In [76]:
#Index object operates like an array
I1[2]

'b'

In [77]:
#But an Index object is immutable: item assignment raises a TypeError
#The error is the point of this demonstration, so it is caught and printed
#rather than left to stop a top-to-bottom run of the notebook
try:
    I1[2] = 'c'
except TypeError as err:
    print("TypeError:", err)

TypeError: Index does not support mutable operations

In [202]:
#Being immutable is safer for when the Index object is shared across 
#multiple datasets
#Pandas objects are designed to facilitate operations such as joining of 
#multiple datasets,
#taking the differences,
#and getting the intersection. Similar to set operations

In [79]:
#Let's look at S1 again
S1

0      0
1     30
2     60
3     90
4    120
5    150
6    180
dtype: int64

In [80]:
#Already showed data selection and slicing
S1[1]

30

In [81]:
S1[:3]

0     0
1    30
2    60
dtype: int64

In [82]:
S1[4:]

4    120
5    150
6    180
dtype: int64

In [83]:
#Let's see more of data indexing and selection
#Data selections in Series uses dictionary-like expressions as shown above
S1.keys()


RangeIndex(start=0, stop=7, step=1)

In [84]:
list(S1.items())

[(0, 0), (1, 30), (2, 60), (3, 90), (4, 120), (5, 150), (6, 180)]

In [85]:
#We can extend the series by adding a new key/value pair
S1

0      0
1     30
2     60
3     90
4    120
5    150
6    180
dtype: int64

In [86]:
S1[7] = 210

In [87]:
S1

0      0
1     30
2     60
3     90
4    120
5    150
6    180
7    210
dtype: int64

In [88]:
#Series is mutable
S1[0] = 10
print(S1)

0     10
1     30
2     60
3     90
4    120
5    150
6    180
7    210
dtype: int64


In [89]:
S1[0] = 0
print(S1)

0      0
1     30
2     60
3     90
4    120
5    150
6    180
7    210
dtype: int64


In [90]:
S1[0:5]

0      0
1     30
2     60
3     90
4    120
dtype: int64

In [91]:
S2

a      0
b     30
c     60
d     90
e    120
f    150
g    180
dtype: int64

In [92]:
S2['a':'f']

a      0
b     30
c     60
d     90
e    120
f    150
dtype: int64

In [93]:
#Note the difference between the two slices:
#When slicing with an explicit index, the final index is included
#When slicing with an implicit index, the final index is excluded
#It can be confusing if using an integer index
#Because of this potential confusion, Pandas provides special 
#indexer attributes: loc and iloc
#These are not functional methods but attributes that expose a particular
#slicing interface to the data

In [94]:
S2.loc['a':'e']

a      0
b     30
c     60
d     90
e    120
dtype: int64