<h2>Chapter 5. Getting Started with pandas<h2>

In [24]:
import pandas as pd
import numpy as np

In [2]:
from pandas import DataFrame, Series

<h3>Series<h2>

In [3]:
# A Series is a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its index. The simplest Series is formed from only an array of data:

In [6]:
obj = Series([1,3,5,6]) #Since we did not specify an index for the data, a default one consisting of the integers 0 through N - 1 (where N is the length of the data) is created. 

In [7]:
obj

0    1
1    3
2    5
3    6
dtype: int64

In [8]:
obj.values

array([1, 3, 5, 6])

In [10]:
obj.index # like range(4)

RangeIndex(start=0, stop=4, step=1)

In [74]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c']).astype(np.float64)

In [75]:
obj2

d    4.0
b    7.0
a   -5.0
c    3.0
dtype: float64

In [13]:
obj2['a']

-5

In [15]:
obj2[['d','a']]

d    4
a   -5
dtype: int64

In [16]:
obj2['a','d']

KeyError: 'key of type tuple not found and not a MultiIndex'

In [19]:
obj2[obj2>3]

d    4
b    7
dtype: int64

In [20]:
# mainlz it has all the functions of numpy arrray(1-d), see below:

In [21]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [22]:
obj2*4

d    16
b    28
a   -20
c    12
dtype: int64

In [23]:
max(obj2)

7

In [25]:
np.max(obj2)

7

In [27]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [28]:
np.sqrt(obj2)

  result = getattr(ufunc, method)(*inputs, **kwargs)


d    2.000000
b    2.645751
a         NaN
c    1.732051
dtype: float64

In [29]:
obj2=o

d    4
b    7
a   -5
c    3
dtype: int64

In [30]:
np.where(obj2>0,np.sqrt(obj2),0)

  result = getattr(ufunc, method)(*inputs, **kwargs)


array([2.        , 2.64575131, 0.        , 1.73205081])

In [31]:
np.sqrt(obj2[obj2>0])

d    2.000000
b    2.645751
c    1.732051
dtype: float64

In [33]:
np.sqrt(np.where(obj2>0,obj2,0))

array([2.        , 2.64575131, 0.        , 1.73205081])

In [79]:
out = obj2.copy()

In [80]:
out

d    4.0
b    7.0
a   -5.0
c    3.0
dtype: float64

In [64]:
np.sqrt(obj2,out=out,where)

d    2
b    2
a    n
c    1
dtype: object

In [65]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [81]:
np.sqrt(obj2,out=out,where=[True,True,False,True])

RecursionError: maximum recursion depth exceeded while calling a Python object

In [83]:
np.where(obj2>0,np.sqrt(obj2),obj2)  # so far this is the best method

  result = getattr(ufunc, method)(*inputs, **kwargs)


array([ 2.        ,  2.64575131, -5.        ,  1.73205081])

In [86]:
# %load ignorewarnings.py
import warnings
warnings.filterwarnings('ignore')


In [87]:
np.where(obj2>0,np.sqrt(obj2),obj2)  # so far this is the best method

array([ 2.        ,  2.64575131, -5.        ,  1.73205081])

In [88]:
# Another way to think about a Series is as a fixed-length, ordered dict, as it is a mapping of index values to data values. It can be used in many contexts where you might use a dict:

In [89]:
obj2

d    4.0
b    7.0
a   -5.0
c    3.0
dtype: float64

In [90]:
'd' in obj2

True

In [91]:
'e' in obj2

False

In [92]:
# Create a python series using dict

In [93]:
inp = {'apple':100,
      'oranges':400,
      'pomgrenade':900}

In [95]:
s=Series(inp)

In [96]:
s

apple         100
oranges       400
pomgrenade    900
dtype: int64

In [97]:
'apple' in s

True

In [102]:
# When you are only passing a dict, the index in the resulting Series will have the dict’s keys in sorted order. You can override this by passing the dict keys in the order you want them to appear in the resulting Series:

In [106]:
states=['oranges','apple','kela']

In [107]:
s=Series(inp, index=states)

In [108]:
s

oranges    400.0
apple      100.0
kela         NaN
dtype: float64

In [109]:
# The isnull and notnull functions in pandas should be used to detect missing data:

In [110]:
s.isnull()

oranges    False
apple      False
kela        True
dtype: bool

In [111]:
s.notnull()

oranges     True
apple       True
kela       False
dtype: bool

In [112]:
s.notna()

oranges     True
apple       True
kela       False
dtype: bool

In [114]:
pd.isnull(s)

oranges    False
apple      False
kela        True
dtype: bool

In [115]:
one = Series({'Punjab':100,
             'Haryana':300,
             'JK':550})

In [116]:
two = Series({'Delhi':200,
             'JK':200})

In [118]:
#A useful Series feature for many applications is that it automatically aligns by index label in arithmetic operations:
one + two 

Delhi        NaN
Haryana      NaN
JK         750.0
Punjab       NaN
dtype: float64

In [119]:
# it's like join operation in databases

In [120]:
one

Punjab     100
Haryana    300
JK         550
dtype: int64

In [127]:
# Both the Series object itself and its index have a name attribute, which integrates with other key areas of pandas functionality:

In [128]:
one.name='Number'

In [129]:
one.index.name='States'

In [130]:
one

States
Punjab     100
Haryana    300
JK         550
Name: Number, dtype: int64

<h2>DataFrames <h2>

In [134]:
data ={'State':['Punjab','Punjab','Haryana','Haryana','JK','JK','JK'],
      'Year':[2000,2001,2000,2001,2000,2001,2002],
      'Population':[12,32,10,30,10,27,31]}

In [135]:
# two ways to create

In [136]:
frame = DataFrame(data)

In [138]:
frame  # see the index here, its similar to what we discussed stdying Series above

Unnamed: 0,State,Year,Population
0,Punjab,2000,12
1,Punjab,2001,32
2,Haryana,2000,10
3,Haryana,2001,30
4,JK,2000,10
5,JK,2001,27
6,JK,2002,31


In [139]:
# another way of passing columns , it will reflect in that order only and also it additional column is given which is not preset in dataframe, it will give Nan values for all of them

# also, we can pass index

In [143]:
frame2=pd.DataFrame(data, columns=['Year','Population','State','average'],
            index = ['one', 'two','three','four','five','six','seven'])

In [144]:
frame2

Unnamed: 0,Year,Population,State,average
one,2000,12,Punjab,
two,2001,32,Punjab,
three,2000,10,Haryana,
four,2001,30,Haryana,
five,2000,10,JK,
six,2001,27,JK,
seven,2002,31,JK,


In [146]:
frame2['Year']

one      2000
two      2001
three    2000
four     2001
five     2000
six      2001
seven    2002
Name: Year, dtype: int64

In [147]:
frame2.Year

one      2000
two      2001
three    2000
four     2001
five     2000
six      2001
seven    2002
Name: Year, dtype: int64

In [148]:
# although, I already know but rows can e retrived using loc or positions


In [149]:
frame2.loc['one']

Year            2000
Population        12
State         Punjab
average          NaN
Name: one, dtype: object

In [150]:
# Columns can be modified by assignment. For example, the empty 'debt' column could be assigned a scalar value or an array of values:

In [151]:
frame2.columns

Index(['Year', 'Population', 'State', 'average'], dtype='object')

In [152]:
frame2['average']=2

In [153]:
frame2

Unnamed: 0,Year,Population,State,average
one,2000,12,Punjab,2
two,2001,32,Punjab,2
three,2000,10,Haryana,2
four,2001,30,Haryana,2
five,2000,10,JK,2
six,2001,27,JK,2
seven,2002,31,JK,2


In [155]:
frame2.average=[44,4,43,12,45,67,42]

In [156]:
frame2

Unnamed: 0,Year,Population,State,average
one,2000,12,Punjab,44
two,2001,32,Punjab,4
three,2000,10,Haryana,43
four,2001,30,Haryana,12
five,2000,10,JK,45
six,2001,27,JK,67
seven,2002,31,JK,42


In [186]:
frame2['numbering']=np.arange(1,8)

In [161]:
frame2

Unnamed: 0,Year,Population,State,average,numbering
one,2000,12,Punjab,44,1
two,2001,32,Punjab,4,2
three,2000,10,Haryana,43,3
four,2001,30,Haryana,12,4
five,2000,10,JK,45,5
six,2001,27,JK,67,6
seven,2002,31,JK,42,7


In [162]:
# When you are assigning lists or arrays to a column, the value’s length must match the length of the DataFrame. If you assign a Series, its labels will be realigned exactly to the DataFrame’s index, inserting missing values in any holes:

In [164]:
frame2.T

Unnamed: 0,one,two,three,four,five,six,seven
Year,2000,2001,2000,2001,2000,2001,2002
Population,12,32,10,30,10,27,31
State,Punjab,Punjab,Haryana,Haryana,JK,JK,JK
average,44,4,43,12,45,67,42
numbering,1,2,3,4,5,6,7


In [165]:
# The del keyword will delete columns as with a dict.

In [168]:
frame2['bools']=frame2['Year']=2000

In [169]:
frame2

Unnamed: 0,Year,Population,State,average,numbering,bools
one,2000,12,Punjab,44,1,2000
two,2000,32,Punjab,4,2,2000
three,2000,10,Haryana,43,3,2000
four,2000,30,Haryana,12,4,2000
five,2000,10,JK,45,5,2000
six,2000,27,JK,67,6,2000
seven,2000,31,JK,42,7,2000


In [173]:
frame2.rename(columns={'bools':'Punjab'})

Unnamed: 0,Year,Population,State,average,numbering,Punjab
one,2000,12,Punjab,44,1,2000
two,2000,32,Punjab,4,2,2000
three,2000,10,Haryana,43,3,2000
four,2000,30,Haryana,12,4,2000
five,2000,10,JK,45,5,2000
six,2000,27,JK,67,6,2000
seven,2000,31,JK,42,7,2000


In [174]:
# Another method
frame2.rename({'bools':'Punjab'},axis=1)

Unnamed: 0,Year,Population,State,average,numbering,Punjab
one,2000,12,Punjab,44,1,2000
two,2000,32,Punjab,4,2,2000
three,2000,10,Haryana,43,3,2000
four,2000,30,Haryana,12,4,2000
five,2000,10,JK,45,5,2000
six,2000,27,JK,67,6,2000
seven,2000,31,JK,42,7,2000


In [177]:
# Another method
frame2.rename({'bools':'Punjab'},axis='columns', inplace=True)

In [180]:
frame2['Punjab']=frame2['State']=='Punjab'

In [181]:
frame2

Unnamed: 0,Year,Population,State,average,numbering,Punjab
one,2000,12,Punjab,44,1,True
two,2000,32,Punjab,4,2,True
three,2000,10,Haryana,43,3,False
four,2000,30,Haryana,12,4,False
five,2000,10,JK,45,5,False
six,2000,27,JK,67,6,False
seven,2000,31,JK,42,7,False


In [187]:
del frame2['numbering']   # del is used to delete a column of dataframe

In [184]:
frame2

Unnamed: 0,Year,Population,State,average,Punjab
one,2000,12,Punjab,44,True
two,2000,32,Punjab,4,True
three,2000,10,Haryana,43,False
four,2000,30,Haryana,12,False
five,2000,10,JK,45,False
six,2000,27,JK,67,False
seven,2000,31,JK,42,False


In [193]:
frame3 = frame2.copy()

In [194]:
del frame3   # can be used to delete a dataframe as well

In [195]:
frame3

NameError: name 'frame3' is not defined

In [196]:
# CAUTION

# The column returned from indexing a DataFrame is a view on the underlying data, not a copy. Thus, any in-place modifications to the Series will be reflected in the DataFrame. The column can be explicitly copied with the Series’s copy method.

In [204]:
# another way tp create a datframe , is to use a nested dict.
# when we pass a nested dict , it will automatically pick up the "outer" key as its columns and inner keys as its index
# see below:

In [219]:
inp = {'Punjab':{'population':13, 'languages':['Punjabi','Hindi','English']},
      'Chennai':{'population':20,'languages':['Tamil','Telugu']}}

In [220]:
frame4 = DataFrame(inp)

In [221]:
frame4

Unnamed: 0,Punjab,Chennai
population,13,20
languages,"[Punjabi, Hindi, English]","[Tamil, Telugu]"


In [208]:
frame4.T

Unnamed: 0,Population,languages
Punjab,13,"[Punjabi, Hindi, English]"
Chennai,20,"[Tamil, Telugu]"


In [209]:
# The keys in the inner dicts are combined and sorted to form the index in the result. This isn’t true if an explicit index is specified:

In [210]:
frame4

Unnamed: 0,Punjab,Chennai
Population,13,20
languages,"[Punjabi, Hindi, English]","[Tamil, Telugu]"


In [212]:
DataFrame(inp, index=['languages','Population','literacy'])

Unnamed: 0,Punjab,Chennai
languages,"[Punjabi, Hindi, English]","[Tamil, Telugu]"
Population,13,20
literacy,,


In [226]:
# this is reallz good below:

In [227]:
 d = dict.fromkeys(range(100))

In [228]:
# list(d)    # r3eal

In [230]:
# However i have learnt something new o generate random dataframe using pandas only.
# I will need to explore this module from pandas
# pd.util

In [232]:
dir(pd.util)

['Appender',
 'Substitution',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__getattr__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_decorators',
 '_exceptions',
 '_print_versions',
 '_tester',
 '_validators',
 'cache_readonly',
 'hash_array',
 'hash_pandas_object']

In [233]:
pd.util.testing.makeMixedDataFrame()

Unnamed: 0,A,B,C,D
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07


In [235]:
# dir(pd.util.testing)

In [239]:
pd.util.testing.makeDataFrame()

Unnamed: 0,A,B,C,D
hJ0leN3EHx,1.921712,0.638855,-1.307781,1.174704
PEtkUXKPuS,-0.432391,1.776363,-0.683518,0.06252
CHLvbtBfoi,1.279376,-0.605693,0.772322,1.799372
6S4pebODdj,-1.485323,1.054322,-2.150849,1.904975
UcOBesarFb,-0.061579,-1.854974,-0.113044,0.101542
PDT7414uwV,0.17493,0.258707,0.710214,-0.552411
f80exOPJ2U,-0.714309,1.525778,-1.153517,0.184796
uxTf4FCMZJ,0.017034,-1.010296,-1.221527,0.085348
kLEYscX4Vz,-2.686446,-0.432157,-1.926708,-1.4581
fnuKpAESZF,-0.264379,1.067109,0.347207,-1.136336
