In [1]:
import numpy as np
import pandas as pd

# Pandas Series Object

In [2]:
# A Series built from a plain list; no index is passed, so pandas supplies
# the default integer index.
data = pd.Series(data=[32, 54, 65, 23, 12])

In [3]:
data

0    32
1    54
2    65
3    23
4    12
dtype: int64

In [4]:
data.values

array([32, 54, 65, 23, 12], dtype=int64)

In [5]:
data.index

RangeIndex(start=0, stop=5, step=1)

In [6]:
data[1]

54

In [12]:
# Same values as before, now labelled with an explicit string index.
data = pd.Series(
    [32, 54, 65, 23, 12],
    index=list('abcde'),
)

In [13]:
data

a    32
b    54
c    65
d    23
e    12
dtype: int64

In [14]:
data['c']

65

In [15]:
# The index holds string labels here, so 1 is a position, not a label.
# .iloc makes the positional lookup explicit; plain data[1] relies on a
# deprecated fallback from label-based to position-based indexing.
data.iloc[1]

54

In [19]:
# Country -> population mapping; the dict keys become the Series index.
population_dict = dict(India=1400, USA=400, China=1500, Israel=10)
population = pd.Series(population_dict)

In [20]:
population

India     1400
USA        400
China     1500
Israel      10
dtype: int64

In [21]:
population['India']

1400

In [22]:
# Label-based slice: unlike positional slicing, the end label is included.
population.loc['India':'China']

India    1400
USA       400
China    1500
dtype: int64

# Pandas DataFrame Object

In [23]:
# Country -> area mapping, keyed by the same countries as the population data.
area_dict = {
    'India': 32000,
    'USA': 1400,
    'China': 15000,
    'Israel': 980,
}
area = pd.Series(area_dict)

In [24]:
area

India     32000
USA        1400
China     15000
Israel      980
dtype: int64

In [25]:
# Combine the two Series into one frame; each Series becomes a named column
# and the rows are aligned on the shared country index.
country = pd.DataFrame(dict(Population=population, Area=area))

In [26]:
country

Unnamed: 0,Population,Area
India,1400,32000
USA,400,1400
China,1500,15000
Israel,10,980


In [27]:
country.index

Index(['India', 'USA', 'China', 'Israel'], dtype='object')

In [28]:
country.columns

Index(['Population', 'Area'], dtype='object')

In [29]:
country['Area']

India     32000
USA        1400
China     15000
Israel      980
Name: Area, dtype: int64

In [30]:
country['Population']

India     1400
USA        400
China     1500
Israel      10
Name: Population, dtype: int64

In [31]:
# One dict per row; column 'b' is twice column 'a'.
data = [dict(a=i, b=2 * i) for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [33]:
# The two records have different keys: the frame gets the union of the keys
# as columns, with NaN wherever a record has no value for a column.
data = pd.DataFrame(data=[
    dict(a=2, b=4),
    dict(b=5, c=89),
])

In [34]:
data

Unnamed: 0,a,b,c
0,2.0,4,
1,,5,89.0


In [37]:
# Build a 3x2 frame from a NumPy array, with explicit row and column labels.
# NOTE(review): np.random.rand draws from NumPy's global generator and no seed
# is set in the cells shown, so the displayed values will differ on every run.
pd.DataFrame(np.random.rand(3,2),
            columns=['x','y'],
            index=['a','b','c'])

Unnamed: 0,x,y
a,0.619383,0.491824
b,0.623757,0.199117
c,0.225909,0.313371


# Pandas Index Object

In [38]:
ind=pd.Index([2,3,4,5,6])

In [39]:
ind

Int64Index([2, 3, 4, 5, 6], dtype='int64')

In [40]:
ind[1]

3

In [41]:
ind[::2]

Int64Index([2, 4, 6], dtype='int64')

In [42]:
ind.size

5

In [43]:
ind.shape

(5,)

In [44]:
ind.ndim

1

In [45]:
ind.dtype

dtype('int64')

In [46]:
# An Index is immutable, so item assignment raises TypeError.
# Catch the error and print it: the demonstration stays visible, but the
# notebook still runs from top to bottom without stopping on this cell.
try:
    ind[1] = 0
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: Index does not support mutable operations

# Index as Ordered Set

In [47]:
ind1=pd.Index([2,4,5,6,3])
ind2=pd.Index([5,6,3,9,8])

In [48]:
# Set intersection: the labels present in both indexes.
# Use the named method; in pandas 2.0+ the & operator on an Index is an
# element-wise logical operation, not a set operation.
ind1.intersection(ind2)

Int64Index([5, 6, 3], dtype='int64')

In [49]:
# Set union: every label that appears in either index.
# Use the named method; in pandas 2.0+ the | operator on an Index is an
# element-wise logical operation, not a set operation.
ind1.union(ind2)

Int64Index([2, 3, 4, 5, 6, 8, 9], dtype='int64')

In [50]:
# Symmetric difference: labels that appear in exactly one of the two indexes.
# Use the named method; in pandas 2.0+ the ^ operator on an Index is an
# element-wise logical operation, not a set operation.
ind1.symmetric_difference(ind2)

Int64Index([2, 4, 8, 9], dtype='int64')

# Indexers: loc and iloc

In [51]:
data=pd.Series(['x','y','z'],index=[1,3,8])

In [52]:
data

1    x
3    y
8    z
dtype: object

In [53]:
data[1]

'x'

In [54]:
# Slicing with plain [] goes by position here (second and third rows),
# whereas data[1] above looked up the *label* 1. This mismatch is why the
# explicit .loc / .iloc indexers below are preferable.
data[1:3]

3    y
8    z
dtype: object

In [55]:
data.loc[1]

'x'

In [56]:
data.loc[1:3]

1    x
3    y
dtype: object

In [57]:
data.iloc[1]

'y'

In [58]:
data.iloc[1:3]

3    y
8    z
dtype: object

# Data Selection in DataFrame

In [59]:
area_dict

{'India': 32000, 'USA': 1400, 'China': 15000, 'Israel': 980}

In [60]:
area

India     32000
USA        1400
China     15000
Israel      980
dtype: int64

In [61]:
population

India     1400
USA        400
China     1500
Israel      10
dtype: int64

In [62]:
country

Unnamed: 0,Population,Area
India,1400,32000
USA,400,1400
China,1500,15000
Israel,10,980


In [63]:
country["Area"]

India     32000
USA        1400
China     15000
Israel      980
Name: Area, dtype: int64

In [65]:
country.Area

India     32000
USA        1400
China     15000
Israel      980
Name: Area, dtype: int64

In [66]:
# Attribute access and key access return the same column.
# Compare contents with .equals rather than object identity: whether two
# lookups return the very same Series object depends on pandas' internal
# caching and is not guaranteed across versions.
country.Area.equals(country['Area'])

True

In [67]:
# Looking up a column that does not exist raises KeyError.
# Catch the error and print it: the demonstration stays visible, but the
# notebook still runs from top to bottom without stopping on this cell.
try:
    country['gdp']
except KeyError as exc:
    print(f"{type(exc).__name__}: {exc}")

KeyError: 'gdp'

In [68]:
country.values

array([[ 1400, 32000],
       [  400,  1400],
       [ 1500, 15000],
       [   10,   980]], dtype=int64)

In [69]:
country

Unnamed: 0,Population,Area
India,1400,32000
USA,400,1400
China,1500,15000
Israel,10,980


In [70]:
country.T

Unnamed: 0,India,USA,China,Israel
Population,1400,400,1500,10
Area,32000,1400,15000,980


In [71]:
country.values[0]

array([ 1400, 32000], dtype=int64)

# Handling Missing Data

In [79]:
data1=np.array([2,np.nan,3,5])

In [80]:
data1

array([ 2., nan,  3.,  5.])

In [81]:
data1.dtype

dtype('float64')

In [82]:
1+np.nan

nan

In [83]:
0*np.nan

nan

In [84]:
data1.sum()

nan

In [85]:
data1.min()

nan

In [86]:
data1.max()

nan

In [87]:
np.nansum(data1)

10.0

In [89]:
x=pd.Series([2,np.nan,3,None])

In [90]:
x

0    2.0
1    NaN
2    3.0
3    NaN
dtype: float64

In [91]:
# NOTE(review): the call parentheses are missing, so this displays the bound
# method's repr instead of a result; the call form is x.isnull().
x.isnull

<bound method Series.isnull of 0    2.0
1    NaN
2    3.0
3    NaN
dtype: float64>

In [92]:
x.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [93]:
# NOTE(review): .sum is referenced but not called, so this displays the bound
# method's repr; x.isnull().sum() returns the number of missing values.
x.isnull().sum

<bound method Series.sum of 0    False
1     True
2    False
3     True
dtype: bool>

In [94]:
x.isnull().sum()

2

In [95]:
x.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [96]:
x


0    2.0
1    NaN
2    3.0
3    NaN
dtype: float64

In [97]:
x.dropna()

0    2.0
2    3.0
dtype: float64

In [101]:
# 3x3 frame with one NaN in column 0 and one in column 1; column 2 is complete.
df = pd.DataFrame(data=[
    [1, np.nan, 2],
    [2, 4, 7],
    [np.nan, 4, 5],
])

In [99]:
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,4.0,7
2,,4.0,5


In [100]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,4.0,7


In [102]:
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,4.0,7
2,,4.0,5


In [103]:
# Drop every column that contains at least one NaN.
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,7
2,5


In [104]:
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,4.0,7
2,,4.0,5


In [105]:
df[3]=np.nan

In [106]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,4.0,7,
2,,4.0,5,


In [107]:
# Drop only the columns in which every value is NaN.
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,4.0,7
2,,4.0,5


In [108]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,4.0,7,
2,,4.0,5,


In [109]:
df.fillna(0)

Unnamed: 0,0,1,2,3
0,1.0,0.0,2,0.0
1,2.0,4.0,7,0.0
2,0.0,4.0,5,0.0


In [110]:
# Rebind df to a copy without the columns that are entirely NaN.
df = df.dropna(axis='columns', how='all')

In [111]:
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,4.0,7
2,,4.0,5


In [113]:
# Forward fill: each NaN takes the last valid value above it in its column;
# a NaN with nothing above it stays NaN.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,4.0,7
2,2.0,4.0,5


In [114]:
# Backward fill: each NaN takes the next valid value below it in its column;
# a NaN with nothing below it stays NaN.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument
# is deprecated in recent pandas releases.
df.bfill()

Unnamed: 0,0,1,2
0,1.0,4.0,2
1,2.0,4.0,7
2,,4.0,5


In [115]:
# Three plain Python lists to be joined end to end with NumPy.
x, y, z = [1, 2, 3], [3, 4, 5], [21, 34, 21]

In [117]:
np.concatenate([x,y,z])

array([ 1,  2,  3,  3,  4,  5, 21, 34, 21])

In [118]:
# First Series for the concat example, labelled 1 through 4.
ser1 = pd.Series(
    data=['Shivam', 'SHrikant', 'Prashant', 'Suryansh'],
    index=[1, 2, 3, 4],
)

In [119]:
ser1

1      Shivam
2    SHrikant
3    Prashant
4    Suryansh
dtype: object

In [120]:
# Second Series for the concat example; its labels (5 through 8) do not
# overlap with those of ser1.
ser2 = pd.Series(
    data=['Bhopal', 'Chhatarpur', 'Hyderabad', 'Indore'],
    index=[5, 6, 7, 8],
)

In [121]:
ser2

5        Bhopal
6    Chhatarpur
7     Hyderabad
8        Indore
dtype: object

In [123]:
pd.concat([ser1,ser2])

1        Shivam
2      SHrikant
3      Prashant
4      Suryansh
5        Bhopal
6    Chhatarpur
7     Hyderabad
8        Indore
dtype: object

In [124]:
# Small frame for the groupby examples: keys A, B, C repeated twice, paired
# with the values 0..5.
df = pd.DataFrame(
    {
        'key': list('ABC') * 2,
        'data': range(6),
    },
    columns=['key', 'data'],
)

In [125]:
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [126]:
df.groupby('key')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021973110040>

In [127]:
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [128]:
df.groupby('key').median()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,1.5
B,2.5
C,3.5
