<a href="https://colab.research.google.com/github/najjouj/Birella/blob/main/pandasDataFrame.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Area per state, keyed by state name; the keys become the Series index.
area_dict = {
    'California': 456321,
    'Texas': 147852,
    'New York': 852963,
    'Florida': 789654,
    'Illinois': 456852,
}
area = pd.Series(data=area_dict)
area

Unnamed: 0,0
California,456321
Texas,147852
New York,852963
Florida,789654
Illinois,456852


In [None]:
# Population per state, keyed by the same state names used for `area`.
population_dict = {
    'California': 789321852,
    'Texas': 753159852,
    'New York': 741258963,
    'Florida': 789654123,
    'Illinois': 1047852147,
}
population = pd.Series(data=population_dict)

In [None]:
# Combine the two Series into one frame: one column per dict key.
states = pd.DataFrame(
    {
        'population': population,
        'area': area,
    }
)
states

Unnamed: 0,population,area
California,789321852,456321
Texas,753159852,147852
New York,741258963,852963
Florida,789654123,789654
Illinois,1047852147,456852


# Constructing DataFrame objects

From a single Series object

In [None]:
# Promote the unnamed Series to a one-column frame labelled 'Population'.
population.to_frame(name='Population')

Unnamed: 0,Population
California,789321852
Texas,753159852
New York,741258963
Florida,789654123
Illinois,1047852147


From a list of dicts.

In [None]:
# One dict per row; the dict keys become the column labels.
data = [dict(a=k, b=k * 2) for k in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


From a dictionary of Series objects

In [None]:
# Each dict key becomes a column; both Series share the same state-name index.
pd.DataFrame({'population':population,'area':area})

Unnamed: 0,population,area
California,789321852,456321
Texas,753159852,147852
New York,741258963,852963
Florida,789654123,789654
Illinois,1047852147,456852


From a two-dimensional NumPy array

In [None]:
# Draw from a seeded generator so the displayed values are identical on every
# Restart & Run All; the unseeded global np.random state changes on each run.
random_frame = pd.DataFrame(
    np.random.RandomState(0).rand(3, 2),
    columns=['foo', 'bar'],
    index=['a', 'b', 'c'],
)
random_frame

Unnamed: 0,foo,bar
a,0.156943,0.134583
b,0.886192,0.206581
c,0.552491,0.008365


From a NumPy structured array

In [None]:
# Three zero-initialised records with an 8-byte integer field 'A'
# and an 8-byte float field 'B'.
A = np.zeros(shape=3, dtype=np.dtype([('A', 'i8'), ('B', 'f8')]))
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [None]:
# A structured array converts directly: each named field becomes a column.
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


# **Operating on Null Values**

Detecting null values

In [None]:
# Mixed-type Series: both np.nan and None are reported as missing.
data = pd.Series([1, np.nan, 'hello', None])
data.isna()

Unnamed: 0,0
0,False
1,True
2,False
3,True


In [None]:
# Keep only the entries that are present, using the boolean mask as a selector.
data.loc[data.notna()]

Unnamed: 0,0
0,1
2,hello


Dropping null values

In [None]:
# dropna() returns the Series without its null entries.
data.dropna()

Unnamed: 0,0
0,1
2,hello


In [None]:
# 3x3 frame with one missing value in column 0 and one in column 1.
df = pd.DataFrame(
    [
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [None]:
# With no arguments, dropna() removes every row that contains at least one null.
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [None]:
# Drop along the column axis instead: any column holding a null is removed.
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [None]:
# Add a column labelled 3 made up entirely of NaN.
# NOTE: this mutates df in place, so later cells see the four-column frame.
df[3]=np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


Filling null values

In [None]:
# Labelled Series with two missing entries (np.nan and None).
data = pd.Series(
    [1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
data

Unnamed: 0,0
a,1.0
b,
c,2.0
d,
e,3.0


In [None]:
# Replace every null entry with a single constant value (here 0).
data.fillna(0)

Unnamed: 0,0
a,1.0
b,0.0
c,2.0
d,0.0
e,3.0


In [None]:
# Forward fill: each null takes the last valid value that precedes it.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated and
# emits a FutureWarning in recent pandas releases.
data.ffill()

  data.fillna(method='ffill')


Unnamed: 0,0
a,1.0
b,1.0
c,2.0
d,2.0
e,3.0


In [None]:
# Backward fill: each null takes the next valid value that follows it.
# Series.bfill() replaces fillna(method='bfill'), which is deprecated and
# emits a FutureWarning in recent pandas releases.
data.bfill()

  data.fillna(method='bfill')


Unnamed: 0,0
a,1.0
b,2.0
c,2.0
d,3.0
e,3.0


In [None]:
# Redisplay the frame (including the all-NaN column 3) before filling it.
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [None]:
# Forward fill along each row (axis=1): a null takes the value from the column
# to its left; a null in the first column has nothing to copy and stays null.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df.ffill(axis=1)

  df.fillna(method='ffill',axis=1)


Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


# **Aggregation and Grouping**

Planets Data

In [None]:
# seaborn is imported as `sns` in the notebook's top import cell, so every
# dependency is declared in one place.
# load_dataset fetches seaborn's example "planets" table; per the seaborn docs
# this needs network access unless the dataset is already cached locally.
planets = sns.load_dataset('planets')
planets.shape

(1035, 6)

In [None]:
# Preview the first five rows to see which columns are available.
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


Simple Aggregation in Pandas

In [None]:
# Fixed seed so the sample (and every later draw from `rng`) is reproducible.
rng = np.random.RandomState(seed=42)
ser = pd.Series(data=rng.rand(5))
ser

Unnamed: 0,0
0,0.37454
1,0.950714
2,0.731994
3,0.598658
4,0.156019


In [None]:
# Aggregating a Series returns a single scalar: here, the sum of its values.
ser.sum()

np.float64(2.811925491708157)

In [None]:
# Arithmetic mean of the values in ser.
ser.mean()

np.float64(0.5623850983416314)

In [None]:
# Two columns of five draws each from the seeded generator; 'A' is drawn
# before 'B', so the values depend on how many draws `rng` has already made.
df = pd.DataFrame(
    {
        'A': rng.rand(5),
        'B': rng.rand(5),
    }
)
df

Unnamed: 0,A,B
0,0.155995,0.020584
1,0.058084,0.96991
2,0.866176,0.832443
3,0.601115,0.212339
4,0.708073,0.181825


In [None]:
# For a DataFrame, the aggregate is computed within each column by default.
df.mean()

Unnamed: 0,0
A,0.477888
B,0.44342


In [None]:
# Aggregate across the columns instead, giving one mean per row.
df.mean(axis=1)

Unnamed: 0,0
0,0.08829
1,0.513997
2,0.849309
3,0.406727
4,0.444949


In [None]:
# Summary statistics of the numeric columns, computed only on the rows that
# have no missing values.
planets.dropna().describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


GroupBy: Split, Apply, Combine

In [None]:
# Split by method, select one column, then reduce each group to its median.
(
    planets
    .groupby('method')['orbital_period']
    .median()
)

Unnamed: 0_level_0,orbital_period
method,Unnamed: 1_level_1
Astrometry,631.18
Eclipse Timing Variations,4343.5
Imaging,27500.0
Microlensing,3300.0
Orbital Brightness Modulation,0.342887
Pulsar Timing,66.5419
Pulsation Timing Variations,1170.0
Radial Velocity,360.2
Transit,5.714932
Transit Timing Variations,57.011


In [None]:
# List the column labels of the planets frame.
planets.columns

Index(['method', 'number', 'orbital_period', 'mass', 'distance', 'year'], dtype='object')

In [None]:
# Iterating a GroupBy yields (key, sub-frame) pairs; print each group's shape
# with the method name left-aligned in a 30-character field.
for method, group in planets.groupby('method'):
    print(f'{method:30s} shape{group.shape}')

Astrometry                     shape(2, 6)
Eclipse Timing Variations      shape(9, 6)
Imaging                        shape(38, 6)
Microlensing                   shape(23, 6)
Orbital Brightness Modulation  shape(3, 6)
Pulsar Timing                  shape(5, 6)
Pulsation Timing Variations    shape(1, 6)
Radial Velocity                shape(553, 6)
Transit                        shape(397, 6)
Transit Timing Variations      shape(4, 6)


In [None]:
# describe() on a grouped column returns one row of summary statistics of
# 'year' for each method.
planets.groupby('method')['year'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,2011.5,2.12132,2010.0,2010.75,2011.5,2012.25,2013.0
Eclipse Timing Variations,9.0,2010.0,1.414214,2008.0,2009.0,2010.0,2011.0,2012.0
Imaging,38.0,2009.131579,2.781901,2004.0,2008.0,2009.0,2011.0,2013.0
Microlensing,23.0,2009.782609,2.859697,2004.0,2008.0,2010.0,2012.0,2013.0
Orbital Brightness Modulation,3.0,2011.666667,1.154701,2011.0,2011.0,2011.0,2012.0,2013.0
Pulsar Timing,5.0,1998.4,8.38451,1992.0,1992.0,1994.0,2003.0,2011.0
Pulsation Timing Variations,1.0,2007.0,,2007.0,2007.0,2007.0,2007.0,2007.0
Radial Velocity,553.0,2007.518987,4.249052,1989.0,2005.0,2009.0,2011.0,2014.0
Transit,397.0,2011.236776,2.077867,2002.0,2010.0,2012.0,2013.0,2014.0
Transit Timing Variations,4.0,2012.5,1.290994,2011.0,2011.75,2012.5,2013.25,2014.0
