In [5]:
import pandas as pd
import numpy as np

In [6]:
def create_df(nRows, nCols, maxRand=10, seed=None):
  """Build a DataFrame of random integers with labelled rows and columns.

  Parameters
  ----------
  nRows : int
    Number of rows; they are labelled 'R1' ... 'R<nRows>'.
  nCols : int
    Number of columns; they are labelled 'C1' ... 'C<nCols>'.
  maxRand : int, default 10
    Exclusive upper bound of the values, which are drawn from [0, maxRand).
  seed : int, optional
    When given, values come from a private generator seeded with this
    number, so the same call always returns the same frame. When omitted,
    values are drawn from NumPy's global random state, as before.

  Returns
  -------
  pandas.DataFrame
    Frame of shape (nRows, nCols) holding the random integers.
  """
  # A dedicated RandomState keeps a seeded call reproducible without
  # disturbing the global stream that unseeded calls rely on.
  rng = np.random if seed is None else np.random.RandomState(seed)
  values = rng.randint(0, maxRand, (nRows, nCols))

  row_labels = ['R' + str(i) for i in range(1, nRows + 1)]
  col_labels = ['C' + str(j) for j in range(1, nCols + 1)]

  return pd.DataFrame(values, index=row_labels, columns=col_labels)

In [7]:
# Demo: 5 rows x 3 columns, values drawn with the default maxRand of 10
create_df(5, 3)

Unnamed: 0,C1,C2,C3
R1,7,1,8
R2,2,7,7
R3,2,8,1
R4,7,4,1
R5,5,8,0


In [8]:
# Demo: 2 rows x 5 columns; the row/column labels follow the requested shape
create_df(2, 5)

Unnamed: 0,C1,C2,C3,C4,C5
R1,7,2,4,8,6
R2,3,0,1,1,7


# Planetary Dataset Case Study

In [9]:
import seaborn as sns

In [10]:
# Load the "planets" dataset through seaborn and preview its first rows
df = sns.load_dataset("planets")
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [11]:
df.tail()

Unnamed: 0,method,number,orbital_period,mass,distance,year
1030,Transit,1,3.941507,,172.0,2006
1031,Transit,1,2.615864,,148.0,2007
1032,Transit,1,3.191524,,174.0,2007
1033,Transit,1,4.125083,,293.0,2008
1034,Transit,1,4.187757,,260.0,2008


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   method          1035 non-null   object 
 1   number          1035 non-null   int64  
 2   orbital_period  992 non-null    float64
 3   mass            513 non-null    float64
 4   distance        808 non-null    float64
 5   year            1035 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 48.6+ KB


In [13]:
df.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.785507,2002.917596,2.638161,264.069282,2009.070531
std,1.240976,26014.728304,3.818617,733.116493,3.972567
min,1.0,0.090706,0.0036,1.35,1989.0
25%,1.0,5.44254,0.229,32.56,2007.0
50%,1.0,39.9795,1.26,55.25,2010.0
75%,2.0,526.005,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


In [14]:
# Method 1
# Inspect the frame cell by cell and remember the label of every row that
# holds at least one null. any() stops at the first null found in a row.
rows_with_nulls = [
  row for row in df.index
  if any(pd.isnull(df.loc[row, column]) for column in df.columns)
]

# Drop all offending rows in a single call: dropping inside the loop would
# rebuild the frame once per row and mutate it while it is being walked.
df.drop(rows_with_nulls, inplace=True)

In [15]:
df.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 0 to 784
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   method          498 non-null    object 
 1   number          498 non-null    int64  
 2   orbital_period  498 non-null    float64
 3   mass            498 non-null    float64
 4   distance        498 non-null    float64
 5   year            498 non-null    int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 27.2+ KB


In [17]:
df = sns.load_dataset("planets")

# Method 2
# iterrows() yields (index label, row as a Series); look at the first pair only
first_pair = next(df.iterrows(), None)
if first_pair is not None:
  i, r = first_pair
  print(i)
  print(r)

0
method            Radial Velocity
number                          1
orbital_period              269.3
mass                          7.1
distance                     77.4
year                         2006
Name: 0, dtype: object


In [18]:
# Does the first row contain any missing value? Only that row is inspected.
first_pair = next(df.iterrows(), None)
if first_pair is not None:
  i, r = first_pair
  print(r.isnull().any())

False


In [19]:
# Walk the rows once and collect the labels of those with any missing value
rows_with_nulls = [i for i, r in df.iterrows() if pd.isnull(r).any()]

# Drop them in one call instead of mutating df while iterrows() is still
# being consumed (which also rebuilt the frame once per dropped row).
df.drop(rows_with_nulls, inplace=True)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 0 to 784
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   method          498 non-null    object 
 1   number          498 non-null    int64  
 2   orbital_period  498 non-null    float64
 3   mass            498 non-null    float64
 4   distance        498 non-null    float64
 5   year            498 non-null    int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 27.2+ KB


In [21]:
# Method 3
# Reload the data and let pandas discard every row that has a missing value
df = sns.load_dataset("planets").dropna()

In [22]:
df.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


# Question 2: Filter and show only the rows for planets that were found in 2010 or later and whose method is Radial Velocity or Transit.

In [23]:
# Work on a copy so that filtering df_ leaves df itself untouched
df_ = df.copy()

In [24]:
# Query the DataFrame
# Build one boolean mask per condition, then keep the rows satisfying both
found_since_2010 = df_['year'].ge(2010)
rv_or_transit = df_['method'].isin(['Radial Velocity', 'Transit'])
df_ = df_.loc[found_since_2010 & rv_or_transit]

In [25]:
df_['year'].unique()

array([2011, 2010, 2013, 2012, 2014], dtype=int64)

In [26]:
df_.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
2,Radial Velocity,1,763.0,2.6,19.84,2011
9,Radial Velocity,2,452.8,1.99,74.79,2010
10,Radial Velocity,2,883.0,0.86,74.79,2010
28,Radial Velocity,1,181.4,3.2,45.52,2013
45,Radial Velocity,1,380.8,1.8,20.21,2010


In [27]:
df_['method'].unique()

array(['Radial Velocity', 'Transit'], dtype=object)

In [28]:
# Reload the dataset: df had its rows with missing values dropped earlier
df = sns.load_dataset("planets")
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [29]:
df['method'].unique()

array(['Radial Velocity', 'Imaging', 'Eclipse Timing Variations',
       'Transit', 'Astrometry', 'Transit Timing Variations',
       'Orbital Brightness Modulation', 'Microlensing', 'Pulsar Timing',
       'Pulsation Timing Variations'], dtype=object)