# Missing Values

In [1]:
import numpy as np 
import pandas as pd

# Build a small demo frame: columns A and B contain NaN, column C is complete
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan],
        'B': [6, np.nan, np.nan],
        'C': [1, 2, 3],
    }
)
df

Unnamed: 0,A,B,C
0,1.0,6.0,1
1,2.0,,2
2,,,3


In [2]:
# Count the missing values in each column:
# isna() flags every NaN cell, sum() then totals the flags per column
df.isna().sum()

A    1
B    2
C    0
dtype: int64

In [3]:
# Remove rows with NaN values: only rows with no NaN in any column survive.
# The result is a new frame; df itself is not modified.
df.dropna()

Unnamed: 0,A,B,C
0,1.0,6.0,1


In [4]:
# Original dataframe remains unchanged: the dropna() call above did not
# alter df, which still holds the rows containing NaN
df

Unnamed: 0,A,B,C
0,1.0,6.0,1
1,2.0,,2
2,,,3


In [5]:
# Drop every column that contains at least one NaN (only C is complete)
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [6]:
# Replace every NaN with one fixed value.
# The string used here is only a placeholder for whatever statistic
# (mean, median, mode, ...) suits the data.
df.fillna('Mean or median or mode of data')

Unnamed: 0,A,B,C
0,1.0,6.0,1
1,2.0,Mean or median or mode of data,2
2,Mean or median or mode of data,Mean or median or mode of data,3


In [7]:
# Replace the NaN entries of column A with that column's mean
# (mean() skips NaN, so the fill value is (1 + 2) / 2 = 1.5)
df['A'].fillna(df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

# Groupby

In [8]:
import pandas as pd

# Sales records: two people for each of the three companies
data = {
    'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    'Sales': [200, 120, 340, 124, 243, 350],
}
df = pd.DataFrame(data)

In [9]:
# Grouping alone computes nothing yet: the call returns a GroupBy object
# that waits for an aggregation to be applied
df.groupby(by='Company')


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x140f11150>

In [10]:
# Keep the GroupBy object in a variable so several aggregations can reuse it
by_comp = df.groupby(by='Company')

In [11]:
# Minimum values in each group.
# Each column is reduced independently (strings compare alphabetically), so a
# result row need not describe one person: for MSFT it pairs Amy with 124,
# which is Vanessa's sales figure.
by_comp.min()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,Carl,243
GOOG,Charlie,120
MSFT,Amy,124


In [12]:
# Maximum values in each group.
# As with min(), every column is reduced on its own: for MSFT the result pairs
# Vanessa (alphabetically last) with 340, which is Amy's sales figure.
by_comp.max()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,Sarah,350
GOOG,Sam,200
MSFT,Vanessa,340


In [13]:
# Count of values in each group, reported per column
# (every company has two entries in both Person and Sales)
by_comp.count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,2
GOOG,2,2
MSFT,2,2


In [14]:
# Detailed statistics of each group: count, mean, std, min, quartiles and max
# of Sales per company (the text column Person is not part of the result)
by_comp.describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0
GOOG,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
MSFT,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0


# Combining DataFrames

In [15]:
import pandas as pd

# Three example frames with the same columns (A-D) and consecutive,
# non-overlapping row labels. Every cell holds its column letter followed by
# its row label, e.g. 'B5' sits in column B at row 5.
df1 = pd.DataFrame(
    {col: [f'{col}{row}' for row in range(0, 4)] for col in 'ABCD'},
    index=list(range(0, 4)),
)

df2 = pd.DataFrame(
    {col: [f'{col}{row}' for row in range(4, 8)] for col in 'ABCD'},
    index=list(range(4, 8)),
)

df3 = pd.DataFrame(
    {col: [f'{col}{row}' for row in range(8, 12)] for col in 'ABCD'},
    index=list(range(8, 12)),
)

In [16]:
# Stack the three frames on top of each other (row-wise);
# the columns match, so the rows are simply appended in order
pd.concat([df1, df2, df3], axis=0)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [17]:
# Concatenate horizontally (column-wise).
# Rows are aligned on the index; the three frames share no row labels, so each
# row is filled from one frame only and is NaN in the other frames' columns.
pd.concat([df1, df2, df3], axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,,,,,,,,
1,A1,B1,C1,D1,,,,,,,,
2,A2,B2,C2,D2,,,,,,,,
3,A3,B3,C3,D3,,,,,,,,
4,,,,,A4,B4,C4,D4,,,,
5,,,,,A5,B5,C5,D5,,,,
6,,,,,A6,B6,C6,D6,,,,
7,,,,,A7,B7,C7,D7,,,,
8,,,,,,,,,A8,B8,C8,D8
9,,,,,,,,,A9,B9,C9,D9


In [18]:
# Concatenate horizontally (column-wise)
# NOTE(review): this cell repeats the previous cell verbatim (same call, same
# output) - consider removing it or replacing it with a different example.
pd.concat([df1, df2, df3], axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,,,,,,,,
1,A1,B1,C1,D1,,,,,,,,
2,A2,B2,C2,D2,,,,,,,,
3,A3,B3,C3,D3,,,,,,,,
4,,,,,A4,B4,C4,D4,,,,
5,,,,,A5,B5,C5,D5,,,,
6,,,,,A6,B6,C6,D6,,,,
7,,,,,A7,B7,C7,D7,,,,
8,,,,,,,,,A8,B8,C8,D8
9,,,,,,,,,A9,B9,C9,D9


# Pandas Operations

In [19]:
import pandas as pd

# Example frame: two integer columns and one string column
df = pd.DataFrame(
    {
        'Col1': [1, 2, 3, 4],
        'Col2': [44, 55, 66, 55],
        'Col3': ['a', 'b', 'c', 'd'],
    }
)

In [20]:
# View first rows (head() shows up to 5 by default; this frame has only 4,
# so the whole frame is displayed)
df.head()

Unnamed: 0,Col1,Col2,Col3
0,1,44,a
1,2,55,b
2,3,66,c
3,4,55,d


In [21]:
# View last rows (tail() shows up to 5 by default; with only 4 rows the
# output is identical to head())
df.tail()

Unnamed: 0,Col1,Col2,Col3
0,1,44,a
1,2,55,b
2,3,66,c
3,4,55,d


In [22]:
# Get unique values in a column (returned as an array; the repeated 55
# appears only once)
df['Col2'].unique()

array([44, 55, 66])

In [23]:
# Count unique values: the number of distinct entries in Col2 (44, 55, 66)
df['Col2'].nunique()

3

In [24]:
# Count frequency of each value: how many times each distinct Col2 entry
# occurs, with the most frequent value listed first
df['Col2'].value_counts()

Col2
55    2
44    1
66    1
Name: count, dtype: int64

In [25]:
# Helper that will be applied to every element of a column
def square(x):
    """Return ``x`` multiplied by itself."""
    return x * x


In [26]:
# Apply function to column: replace every Col1 value with its square.
# NOTE: the result is written back into Col1, so re-running this cell squares
# the already-squared values again.
df['Col1'] = df['Col1'].apply(square)

In [27]:
# Sum a column (Col1 was squared above, so this is 1 + 4 + 9 + 16)
df['Col1'].sum()

30

In [28]:
# Remove a column.
# drop(..., errors='ignore') is used instead of `del df['Col3']` so that the
# cell can be re-run safely: `del` raises KeyError once the column is gone,
# whereas this form leaves df as-is when Col3 no longer exists.
df = df.drop(columns=['Col3'], errors='ignore')

In [29]:
# Get column names (Col3 was removed above, so only Col1 and Col2 remain)
df.columns

Index(['Col1', 'Col2'], dtype='object')

In [30]:
# Get row indices (the row labels of the frame; here the default 0..3 range)
df.index

RangeIndex(start=0, stop=4, step=1)