In [1]:
import pandas as pd
import numpy as np

In [2]:
np.random.seed(seed=2)
np.random.choice([1,2,3], size=7)

array([1, 2, 1, 3, 3, 1, 3])

In [3]:
np.random.seed(seed=1)
df = pd.DataFrame(data={'A':np.random.choice([1,2,3,np.nan], size=3),
                        'B':np.random.choice([1,np.nan], size=3),
                        'C':np.random.choice([10,20], size=3)
                        })
df

Unnamed: 0,A,B,C
0,2.0,1.0,20
1,,,20
2,1.0,,20


In [4]:
# Simplest way to check for missing data levels in the tabular data

df.isnull().sum()

A    1
B    2
C    0
dtype: int64

In [5]:
df_practice = pd.read_csv('/Users/raihannasir/Documents/Pandas/New Materials/UKMacroData.csv')
df_practice

Unnamed: 0,Date,GDP (£ m),CPI,Bank Rate,Gross Fixed Capital Formation (Investments)
0,2000 Q1,401242,1.1,5.875,69114.0
1,2000 Q2,404196,1.0,6.000,73074.0
2,2000 Q3,406795,1.2,6.000,68011.0
3,2000 Q4,409411,1.4,6.000,70115.0
4,2001 Q1,413054,1.3,5.750,70186.0
...,...,...,...,...,...
91,2022 Q4,568034,9.4,3.250,106689.0
92,2023 Q1,569336,9.0,4.125,109290.0
93,2023 Q2,569364,7.7,4.750,108245.0
94,2023 Q3,568655,6.3,5.250,106496.0


In [6]:
df_practice.isnull().sum()

Date                                           0
GDP (£ m)                                      0
CPI                                            0
Bank Rate                                      0
Gross Fixed Capital Formation (Investments)    1
dtype: int64

In [7]:
df

Unnamed: 0,A,B,C
0,2.0,1.0,20
1,,,20
2,1.0,,20


In [8]:
# DataFrame.dropna(*, axis=0, how=<no_default>, thresh=<no_default>, subset=None, inplace=False, ignore_index=False)
# .dropna() - it drops all rows that have missing data
# Notna needs to be used to save DataFrame without NA values. Dropna won't work.

df.dropna()

Unnamed: 0,A,B,C
0,2.0,1.0,20


In [9]:
# To remove column containing NA value using .dropna()

df.dropna(axis = 1)

Unnamed: 0,C
0,20
1,20
2,20


In [10]:
# Using Thresh parameter of .dropna(), row or column can be dropped

df.dropna(thresh=2) # thresh = 2 means, if row or column has 2 NA value will be dropped

Unnamed: 0,A,B,C
0,2.0,1.0,20
2,1.0,,20


In [11]:
# Define in which columns to look for missing values.

df.dropna(subset=['A','B'])

Unnamed: 0,A,B,C
0,2.0,1.0,20


In [12]:
df.dropna(subset=['A'])

Unnamed: 0,A,B,C
0,2.0,1.0,20
2,1.0,,20


In [13]:
# DataFrame.fillna(value=None, *, method=None, axis=None, inplace=False, limit=None, downcast=<no_default>)
# method{‘backfill’, ‘bfill’, ‘ffill’, None}, default None
# axis{0 or ‘index’} for Series, {0 or ‘index’, 1 or ‘columns’} for DataFrame

df

Unnamed: 0,A,B,C
0,2.0,1.0,20
1,,,20
2,1.0,,20


In [14]:
df.fillna(value='Constant')

Unnamed: 0,A,B,C
0,2.0,1.0,20
1,Constant,Constant,20
2,1.0,Constant,20


In [15]:
df.fillna(value='Constant').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       3 non-null      object
 1   B       3 non-null      object
 2   C       3 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes


In [16]:
df.fillna(value=4)

Unnamed: 0,A,B,C
0,2.0,1.0,20
1,4.0,4.0,20
2,1.0,4.0,20


In [17]:
# Inserting mean() value of Column A to NaN

df['A'].fillna(value=df['A'].mean(), inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['A'].fillna(value=df['A'].mean(), inplace=True)


Unnamed: 0,A,B,C
0,2.0,1.0,20
1,1.5,,20
2,1.0,,20


In [18]:
df['A'].mean()

1.5

In [19]:
df.fillna(method='ffill', axis=0)

  df.fillna(method='ffill', axis=0)


Unnamed: 0,A,B,C
0,2.0,1.0,20
1,1.5,1.0,20
2,1.0,1.0,20


In [20]:
df['A'].fillna(method='ffill', axis=0, inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['A'].fillna(method='ffill', axis=0, inplace=True)
  df['A'].fillna(method='ffill', axis=0, inplace=True)


Unnamed: 0,A,B,C
0,2.0,1.0,20
1,1.5,,20
2,1.0,,20


In [21]:
df

Unnamed: 0,A,B,C
0,2.0,1.0,20
1,1.5,,20
2,1.0,,20


In [22]:
df.fillna(method='bfill')

  df.fillna(method='bfill')


Unnamed: 0,A,B,C
0,2.0,1.0,20
1,1.5,,20
2,1.0,,20


In [23]:
df1 = pd.read_csv('/Users/raihannasir/Documents/Pandas/New Materials/insurance.csv')
df1

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [24]:
df.isna().any(axis=0)

A    False
B     True
C    False
dtype: bool

In [155]:
df2 = pd.read_csv('Fraud Detection Dataset.csv')
df2

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Time_of_Transaction,Device_Used,Location,Previous_Fraudulent_Transactions,Account_Age,Number_of_Transactions_Last_24H,Payment_Method,Fraudulent
0,T1,4174,1292.76,ATM Withdrawal,16.0,Tablet,San Francisco,0,119,13,Debit Card,0
1,T2,4507,1554.58,ATM Withdrawal,13.0,Mobile,New York,4,79,3,Credit Card,0
2,T3,1860,2395.02,ATM Withdrawal,,Mobile,,3,115,9,,0
3,T4,2294,100.10,Bill Payment,15.0,Desktop,Chicago,4,3,4,UPI,0
4,T5,2130,1490.50,POS Payment,19.0,Mobile,San Francisco,2,57,7,Credit Card,0
...,...,...,...,...,...,...,...,...,...,...,...,...
50995,T33982,2339,3112.51,Bill Payment,15.0,Mobile,New York,0,7,8,Debit Card,0
50996,T31261,2152,2897.15,Online Purchase,3.0,Mobile,Miami,1,75,11,Net Banking,1
50997,T12293,3345,2204.43,POS Payment,18.0,Mobile,San Francisco,3,73,5,Credit Card,0
50998,T42287,1518,4787.17,POS Payment,19.0,Tablet,New York,2,108,14,Net Banking,0


In [157]:
df2['Location'].dtype

dtype('O')

In [159]:
def fillup(cell):
    if cell == 'NaN':
        return 'Unknown'
    else:
        return cell

df2['Location'] = df2['Location'].apply(fillup)
df2

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Time_of_Transaction,Device_Used,Location,Previous_Fraudulent_Transactions,Account_Age,Number_of_Transactions_Last_24H,Payment_Method,Fraudulent
0,T1,4174,1292.76,ATM Withdrawal,16.0,Tablet,San Francisco,0,119,13,Debit Card,0
1,T2,4507,1554.58,ATM Withdrawal,13.0,Mobile,New York,4,79,3,Credit Card,0
2,T3,1860,2395.02,ATM Withdrawal,,Mobile,,3,115,9,,0
3,T4,2294,100.10,Bill Payment,15.0,Desktop,Chicago,4,3,4,UPI,0
4,T5,2130,1490.50,POS Payment,19.0,Mobile,San Francisco,2,57,7,Credit Card,0
...,...,...,...,...,...,...,...,...,...,...,...,...
50995,T33982,2339,3112.51,Bill Payment,15.0,Mobile,New York,0,7,8,Debit Card,0
50996,T31261,2152,2897.15,Online Purchase,3.0,Mobile,Miami,1,75,11,Net Banking,1
50997,T12293,3345,2204.43,POS Payment,18.0,Mobile,San Francisco,3,73,5,Credit Card,0
50998,T42287,1518,4787.17,POS Payment,19.0,Tablet,New York,2,108,14,Net Banking,0


In [163]:
def fillup(cell):
    if cell is np.nan:
        return 'Unknown'
    else:
        return cell

df2['Location'] = df2['Location'].apply(fillup)
df2

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Time_of_Transaction,Device_Used,Location,Previous_Fraudulent_Transactions,Account_Age,Number_of_Transactions_Last_24H,Payment_Method,Fraudulent
0,T1,4174,1292.76,ATM Withdrawal,16.0,Tablet,San Francisco,0,119,13,Debit Card,0
1,T2,4507,1554.58,ATM Withdrawal,13.0,Mobile,New York,4,79,3,Credit Card,0
2,T3,1860,2395.02,ATM Withdrawal,,Mobile,Unknown,3,115,9,,0
3,T4,2294,100.10,Bill Payment,15.0,Desktop,Chicago,4,3,4,UPI,0
4,T5,2130,1490.50,POS Payment,19.0,Mobile,San Francisco,2,57,7,Credit Card,0
...,...,...,...,...,...,...,...,...,...,...,...,...
50995,T33982,2339,3112.51,Bill Payment,15.0,Mobile,New York,0,7,8,Debit Card,0
50996,T31261,2152,2897.15,Online Purchase,3.0,Mobile,Miami,1,75,11,Net Banking,1
50997,T12293,3345,2204.43,POS Payment,18.0,Mobile,San Francisco,3,73,5,Credit Card,0
50998,T42287,1518,4787.17,POS Payment,19.0,Tablet,New York,2,108,14,Net Banking,0
