In [1]:
# I'm Using "Cost of Living Index by Country" DataSet
# About DataSet: The Cost of Living Index tells you how expensive it is to live in one place compared to another.
# Example:
# Imagine we use New York City as the "standard" city. We give New York City a score of 100.
# Comparing Cities:
# If another city, say Tokyo, has a score of 120, it means living in Tokyo is 20% more expensive than living in New York City.
# If a city like Cairo has a score of 80, it means living in Cairo is 20% cheaper than living in New York City.

In [2]:
# Import Dependencies OR Import Some necessary Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [3]:
# Load Dataset OR Read CSV File
df = pd.read_csv(r"DataSets\Cost_of_Living_Index_by_Country.csv")
df.head()

Unnamed: 0,Rank,Country,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index
0,1,Switzerland,101.1,46.5,74.9,109.1,97.0,158.7
1,2,Bahamas,85.0,36.7,61.8,81.6,83.3,54.6
2,3,Iceland,83.0,39.2,62.0,88.4,86.8,120.3
3,4,Singapore,76.7,67.2,72.1,74.6,50.4,111.1
4,5,Barbados,76.6,19.0,48.9,80.8,69.4,43.5


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Rank                            121 non-null    int64  
 1   Country                         121 non-null    object 
 2   Cost of Living Index            121 non-null    float64
 3   Rent Index                      121 non-null    float64
 4   Cost of Living Plus Rent Index  121 non-null    float64
 5   Groceries Index                 121 non-null    float64
 6   Restaurant Price Index          121 non-null    float64
 7   Local Purchasing Power Index    121 non-null    float64
dtypes: float64(6), int64(1), object(1)
memory usage: 7.7+ KB


In [5]:
df.isnull().sum()

Rank                              0
Country                           0
Cost of Living Index              0
Rent Index                        0
Cost of Living Plus Rent Index    0
Groceries Index                   0
Restaurant Price Index            0
Local Purchasing Power Index      0
dtype: int64

In [6]:
# In This Data Set, There is no Null/Missing value Present

In [7]:
df = pd.read_csv(r"DataSets\Imdb.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,names,relese_date,run_time,rating_tag,rating
0,0,1 Breaking Bad,2008–2013,62 eps,TV-MA,9.5
1,1,2 Planet Earth II,2016,6 eps,TV-G,9.5
2,2,3 Planet Earth,2006,11 eps,TV-PG,9.4
3,3,4 Band of Brothers,2001,10 eps,TV-MA,9.4
4,4,5 Chernobyl,2019,5 eps,TV-MA,9.3


In [8]:
df.isnull().sum()

Unnamed: 0     0
names          0
relese_date    0
run_time       0
rating_tag     0
rating         0
dtype: int64

In [9]:
df = pd.read_csv(r"Datasets\LoanData_Raw.csv")
df.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41.0,3.0,17,12,176.0,9.3,11.359392,5.008608,1
1,27.0,1.0,10,6,31.0,17.3,1.362202,4.000798,0
2,40.0,1.0,15,7,,5.5,0.856075,2.168925,0
3,41.0,,15,14,120.0,2.9,2.65872,0.82128,0
4,24.0,2.0,2,0,28.0,17.3,1.787436,3.056564,1


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       681 non-null    float64
 1   ed        680 non-null    float64
 2   employ    700 non-null    int64  
 3   address   700 non-null    int64  
 4   income    663 non-null    float64
 5   debtinc   700 non-null    float64
 6   creddebt  700 non-null    float64
 7   othdebt   700 non-null    float64
 8   default   700 non-null    object 
dtypes: float64(6), int64(2), object(1)
memory usage: 49.3+ KB


In [11]:
df.isnull().sum()

age         19
ed          20
employ       0
address      0
income      37
debtinc      0
creddebt     0
othdebt      0
default      0
dtype: int64

In [12]:
ageMean = df["age"].mean()
print(ageMean)

34.89867841409691


In [13]:
df["age"].fillna(ageMean, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(ageMean, inplace = True)


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       700 non-null    float64
 1   ed        680 non-null    float64
 2   employ    700 non-null    int64  
 3   address   700 non-null    int64  
 4   income    663 non-null    float64
 5   debtinc   700 non-null    float64
 6   creddebt  700 non-null    float64
 7   othdebt   700 non-null    float64
 8   default   700 non-null    object 
dtypes: float64(6), int64(2), object(1)
memory usage: 49.3+ KB


In [15]:
float_cols = df.select_dtypes("float64").columns

In [16]:
float_cols

Index(['age', 'ed', 'income', 'debtinc', 'creddebt', 'othdebt'], dtype='object')

In [17]:
for col in float_cols:
    temp_mean = df[col].mean()
    df[col].fillna(temp_mean, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(temp_mean, inplace = True)


In [18]:
df[float_cols].isnull().sum()

age         0
ed          0
income      0
debtinc     0
creddebt    0
othdebt     0
dtype: int64

In [19]:
df.isnull().sum()

age         0
ed          0
employ      0
address     0
income      0
debtinc     0
creddebt    0
othdebt     0
default     0
dtype: int64

In [22]:
df = pd.read_csv(r"DataSets\LoanData_Raw.csv")
df.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41.0,3.0,17,12,176.0,9.3,11.359392,5.008608,1
1,27.0,1.0,10,6,31.0,17.3,1.362202,4.000798,0
2,40.0,1.0,15,7,,5.5,0.856075,2.168925,0
3,41.0,,15,14,120.0,2.9,2.65872,0.82128,0
4,24.0,2.0,2,0,28.0,17.3,1.787436,3.056564,1


In [23]:
si = SimpleImputer(strategy = "mean")

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       681 non-null    float64
 1   ed        680 non-null    float64
 2   employ    700 non-null    int64  
 3   address   700 non-null    int64  
 4   income    663 non-null    float64
 5   debtinc   700 non-null    float64
 6   creddebt  700 non-null    float64
 7   othdebt   700 non-null    float64
 8   default   700 non-null    object 
dtypes: float64(6), int64(2), object(1)
memory usage: 49.3+ KB


In [25]:
df.isnull().sum()

age         19
ed          20
employ       0
address      0
income      37
debtinc      0
creddebt     0
othdebt      0
default      0
dtype: int64

In [28]:
float_cols = df.select_dtypes("float64").columns

In [29]:
array = si.fit_transform(df[float_cols])

In [31]:
imputed_cols = pd.DataFrame(array, columns=float_cols)

In [32]:
df[float_cols] = imputed_cols

In [33]:
df.isnull().sum()

age         0
ed          0
employ      0
address     0
income      0
debtinc     0
creddebt    0
othdebt     0
default     0
dtype: int64