In [1]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
import seaborn as sns

In [20]:
#Load the dataset into a Pandas DataFrame.
df = pd.read_csv("./datasets/uk-inflation-data-1989-2022/Inflation by Month.csv")



In [55]:
#Convert object data types to float
#df[df["Year"]== " "]
df["Year"] = df["Year"].astype(int)
#df["Year"].fillna(0, inplace=True)
#df["Year"].isna().sum()

In [156]:
#Show df summary information.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416 entries, 0 to 415
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Year       410 non-null    float64
 1   Month      410 non-null    object 
 2   Inflation  411 non-null    object 
dtypes: float64(1), object(2)
memory usage: 9.9+ KB


In [157]:
#Show stats info on numerical columns.
df.describe()

Unnamed: 0,Year
count,410.0
mean,2005.929268
std,9.952571
min,1989.0
25%,1997.0
50%,2006.0
75%,2014.75
max,2023.0


In [158]:
#Display the first few rows of the DataFrame.
df.head()

Unnamed: 0,Year,Month,Inflation
0,1989.0,JAN,5.7
1,,,
2,1989.0,FEB,5.8
3,1989.0,MAR,5.9
4,1989.0,APR,5.6


In [159]:
#Check the data types of each column.
df.dtypes

Year         float64
Month         object
Inflation     object
dtype: object

In [161]:
#Handling Missing Values:
	#Identify missing values in the dataset.
		#Count the number of missing values in each column.

df.isna().sum()


Year         6
Month        6
Inflation    5
dtype: int64

In [162]:
#Show rows with missing values on a specific column name.
df[df["Month"].isna()] #also works on isnull()

Unnamed: 0,Year,Month,Inflation
1,,,
24,1990.0,,9.2
90,1996.0,,2.8
106,1997.0,,2.2
247,,,
261,2010.0,,2.4


In [166]:
#Replace empty values with NaN in all rows
df.fillna("NaN", inplace=True)
df.shape

(416, 3)

In [167]:
#Delete rows 
		#with all columns empty values.
df.dropna(how="all", inplace=True)
df.shape


(416, 3)

In [229]:
#Replace NaN values in any column with the previous value.
df["Inflation"].ffill(inplace=True)
df["Inflation"].isna().sum()


0

In [271]:
#Delete rows 
		#with all columns empty values.
x = df[(df["Inflation"].isna()) & (df["Month"].isna()) & (df["Year"].isna())].index
df.drop(index= x, inplace=True)

In [278]:
#Delete rows 
		#with empty values in any column
df.dropna(how="any", inplace=True)
df.shape

(402, 3)

In [280]:
#Impute missing numerical values.
	#Mean Imputation: Replace missing values with the mean (average) value of the non-missing values in the same column. 
	#This is a simple and often effective method.


mean=df["Inflation"].mean()
df["Inflation"].fillna(mean, inplace=True)
df["Inflation"].isna().sum()

In [301]:
#Median Imputation: Replace missing values with the median value of the non-missing values in the same column. 
				   #This is less sensitive to outliers compared to mean imputation.

#df[df["Inflation"] == " "] = float(0)
#df["Inflation"] = df["Inflation"].astype(float)
#df["Inflation"].dtypes


median = df["Inflation"].median()
df["Inflation"].fillna(median, inplace=True)
df[df["Inflation"].isna()]

Unnamed: 0,Year,Month,Inflation


In [316]:
#Mode Imputation: For discrete or categorical data, 
#you can replace missing values with the mode (most frequent) value in the same column.

#df[df["Inflation"] == " "] = float(0)
#df["Inflation"] = df["Inflation"].astype(float)
#df["Inflation"].dtypes

mode = df["Inflation"].mode()

#convert series to float
mode = float(mode.iloc[0])

df["Inflation"].fillna(mo, inplace=True)
df[df["Inflation"].isna()]




Unnamed: 0,Year,Month,Inflation


In [79]:
#Interpolation: For time-series data, you can use interpolation methods to estimate missing values based on the values 
#before and after the missing point in time.

df["Year"].interpolate(method="linear")
df["Year"].isna().sum()




0

In [113]:
df[df["Inflation"].isna()]

Unnamed: 0,Year,Month,Inflation
1,,,
31,1991.0,JUL,
116,1998.0,AUG,
140,2000.0,AUG,
158,2002.0,FEB,


In [157]:
#Drop columns with more than 15% percentage of missing values.
		#For one specific column

if df["Inflation"].isna().sum() / df.shape[0] > 0.15:
	df.dropna(subset="Inflation", inplace=True)
    


In [170]:
#Drop columns with more than 15% percentage of missing values.
		#For all columns
if df["Inflation"].isna().sum() / df.shape[0] > 0.15:
    df.dropna(inplace=True)


In [190]:
#Handling Duplicates:
	#Detect and remove all duplicated rows for all columns.
for column in df.columns:
    print("Column", column, "has", df.duplicated(subset=column).sum(), "values duplicated.")

df.drop_duplicates(inplace=True)

for column in df.columns:
    df.drop_duplicates(subset=column, inplace=True)
    print("Column", column, "has", df.duplicated(subset=column).sum(), "values duplicated.")

Column Year has 53 values duplicated.
Column Month has 48 values duplicated.
Column Inflation has 28 values duplicated.
Column Year has 0 values duplicated.
Column Month has 0 values duplicated.
Column Inflation has 0 values duplicated.


In [205]:
#Detect and remove all duplicated rows for a group of columns.

columns = ["Year", "Month"]

for column in columns:
    print("Column", column, "has", df.duplicated(subset=column).sum(), "duplicated values.")


for column in columns:
    df.drop_duplicates(subset=column, inplace= True)
    print("Column", column, "has", df.duplicated(subset=column).sum(), "duplicated values.")

Column Year has 74 duplicated values.
Column Month has 69 duplicated values.
Column Year has 0 duplicated values.
Column Month has 0 duplicated values.


In [244]:
#Detect and remove all duplicated rows for a group of columns. Drop all except first occurrence

columns = ["Year", "Month"]

for column in columns:
    print("Column", column, "has", df.duplicated(subset=column).sum(), "duplicated values.")


for column in columns:
    df.drop_duplicates(subset=column, inplace= True, keep="first")
    print("Column", column, "has", df.duplicated(subset=column).sum(), "duplicated values.")

Column Year has 74 duplicated values.
Column Month has 69 duplicated values.
Column Year has 0 duplicated values.
Column Month has 0 duplicated values.


In [262]:
#Data Type Conversion:
	#Convert all numeric columns to string
for column in df.columns:
    df[column] = df[column].astype("object")
    
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 0 to 81
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Year       77 non-null     object
 1   Month      80 non-null     object
 2   Inflation  65 non-null     object
dtypes: object(3)
memory usage: 2.1+ KB


In [265]:
#Convert one integer column to string
df["Year"] = df["Year"].astype(str)

In [270]:
#Convert any values column based on data, if value > 4 convert the value/2
df[df["Inflation"] > 4] = df["Inflation"] /2
df["Inflation"]

Unnamed: 0,Year,Month,Inflation
0,1989.0,JAN,5.7
2,1989.0,FEB,5.8
3,1989.0,MAR,5.9
4,1989.0,APR,5.6
5,1989.0,MAY,5.9
6,1989.0,JUN,5.8
7,1989.0,JUL,5.7
8,1989.0,AUG,5.5
9,1989.0,SEP,5.7
11,1989.0,NOV,5.7


In [285]:
#Convert any values column based on data, if value > 4 convert the value/2
df.loc[df["Inflation"] > 4, "Inflation"] = df["Inflation"] /2


In [288]:
#Renaming Columns:
	#Rename column by name
df.rename(columns = {"Year": "Año"}, inplace=True)
df.columns

Index(['Año', 'Month', 'Inflation'], dtype='object')

In [290]:
#Rename a list of columns
rename = {"Year" : "Añito", "Month" : "Mesito"}
df.rename(columns=rename, inplace=True)
df.columns

Index(['Añito', 'Mesito', 'Inflation'], dtype='object')

In [303]:
df["Month"]

0     JAN
1     NaN
2     FEB
3     MAR
4     APR
     ... 
77    MAY
78    JUN
79    JUL
80    AUG
81    SEP
Name: Month, Length: 82, dtype: object

In [313]:
#Standardizing Text Data:
	#Standardize text data in any columnconverting strings to lowercase 
df["Month"] = df["Month"].str.lower()    
df["Month"]

0     jan
1     NaN
2     feb
3     mar
4     apr
     ... 
77    may
78    jun
79    jul
80    aug
81    sep
Name: Month, Length: 82, dtype: object