In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno #Our hero ^_^


In [5]:
class Cleaner:
    def __init__(self, file_name):
        self.file_name = file_name
        self.data_frame = None
        self.temp_data_frame = self.data_frame
        
    def load_data(self,delimiter=None):
        if not self.file_name:
            return "File Name Required"
        if not delimiter:
            self.data_frame = pd.read_csv(self.file_name)
        else:
            self.data_frame = pd.read_csv(self.file_name, sep=delimiter)

    def describe_data(self):
        unique_values = self.data_frame.nunique()
        total_rows = self.data_frame.shape[0]
        
        data = {
            "Column_Name" : self.data_frame.columns.values,
            "Unique_Values" : [],
            "Data_Types": [],
            "NULL_and_NaN_values_Count": []
        }
        
        for col in data["Column_Name"]:
            data["NULL_and_NaN_values_Count"].append(self.data_frame[col].isna().sum())
            data["Unique_Values"].append(unique_values[col])
            data["Data_Types"].append(self.data_frame.dtypes[col])

        uninque_vals_analysis = pd.DataFrame(data)
        return uninque_vals_analysis

    def clean_data(self,impute = True, strategy= "mean",replace = False,missing_values=np.nan,columns=None,category_columns = None,drop=""):
        
        # drop rows, cols or both containing NaN values
        if drop.lower() == "row":
            self.temp_data_frame.dropna(inplace=True)
        elif drop.lower() == "col":
            self.tmep_data_frame.dropna(axis=1,inplace=True)
        elif drop.lower() == "both":
            self.temp_data_frame.dropna(inplace=True)
            self.tmep_data_frame.dropna(axis=1,inplace=True)
            
        if not columns:
            columns = self.data_frame.columns.values
            
        if len(category_columns) != 0:
            for col in category_columns:
                columns = columns[columns != col]
        self.temp_data_frame = self.data_frame.copy()
        if impute:
            if len(category_columns) != 0:
                self.temp_data_frame.drop(category_columns,axis=1,inplace=True)
            self.temp_data_frame.replace('?',np.NaN,inplace=True)
            imp=SimpleImputer(missing_values=np.NaN)
            idf=pd.DataFrame(imp.fit_transform(self.temp_data_frame))
            idf.columns=self.temp_data_frame.columns
            idf.index=self.temp_data_frame.index
            self.temp_data_frame = idf
            display(self.temp_data_frame)
            
    def visualize(self):
        msno.heatmap(self.data_frame,cmap="YlGnBu")
#         msno.bar(self.data_frame)
            
    def update(self):
        if self.temp_data_frame != self.data_frame:
            self.data_frame = self.temp_data_frame
        else:
            return "Nothing to update"

In [8]:
cl = Cleaner("data-sets/test.csv")
cl.load_data()

In [11]:
cl.describe_data()
# cl.visualize()
cl.clean_data(category_columns=["D"])
# cl.data_frame.dropna(axis=1,inplace=True)
# cl.data_frame.A.unique()[0]

['D']
['A' 'B' 'C']


Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,4.0,11.166667,6.0
2,7.0,11.166667,9.0
3,10.0,11.0,12.0
4,13.0,14.0,15.0
5,16.0,17.0,11.428571
6,19.0,12.0,12.0
7,20.0,11.0,23.0


In [129]:
cl.data_frame.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64