# importing library

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Class define

The class contain 9 function :

    1- __init__ :- for intialize the class 
    
    2- read_file :- return a dataframe contain the data from the file (after extract the file from the path)
    
    3- df_info :- show information about columns from data
    
    4- df_describe :- show some statistics information about data
    
    5- df_shape :- return the size of data (rows,columns)
    
    6- df_show :- display some rows from dataframe, by default 5 rows 
    
    7- data_summary :- show some of preiveous methods and more 
    
    8- data_handel :- this method handle the missing value problem in 2 choice (fill,drop)
    
    9- CategToNumeric :- an preprocessing method to convert categorical values into numeric one 

In [13]:
class PrepKit:

    def __init__(self,data_path):
        """
        Initialize the PrepKit with given path

        Parameters :
          - path (str) : the path of dataset
        Attributes :
          - data_path (str) : path of dataset
          - df (DataFrame) : dataframe 
        """
        self.path=data_path
        self.df=pd.DataFrame()
        
    def read_file(self):
        """
        Reads files after deteced the type of file (csv,json,excel files) 
        No Parameter
        Returns :
          - df : dataframe after reading datset
        """
        
        valid=["csv", "json", "xls", "xlsx", "xlsm", "xlsb", "xltx", "xltm"]
        if self.path.split('.')[-1].lower() not in valid:
            raise ValueError("Unsupported file format. Please provide a valid format (csv, excel, json) \n")
        else:
            try:
                if self.path.split('.')[-1].lower()=='csv':
                    self.df=pd.read_csv(self.path)
                elif self.path.split('.')[-1].lower()=='json':
                    self.df=pd.read_json(self.path)
                else:
                    self.df=pd.read_excel(self.path)
            except Exception as e:
                raise ValueError(f"Error reading file {str(e)}\n")

        return self.df
    
    def df_info(self):
        print(self.df.info())
    
    def df_describe(self):
        print(self.df.describe())
    
    def df_shape(self):
        print(f'rows : {self.df.shape[0]}, columns : {self.df.shape[1]}')
        return self.df.shape
    
    def df_show(self,rows=5):
        print(self.df.head(rows))
    
    def data_summary(self):
        '''
        Contain some information about the data
        '''
        
        print("Data Summary :")
        print()
        
        print('Data Info :')
        print(self.df.info())
        print()
        
        print('Data Describtion :')
        print(self.df.describe())
        print()
        
        print("Number of Rows:", self.df.shape[0])
        print()
        
        print("Number of Columns:", self.df.shape[1])
        print()
        
        print("Average Values:")
        numeric_df=self.df.select_dtypes(include=np.number)
        print(numeric_df.mean())
        print()
        
        print("Most Frequent Values:")
        print(self.df.mode().iloc[0])
        print()
        
        print("Standard Deviation:")
        print(numeric_df.std())
    
    def data_handel(self):
        
        '''
        Handle missing values in the DataFrame by fill missing data with previous value
        '''
        
        print('The Missing values in data :')
        print(self.df.isnull().sum())
        print('sum of all missing values :',self.df.isnull().sum().sum())
        
        choice=input('write fill if you want to fill missing values, other wise write drop : ')
        if choice=='fill':
            self.df=self.df.fillna(method='pad') # fill missing data with previous value
            print('The Missing values in data :')
            print(self.df.isnull().sum())
        elif choice=='drop':
            self.df=self.df.dropna()
            print('The Missing values in data :')
            print(self.df.isnull().sum())
        elif choice=='skip':
            pass
        else :
            raise ValueError('Wrong choice, Try again and  write fill or drop')
    
    def CategToNumeric(self):
        print('CategToNumeric')
        print()
        
        CategoryColumns=self.df.select_dtypes(include=['category','object']).columns
        CategoryIndices=[self.df.columns.get_loc(col) for col in CategoryColumns]

        print('to avoid error, we must fill or drop missing values\n    write skip if you already handle the missing value')
        self.data_handel()
        
        self.categ_df=self.df.iloc[:,CategoryIndices]
        self.X=self.categ_df.values
        for name,idx in zip(CategoryColumns,CategoryIndices):
            ct=ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[idx])],remainder='passthrough')
            self.X=np.array(ct.fit_transform(self.X))
            self.df.loc[:,name]=self.X[:,idx]
        print(self.df)

# MainCode 

In [20]:
valid=["csv", "json", "xls", "xlsx", "xlsm", "xlsb", "xltx", "xltm"]

while True:
    path=input('enter file path')
    if path.split('.')[-1].lower() not in valid:
        raise ValueError('Not Supported type')
    else:
        break

enter file path Bengaluru_House_Data.csv


In [21]:
pk=PrepKit(path)

In [22]:
pk.read_file()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [23]:
pk.data_summary()

Data Summary :

Data Info :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB
None

Data Describtion :
               bath       balcony         price
count  13247.000000  12711.000000  13320.000000
mean       2.692610      1.584376    112.565627
std        1.341458      0.817263    148.971674
min        1.000000      0.000000      8.000000
25%        2.000000      1.000000     50.000000
50%        2.000000     

In [24]:
pk.df_shape()

rows : 13320, columns : 9


(13320, 9)

In [25]:
pk.df_info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB
None


In [26]:
pk.df_show()

              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

   society total_sqft  bath  balcony   price  
0  Coomee        1056   2.0      1.0   39.07  
1  Theanmp       2600   5.0      3.0  120.00  
2      NaN       1440   2.0      3.0   62.00  
3  Soiewre       1521   3.0      1.0   95.00  
4      NaN       1200   2.0      1.0   51.00  


In [27]:
pk.CategToNumeric()

CategToNumeric

to avoid error, we must fill or drop missing values
    write skip if you already handle the missing value
The Missing values in data :
area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64
sum of all missing values : 6201


write fill if you want to fill missing values, other wise write drop :  drop


The Missing values in data :
area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64
      area_type availability location size society total_sqft  bath  balcony  \
0           0.0          0.0      1.0  0.0     1.0        0.0   2.0      1.0   
1           0.0          0.0      1.0  0.0     1.0        0.0   5.0      3.0   
3           0.0          0.0      1.0  0.0     1.0        0.0   3.0      1.0   
5           0.0          0.0      1.0  0.0     1.0        0.0   2.0      1.0   
11          0.0          0.0      1.0  0.0     1.0        0.0   5.0      3.0   
...         ...          ...      ...  ...     ...        ...   ...      ...   
13313       0.0          0.0      1.0  0.0     1.0        0.0   2.0      1.0   
13314       0.0          0.0      1.0  0.0     1.0        0.0   3.0      3.0   
13315       1.0          0.0      1.0  1.0     0.0        0.0   4.0      0.0