In [1]:
import pandas as pd
import plotly.express as px


In [2]:
class DataLoader:
    """Load a CSV source into a pandas DataFrame, reporting failures on stdout."""

    def __init__(self, raw_dataset):
        # Despite the name, this is the source handed to pd.read_csv
        # (the notebook passes a file path), not already-loaded data.
        self.raw_dataset = raw_dataset

    def read_data(self):
        """
        Parse the CSV source and return it as a DataFrame.

        The file is decoded as ISO-8859-1 and parsed with the Python engine.
        A missing file, an empty file, or a parsing error is reported with a
        printed message instead of an exception; in those cases the method
        returns None, so callers should check the result before using it.
        """
        frame = None
        try:
            frame = pd.read_csv(
                self.raw_dataset,
                encoding="ISO-8859-1",
                engine="python",
            )
        except FileNotFoundError:
            print("Error: The file was not found.")
        except pd.errors.EmptyDataError:
            print("Error: The file is empty.")
        except pd.errors.ParserError:
            print("Error: The file could not be parsed.")
        return frame

In [39]:
class DataCleaner:
    """
    Normalise the text columns 'Location', 'Make' and 'Model' of a DataFrame.

    The DataFrame passed to the constructor is stored by reference, not
    copied, so every cleaning method modifies the caller's frame in place
    and also returns it.
    """

    def __init__(self, raw_dataset):
        self.raw_dataset = raw_dataset

    def _transform_strings(self, column, transform):
        """
        Replace `column` with `transform` applied to each of its str values.

        Non-string values (NaN, None, numbers) are passed through unchanged,
        so one missing value cannot abort the clean-up of the whole column.
        Raises KeyError if `column` is not present in the dataset.
        """
        self.raw_dataset[column] = self.raw_dataset[column].apply(
            lambda value: transform(value) if isinstance(value, str) else value
        )

    def remove_blanks_in_locations(self):
        """
        Strip leading/trailing whitespace from the 'Location' values.

        Prints a success or failure message; on failure (e.g. the column is
        missing) the dataset is left as it was. Returns the dataset.
        """
        try:
            self._transform_strings('Location', str.strip)
            print("stripped white spaces")

        except Exception as e:
            print(f"Failed to strip white space in locations, {e}")

        return self.raw_dataset

    def titlise_locations(self):
        """
        Title-case the 'Location' values.

        Prints a success or failure message; on failure the dataset is left
        as it was. Returns the dataset.
        """
        try:
            self._transform_strings('Location', str.title)
            print("titlised location names")
        except Exception as e:
            print(f"Failed to titlise the location names, {e}")

        return self.raw_dataset

    # the central location-cleaning function
    def clean_location(self):
        """
        Run the full 'Location' clean-up and return the dataset.

        Whitespace is stripped first and the names are title-cased afterwards.
        """
        self.remove_blanks_in_locations()
        self.titlise_locations()
        return self.raw_dataset

    def titlise_makers(self):
        """
        Title-case the 'Make' values, leaving non-string entries untouched.

        Prints a success or failure message; on failure the dataset is left
        as it was. Returns the dataset.
        """
        try:
            # Guarded like the Location methods: an unguarded x.title() would
            # raise on the first NaN and leave the entire column unchanged.
            self._transform_strings('Make', str.title)
            print("Titlised Make column, success")

        except Exception as e:
            print(f"Failed to titlise Make column, {e}")

        return self.raw_dataset

    def titlise_models(self):
        """
        Title-case the 'Model' values, leaving non-string entries untouched.

        Prints a success or failure message; on failure the dataset is left
        as it was. Returns the dataset.
        """
        try:
            self._transform_strings('Model', str.title)
            print("Titlised Model column, success")

        except Exception as e:
            print(f"Failed to titlise Model column, {e}")

        return self.raw_dataset

In [4]:
if __name__ == "__main__":
    # Relative path to the raw CSV. Despite its name, this variable holds a
    # path string, not loaded data; it is what DataLoader receives below.
    raw_dataset = '../dataset/raw/uae_used_cars.csv'

# Instantiate the Loader Class

In [6]:
    # Constructing the loader only stores the path; the file is not read
    # until read_data() is called.
    loader = DataLoader(raw_dataset)

# Read and Load the Dataset

In [8]:
    # read_data() returns the parsed DataFrame, or None (after printing an
    # error message) when the file is missing, empty or unparsable.
    # NOTE(review): `data_loader` is the DataFrame itself, not a loader object.
    data_loader = loader.read_data()
    print(data_loader)

               Make                   Model  Year   Price  Mileage  \
0            toyota                   camry  2016   47819   156500   
1               kia                 sorento  2013   61250   169543   
2              mini                  cooper  2023   31861   221583   
3            nissan                  altima  2016  110322    69754   
4            toyota  land-cruiser-76-series  2020  139994    71399   
...             ...                     ...   ...     ...      ...   
9995          tesla                 model-3  2018  273413    76920   
9996           audi                      a3  2022   80053   258150   
9997         toyota                   prado  2014  183381    80525   
9998        peugeot                  expert  2016   40876   288305   
9999  mercedes-benz                 c-class  2009  150261   283648   

                 Body Type Cylinders            Transmission Fuel Type  Color  \
0                    Sedan         4  Automatic Transmission  Gasoline  Black 

In [9]:
    # Summary statistics (count, mean, std, min, quartiles, max) for the
    # numeric columns of the loaded frame.
    print(data_loader.describe())

               Year         Price        Mileage
count  10000.000000  1.000000e+04   10000.000000
mean    2014.472800  2.452345e+05  155161.871700
std        5.790839  4.709773e+05   83681.858983
min     2005.000000  7.183000e+03   10006.000000
25%     2009.000000  5.035250e+04   82904.000000
50%     2014.000000  1.027660e+05  154370.500000
75%     2019.000000  2.312480e+05  227551.250000
max     2024.000000  1.468698e+07  299996.000000


In [10]:
    # Column dtypes and non-null counts. info() writes its report to stdout
    # itself and returns None, so it is called bare: wrapping it in print()
    # appends a stray "None" line after the report.
    data_loader.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Make          10000 non-null  object
 1   Model         10000 non-null  object
 2   Year          10000 non-null  int64 
 3   Price         10000 non-null  int64 
 4   Mileage       10000 non-null  int64 
 5   Body Type     10000 non-null  object
 6   Cylinders     9895 non-null   object
 7   Transmission  10000 non-null  object
 8   Fuel Type     10000 non-null  object
 9   Color         10000 non-null  object
 10  Location      10000 non-null  object
 11  Description   10000 non-null  object
dtypes: int64(3), object(9)
memory usage: 937.6+ KB
None


In [11]:
    # Column labels of the loaded frame, to see which fields need cleaning.
    print(data_loader.columns)

Index(['Make', 'Model', 'Year', 'Price', 'Mileage', 'Body Type', 'Cylinders',
       'Transmission', 'Fuel Type', 'Color', 'Location', 'Description'],
      dtype='object')


# Locations

- Some city names contain leading white space, which should be stripped.
- As a result the same location appears more than once (e.g. ' Dubai' and 'Dubai'); these duplicate locations must be dropped, along with duplicate rows.

In [13]:
    # Distinct raw Location values; variants that differ only by whitespace
    # show up here as separate entries.
    print(data_loader['Location'].unique())

[' Dubai' ' Abu Dhabi' 'Abu Dhabi' ' Sharjah' 'Dubai' ' Ajman' 'Ajman'
 ' Al Ain' 'Al Ain' ' Fujeirah' 'Umm Al Qawain' ' Umm Al Qawain' 'Sharjah'
 'Ras Al Khaimah' ' Ras Al Khaimah' 'Fujeirah']


In [14]:
    # Confirm read_data() produced a DataFrame (it returns None on failure)
    # before handing the result to the cleaner.
    print(type(data_loader))

<class 'pandas.core.frame.DataFrame'>


# Passing the loaded dataset into the Cleaner Class

In [16]:
    # The cleaner keeps a reference to this DataFrame rather than a copy, so
    # every cleaning call below also changes `data_loader` in place.
    cleaner = DataCleaner(data_loader)

In [17]:
    # Strip whitespace from the Location values, then title-case them. The
    # call returns the cleaned DataFrame, which the notebook displays.
    cleaner.clean_location()

stripped white spaces
titlised location names


Unnamed: 0,Make,Model,Year,Price,Mileage,Body Type,Cylinders,Transmission,Fuel Type,Color,Location,Description
0,toyota,camry,2016,47819,156500,Sedan,4,Automatic Transmission,Gasoline,Black,Dubai,"2016 toyota camry with Rear camera, Leather se..."
1,kia,sorento,2013,61250,169543,SUV,4,Automatic Transmission,Gasoline,Grey,Abu Dhabi,"2013 kia sorento with Sunroof, Adaptive cruise..."
2,mini,cooper,2023,31861,221583,Soft Top Convertible,4,Automatic Transmission,Gasoline,Grey,Dubai,"2023 mini cooper with Adaptive cruise control,..."
3,nissan,altima,2016,110322,69754,Sedan,4,Automatic Transmission,Gasoline,Red,Dubai,"2016 nissan altima with Rear camera, Adaptive ..."
4,toyota,land-cruiser-76-series,2020,139994,71399,Pick Up Truck,4,Manual Transmission,Gasoline,White,Dubai,2020 toyota land-cruiser-76-series with Adapti...
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,tesla,model-3,2018,273413,76920,Sedan,,Automatic Transmission,Electric,White,Dubai,"2018 tesla model-3 with Bluetooth, Sunroof, Le..."
9996,audi,a3,2022,80053,258150,Sedan,4,Automatic Transmission,Gasoline,Red,Dubai,"2022 audi a3 with Sunroof, Bluetooth, Rear cam..."
9997,toyota,prado,2014,183381,80525,SUV,6,Automatic Transmission,Gasoline,White,Dubai,"2014 toyota prado with Rear camera, Adaptive c..."
9998,peugeot,expert,2016,40876,288305,Utility Truck,4,Automatic Transmission,Diesel,White,Dubai,"2016 peugeot expert with Navigation system, Re..."


In [18]:
    # Title-case the Make column (e.g. 'toyota' -> 'Toyota'); the returned
    # DataFrame is displayed as the cell output.
    cleaner.titlise_makers()

Titlised Make column, success


Unnamed: 0,Make,Model,Year,Price,Mileage,Body Type,Cylinders,Transmission,Fuel Type,Color,Location,Description
0,Toyota,camry,2016,47819,156500,Sedan,4,Automatic Transmission,Gasoline,Black,Dubai,"2016 toyota camry with Rear camera, Leather se..."
1,Kia,sorento,2013,61250,169543,SUV,4,Automatic Transmission,Gasoline,Grey,Abu Dhabi,"2013 kia sorento with Sunroof, Adaptive cruise..."
2,Mini,cooper,2023,31861,221583,Soft Top Convertible,4,Automatic Transmission,Gasoline,Grey,Dubai,"2023 mini cooper with Adaptive cruise control,..."
3,Nissan,altima,2016,110322,69754,Sedan,4,Automatic Transmission,Gasoline,Red,Dubai,"2016 nissan altima with Rear camera, Adaptive ..."
4,Toyota,land-cruiser-76-series,2020,139994,71399,Pick Up Truck,4,Manual Transmission,Gasoline,White,Dubai,2020 toyota land-cruiser-76-series with Adapti...
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Tesla,model-3,2018,273413,76920,Sedan,,Automatic Transmission,Electric,White,Dubai,"2018 tesla model-3 with Bluetooth, Sunroof, Le..."
9996,Audi,a3,2022,80053,258150,Sedan,4,Automatic Transmission,Gasoline,Red,Dubai,"2022 audi a3 with Sunroof, Bluetooth, Rear cam..."
9997,Toyota,prado,2014,183381,80525,SUV,6,Automatic Transmission,Gasoline,White,Dubai,"2014 toyota prado with Rear camera, Adaptive c..."
9998,Peugeot,expert,2016,40876,288305,Utility Truck,4,Automatic Transmission,Diesel,White,Dubai,"2016 peugeot expert with Navigation system, Re..."


In [37]:
    # Check the result of titlise_makers(): `data_loader` shows the
    # title-cased values because the cleaner edits this same DataFrame.
    print(data_loader['Make'].unique())

['Toyota' 'Kia' 'Mini' 'Nissan' 'Chevrolet' 'Cadillac' 'Mercedes-Benz'
 'Infiniti' 'Mazda' 'Jeep' 'Ferrari' 'Bmw' 'Porsche' 'Bentley'
 'Land-Rover' 'Honda' 'Dodge' 'Rolls-Royce' 'Ford' 'Hyundai' 'Lamborghini'
 'Mitsubishi' 'Aston-Martin' 'Gmc' 'Renault' 'Volkswagen' 'Lexus' 'Suzuki'
 'Lincoln' 'Audi' 'Maybach' 'Peugeot' 'Jaguar' 'Citroen' 'Maserati'
 'Tesla' 'Volvo' 'Lotus' 'Mclaren' 'Alfa-Romeo' 'Fiat' 'Chrysler' 'Opel'
 'Mercedes-Maybach' 'Geely' 'Acura' 'Subaru' 'Genesis' 'Isuzu'
 'Westfield-Sportscars' 'Mg' 'Hummer' 'Skoda' 'Mercury' 'Rover' 'Changan'
 'Other-Make' 'Daihatsu' 'Jetour' 'Saab' 'Gac' 'Haval' 'Baic' 'Smart'
 'Morgan']


In [41]:
        # NOTE(review): the AttributeError recorded for this cell is a stale-kernel
        # artefact. `cleaner` was created in In [16], before the DataCleaner cell
        # was re-run as In [39] with titlise_models defined, so the existing
        # instance still belongs to the old class. Re-run the instantiation cell
        # (or Restart Kernel -> Run All) before calling this.
        cleaner.titlise_models()

AttributeError: 'DataCleaner' object has no attribute 'titlise_models'