In [1]:
import unittest
import pandas as pd
import numpy as np
import os
import sys

#define path
sys.path.append(os.path.abspath(os.path.join('..')))

#import custom modules
from scripts.DataLoading import data_loading 

class TestDataLoading(unittest.TestCase):

    def setUp(self):
        #create a dummy CSV file for testing
        self.dummy_data = {
            'col1': [1, 2, 3, 4, 5],
            'col2': ['A', 'B', '  ', 'D', 'E'],
            'TransactionMonth': ['2023-01-01', '2023-03-01', '2023-02-01', '2023-04-01', '2023-05-01'],
            'RegistrationYear': [2020, 2021, 2022, 2023, 2024],
            'VehicleIntroDate': ['2019-01-01', '2020-01-01', '2021-01-01', '2022-01-01', '2023-01-01'],
            'EmptyCol': [np.nan] * 5,
            'partialEmptyCol': [1, 2, 3, 4, np.nan],
            'stringCol': ['cat', 'dog', 'cat', 'dog', 'cat'],
            'CapitalOutstanding': ['1,2', '3,4', '5,6', '7,8', '9,0'],
            'Mmcode': [101, 102, 103, 104, 105],
            'Cubiccapacity': [1000, 1500, 2000, 2500, 3000],
            'Kilowatts': [50, 75, 100, 125, 150],
            'NumberOfDoors': [2, 4, 4, 2, 4],
            'Cylinders': [4, 4, 6, 4, 6],
            'SumInsured': [10000, 20000, 30000, 40000, 50000],
            'CalculatedPremiumPerTerm': [100, 200, 300, 400, 500],
            'TotalPremium': [500, 1000, 1500, 2000, 2500],
            'TotalClaims': [10, 20, 30, 40, 50]
        }
        self.dummy_df = pd.DataFrame(self.dummy_data)
        self.dummy_file_path = 'test_data.csv'
        self.dummy_df.to_csv(self.dummy_file_path, sep='|', index=False)
        self.output_folder = 'test_output'
        os.makedirs(self.output_folder, exist_ok=True)

    def tearDown(self):
        #clean up the dummy file and output folder after testing
        if os.path.exists(self.dummy_file_path):
            os.remove(self.dummy_file_path)
        if os.path.exists(self.output_folder):
            import shutil
            shutil.rmtree(self.output_folder)

    def test_load_df_successful(self):
        data_loader = data_loading(self.dummy_file_path, self.output_folder)
        loaded_df = data_loader.load_df()
        self.assertIsInstance(loaded_df, pd.DataFrame)
        self.assertFalse(loaded_df.empty)

    def test_load_df_capitalizes_columns(self):
        data_loader = data_loading(self.dummy_file_path, self.output_folder)
        loaded_df = data_loader.load_df()
        self.assertTrue(all(col[0].isupper() for col in loaded_df.columns))

    def test_load_df_replaces_empty_with_nan(self):
        data_loader = data_loading(self.dummy_file_path, self.output_folder)
        #load the data directly to check the replacement before dropping
        raw_df = pd.read_csv(self.dummy_file_path, engine='python', sep='|')
        #apply the replacement logic from the data_loading class
        new_df_after_replace = raw_df.replace('  ', np.nan).infer_objects(copy=False)
        #capitalise columns to match the data_loading output structure for the check
        new_df_after_replace.columns = [col.capitalize() if col[0].islower() else col for col in new_df_after_replace.columns]
        self.assertTrue(new_df_after_replace['Col2'].isnull().iloc[2])

    def test_load_df_datetime_conversion(self):
        data_loader = data_loading(self.dummy_file_path, self.output_folder)
        loaded_df = data_loader.load_df()
        self.assertTrue(pd.api.types.is_datetime64_any_dtype(loaded_df['TransactionMonth']))
        self.assertTrue(pd.api.types.is_datetime64_any_dtype(loaded_df['RegistrationYear']))
        self.assertTrue(pd.api.types.is_datetime64_any_dtype(loaded_df['VehicleIntroDate']))

    def test_load_df_drops_columns_with_high_nan(self):
        data_loader = data_loading(self.dummy_file_path, self.output_folder)
        loaded_df = data_loader.load_df()
        self.assertNotIn('Emptycol', loaded_df.columns)

    def test_load_df_drops_rows_with_low_nan(self):
        #create a dummy file with rows to be dropped
        dummy_data_with_nan_row = {
            'col1': [1, 2, np.nan, 4, 5],
            'col2': ['A', 'B', '  ', 'D', 'E'],
            'TransactionMonth': ['2023-01-01', '2023-03-01', '2023-02-01', '2023-04-01', '2023-05-01'],
            'RegistrationYear': [2020, 2021, 2022, 2023, 2024],
            'VehicleIntroDate': ['2019-01-01', '2020-01-01', '2021-01-01', '2022-01-01', '2023-01-01'],
            'EmptyCol': [np.nan] * 5,
            'PartialEmptyCol': [1, 2, np.nan, 4, np.nan],
            'StringCol': ['cat', 'dog', 'cat', 'dog', 'cat'],
            'CapitalOutstanding': ['1,2', '3,4', '5,6', '7,8', '9,0'],
            'Mmcode': [101, 102, 103, 104, 105],
            'Cubiccapacity': [1000, 1500, 2000, 2500, 3000],
            'Kilowatts': [50, 75, 100, 125, 150],
            'NumberOfDoors': [2, 4, 4, 2, 4],
            'Cylinders': [4, 4, 6, 4, 6],
            'SumInsured': [10000, 20000, 30000, 40000, 50000],
            'CalculatedPremiumPerTerm': [100, 200, 300, 400, 500],
            'TotalPremium': [500, 1000, 1500, 2000, 2500],
            'TotalClaims': [10, 20, 30, 40, 50]
        }
        dummy_df_with_nan_row = pd.DataFrame(dummy_data_with_nan_row)
        dummy_file_path_with_nan = 'test_data_with_nan.csv'
        dummy_df_with_nan_row.to_csv(dummy_file_path_with_nan, sep='|', index=False)

        data_loader = data_loading(dummy_file_path_with_nan, self.output_folder)
        loaded_df = data_loader.load_df()
        #expect two rows to be dropped as per the data and code logic
        self.assertEqual(len(loaded_df), len(dummy_df_with_nan_row) - 2)

        #clean up the extra dummy file
        if os.path.exists(dummy_file_path_with_nan):
            os.remove(dummy_file_path_with_nan)

    def test_load_df_dtype_conversion_float(self):
        data_loader = data_loading(self.dummy_file_path, self.output_folder)
        loaded_df = data_loader.load_df()
        self.assertTrue(pd.api.types.is_float_dtype(loaded_df['CapitalOutstanding']))

    def test_load_df_dtype_conversion_int(self):
        data_loader = data_loading(self.dummy_file_path, self.output_folder)
        loaded_df = data_loader.load_df()
        int_cols = ['Mmcode', 'Cubiccapacity', 'Kilowatts', 'NumberOfDoors', 'Cylinders']
        for col in int_cols:
            self.assertTrue(pd.api.types.is_integer_dtype(loaded_df[col]))

    def test_load_df_sorts_by_transactionmonth(self):
        data_loader = data_loading(self.dummy_file_path, self.output_folder)
        loaded_df = data_loader.load_df()
        sorted_dates = loaded_df['TransactionMonth'].tolist()
        self.assertEqual(sorted_dates, sorted(sorted_dates))

    def test_load_df_saves_processed_file(self):
        data_loader = data_loading(self.dummy_file_path, self.output_folder)
        data_loader.load_df()
        saved_file_path = os.path.join(self.output_folder, 'processed_insurance_data.csv')
        self.assertTrue(os.path.exists(saved_file_path))

#run class
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False, verbosity=2)

test_load_df_capitalizes_columns (__main__.TestDataLoading.test_load_df_capitalizes_columns) ... 

File read successfully!

Coloumns are capitalised.

Empty indexes are filled with "NAN" values.

Coloumns ['EmptyCol'] are dropped.

Indexes with empty values are dropped.

DataFrame is sorted by TransactionMonth column.

DataFrame is preprocessed.

Prrocessed DataFrame Saved to: test_output\processed_insurance_data.csv

DataFrame Head:


Unnamed: 0,Col1,Col2,TransactionMonth,RegistrationYear,VehicleIntroDate,Partialemptycol,Stringcol,CapitalOutstanding,Mmcode,Cubiccapacity,Kilowatts,NumberOfDoors,Cylinders,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
0,1,A,2023-01-01,2020-01-01,2019-01-01,1.0,cat,1.2,101,1000,50,2,4,10000,100,500,10
1,2,B,2023-03-01,2021-01-01,2020-01-01,2.0,dog,3.4,102,1500,75,4,4,20000,200,1000,20
2,4,D,2023-04-01,2023-01-01,2022-01-01,4.0,dog,7.8,104,2500,125,2,4,40000,400,2000,40



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Col1                      3 non-null      int64         
 1   Col2                      3 non-null      object        
 2   TransactionMonth          3 non-null      datetime64[ns]
 3   RegistrationYear          3 non-null      datetime64[ns]
 4   VehicleIntroDate          3 non-null      datetime64[ns]
 5   Partialemptycol           3 non-null      float64       
 6   Stringcol                 3 non-null      object        
 7   CapitalOutstanding        3 non-null      float64       
 8   Mmcode                    3 non-null      int64         
 9   Cubiccapacity             3 non-null      int64         
 10  Kilowatts                 3 non-null      int64         
 11  NumberOfDoors             3 non-null      int64         
 12  Cylinders

(3, 17)


DataFrame Description:


Unnamed: 0,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
count,3.0,3.0,3.0,3.0
mean,23333.333333,233.333333,1166.666667,23.333333
std,15275.252317,152.752523,763.762616,15.275252
min,10000.0,100.0,500.0,10.0
25%,15000.0,150.0,750.0,15.0
50%,20000.0,200.0,1000.0,20.0
75%,30000.0,300.0,1500.0,30.0
max,40000.0,400.0,2000.0,40.0


ok
test_load_df_datetime_conversion (__main__.TestDataLoading.test_load_df_datetime_conversion) ... 

File read successfully!

Coloumns are capitalised.

Empty indexes are filled with "NAN" values.

Coloumns ['EmptyCol'] are dropped.

Indexes with empty values are dropped.

DataFrame is sorted by TransactionMonth column.

DataFrame is preprocessed.

Prrocessed DataFrame Saved to: test_output\processed_insurance_data.csv

DataFrame Head:


Unnamed: 0,Col1,Col2,TransactionMonth,RegistrationYear,VehicleIntroDate,Partialemptycol,Stringcol,CapitalOutstanding,Mmcode,Cubiccapacity,Kilowatts,NumberOfDoors,Cylinders,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
0,1,A,2023-01-01,2020-01-01,2019-01-01,1.0,cat,1.2,101,1000,50,2,4,10000,100,500,10
1,2,B,2023-03-01,2021-01-01,2020-01-01,2.0,dog,3.4,102,1500,75,4,4,20000,200,1000,20
2,4,D,2023-04-01,2023-01-01,2022-01-01,4.0,dog,7.8,104,2500,125,2,4,40000,400,2000,40



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Col1                      3 non-null      int64         
 1   Col2                      3 non-null      object        
 2   TransactionMonth          3 non-null      datetime64[ns]
 3   RegistrationYear          3 non-null      datetime64[ns]
 4   VehicleIntroDate          3 non-null      datetime64[ns]
 5   Partialemptycol           3 non-null      float64       
 6   Stringcol                 3 non-null      object        
 7   CapitalOutstanding        3 non-null      float64       
 8   Mmcode                    3 non-null      int64         
 9   Cubiccapacity             3 non-null      int64         
 10  Kilowatts                 3 non-null      int64         
 11  NumberOfDoors             3 non-null      int64         
 12  Cylinders

(3, 17)


DataFrame Description:


Unnamed: 0,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
count,3.0,3.0,3.0,3.0
mean,23333.333333,233.333333,1166.666667,23.333333
std,15275.252317,152.752523,763.762616,15.275252
min,10000.0,100.0,500.0,10.0
25%,15000.0,150.0,750.0,15.0
50%,20000.0,200.0,1000.0,20.0
75%,30000.0,300.0,1500.0,30.0
max,40000.0,400.0,2000.0,40.0


ok
test_load_df_drops_columns_with_high_nan (__main__.TestDataLoading.test_load_df_drops_columns_with_high_nan) ... 

File read successfully!

Coloumns are capitalised.

Empty indexes are filled with "NAN" values.

Coloumns ['EmptyCol'] are dropped.

Indexes with empty values are dropped.

DataFrame is sorted by TransactionMonth column.

DataFrame is preprocessed.

Prrocessed DataFrame Saved to: test_output\processed_insurance_data.csv

DataFrame Head:


Unnamed: 0,Col1,Col2,TransactionMonth,RegistrationYear,VehicleIntroDate,Partialemptycol,Stringcol,CapitalOutstanding,Mmcode,Cubiccapacity,Kilowatts,NumberOfDoors,Cylinders,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
0,1,A,2023-01-01,2020-01-01,2019-01-01,1.0,cat,1.2,101,1000,50,2,4,10000,100,500,10
1,2,B,2023-03-01,2021-01-01,2020-01-01,2.0,dog,3.4,102,1500,75,4,4,20000,200,1000,20
2,4,D,2023-04-01,2023-01-01,2022-01-01,4.0,dog,7.8,104,2500,125,2,4,40000,400,2000,40



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Col1                      3 non-null      int64         
 1   Col2                      3 non-null      object        
 2   TransactionMonth          3 non-null      datetime64[ns]
 3   RegistrationYear          3 non-null      datetime64[ns]
 4   VehicleIntroDate          3 non-null      datetime64[ns]
 5   Partialemptycol           3 non-null      float64       
 6   Stringcol                 3 non-null      object        
 7   CapitalOutstanding        3 non-null      float64       
 8   Mmcode                    3 non-null      int64         
 9   Cubiccapacity             3 non-null      int64         
 10  Kilowatts                 3 non-null      int64         
 11  NumberOfDoors             3 non-null      int64         
 12  Cylinders

(3, 17)


DataFrame Description:


Unnamed: 0,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
count,3.0,3.0,3.0,3.0
mean,23333.333333,233.333333,1166.666667,23.333333
std,15275.252317,152.752523,763.762616,15.275252
min,10000.0,100.0,500.0,10.0
25%,15000.0,150.0,750.0,15.0
50%,20000.0,200.0,1000.0,20.0
75%,30000.0,300.0,1500.0,30.0
max,40000.0,400.0,2000.0,40.0


ok
test_load_df_drops_rows_with_low_nan (__main__.TestDataLoading.test_load_df_drops_rows_with_low_nan) ... 

File read successfully!

Coloumns are capitalised.

Empty indexes are filled with "NAN" values.

Coloumns ['EmptyCol'] are dropped.

Indexes with empty values are dropped.

DataFrame is sorted by TransactionMonth column.

DataFrame is preprocessed.

Prrocessed DataFrame Saved to: test_output\processed_insurance_data.csv

DataFrame Head:


Unnamed: 0,Col1,Col2,TransactionMonth,RegistrationYear,VehicleIntroDate,PartialEmptyCol,StringCol,CapitalOutstanding,Mmcode,Cubiccapacity,Kilowatts,NumberOfDoors,Cylinders,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
0,1.0,A,2023-01-01,2020-01-01,2019-01-01,1.0,cat,1.2,101,1000,50,2,4,10000,100,500,10
1,2.0,B,2023-03-01,2021-01-01,2020-01-01,2.0,dog,3.4,102,1500,75,4,4,20000,200,1000,20
2,4.0,D,2023-04-01,2023-01-01,2022-01-01,4.0,dog,7.8,104,2500,125,2,4,40000,400,2000,40



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Col1                      3 non-null      float64       
 1   Col2                      3 non-null      object        
 2   TransactionMonth          3 non-null      datetime64[ns]
 3   RegistrationYear          3 non-null      datetime64[ns]
 4   VehicleIntroDate          3 non-null      datetime64[ns]
 5   PartialEmptyCol           3 non-null      float64       
 6   StringCol                 3 non-null      object        
 7   CapitalOutstanding        3 non-null      float64       
 8   Mmcode                    3 non-null      int64         
 9   Cubiccapacity             3 non-null      int64         
 10  Kilowatts                 3 non-null      int64         
 11  NumberOfDoors             3 non-null      int64         
 12  Cylinders

(3, 17)


DataFrame Description:


Unnamed: 0,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
count,3.0,3.0,3.0,3.0
mean,23333.333333,233.333333,1166.666667,23.333333
std,15275.252317,152.752523,763.762616,15.275252
min,10000.0,100.0,500.0,10.0
25%,15000.0,150.0,750.0,15.0
50%,20000.0,200.0,1000.0,20.0
75%,30000.0,300.0,1500.0,30.0
max,40000.0,400.0,2000.0,40.0


ok
test_load_df_dtype_conversion_float (__main__.TestDataLoading.test_load_df_dtype_conversion_float) ... 

File read successfully!

Coloumns are capitalised.

Empty indexes are filled with "NAN" values.

Coloumns ['EmptyCol'] are dropped.

Indexes with empty values are dropped.

DataFrame is sorted by TransactionMonth column.

DataFrame is preprocessed.

Prrocessed DataFrame Saved to: test_output\processed_insurance_data.csv

DataFrame Head:


Unnamed: 0,Col1,Col2,TransactionMonth,RegistrationYear,VehicleIntroDate,Partialemptycol,Stringcol,CapitalOutstanding,Mmcode,Cubiccapacity,Kilowatts,NumberOfDoors,Cylinders,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
0,1,A,2023-01-01,2020-01-01,2019-01-01,1.0,cat,1.2,101,1000,50,2,4,10000,100,500,10
1,2,B,2023-03-01,2021-01-01,2020-01-01,2.0,dog,3.4,102,1500,75,4,4,20000,200,1000,20
2,4,D,2023-04-01,2023-01-01,2022-01-01,4.0,dog,7.8,104,2500,125,2,4,40000,400,2000,40



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Col1                      3 non-null      int64         
 1   Col2                      3 non-null      object        
 2   TransactionMonth          3 non-null      datetime64[ns]
 3   RegistrationYear          3 non-null      datetime64[ns]
 4   VehicleIntroDate          3 non-null      datetime64[ns]
 5   Partialemptycol           3 non-null      float64       
 6   Stringcol                 3 non-null      object        
 7   CapitalOutstanding        3 non-null      float64       
 8   Mmcode                    3 non-null      int64         
 9   Cubiccapacity             3 non-null      int64         
 10  Kilowatts                 3 non-null      int64         
 11  NumberOfDoors             3 non-null      int64         
 12  Cylinders

(3, 17)


DataFrame Description:


Unnamed: 0,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
count,3.0,3.0,3.0,3.0
mean,23333.333333,233.333333,1166.666667,23.333333
std,15275.252317,152.752523,763.762616,15.275252
min,10000.0,100.0,500.0,10.0
25%,15000.0,150.0,750.0,15.0
50%,20000.0,200.0,1000.0,20.0
75%,30000.0,300.0,1500.0,30.0
max,40000.0,400.0,2000.0,40.0


ok
test_load_df_dtype_conversion_int (__main__.TestDataLoading.test_load_df_dtype_conversion_int) ... 

File read successfully!

Coloumns are capitalised.

Empty indexes are filled with "NAN" values.

Coloumns ['EmptyCol'] are dropped.

Indexes with empty values are dropped.

DataFrame is sorted by TransactionMonth column.

DataFrame is preprocessed.

Prrocessed DataFrame Saved to: test_output\processed_insurance_data.csv

DataFrame Head:


Unnamed: 0,Col1,Col2,TransactionMonth,RegistrationYear,VehicleIntroDate,Partialemptycol,Stringcol,CapitalOutstanding,Mmcode,Cubiccapacity,Kilowatts,NumberOfDoors,Cylinders,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
0,1,A,2023-01-01,2020-01-01,2019-01-01,1.0,cat,1.2,101,1000,50,2,4,10000,100,500,10
1,2,B,2023-03-01,2021-01-01,2020-01-01,2.0,dog,3.4,102,1500,75,4,4,20000,200,1000,20
2,4,D,2023-04-01,2023-01-01,2022-01-01,4.0,dog,7.8,104,2500,125,2,4,40000,400,2000,40



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Col1                      3 non-null      int64         
 1   Col2                      3 non-null      object        
 2   TransactionMonth          3 non-null      datetime64[ns]
 3   RegistrationYear          3 non-null      datetime64[ns]
 4   VehicleIntroDate          3 non-null      datetime64[ns]
 5   Partialemptycol           3 non-null      float64       
 6   Stringcol                 3 non-null      object        
 7   CapitalOutstanding        3 non-null      float64       
 8   Mmcode                    3 non-null      int64         
 9   Cubiccapacity             3 non-null      int64         
 10  Kilowatts                 3 non-null      int64         
 11  NumberOfDoors             3 non-null      int64         
 12  Cylinders

(3, 17)


DataFrame Description:


Unnamed: 0,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
count,3.0,3.0,3.0,3.0
mean,23333.333333,233.333333,1166.666667,23.333333
std,15275.252317,152.752523,763.762616,15.275252
min,10000.0,100.0,500.0,10.0
25%,15000.0,150.0,750.0,15.0
50%,20000.0,200.0,1000.0,20.0
75%,30000.0,300.0,1500.0,30.0
max,40000.0,400.0,2000.0,40.0


ok
test_load_df_replaces_empty_with_nan (__main__.TestDataLoading.test_load_df_replaces_empty_with_nan) ... ok
test_load_df_saves_processed_file (__main__.TestDataLoading.test_load_df_saves_processed_file) ... 

File read successfully!

Coloumns are capitalised.

Empty indexes are filled with "NAN" values.

Coloumns ['EmptyCol'] are dropped.

Indexes with empty values are dropped.

DataFrame is sorted by TransactionMonth column.

DataFrame is preprocessed.

Prrocessed DataFrame Saved to: test_output\processed_insurance_data.csv

DataFrame Head:


Unnamed: 0,Col1,Col2,TransactionMonth,RegistrationYear,VehicleIntroDate,Partialemptycol,Stringcol,CapitalOutstanding,Mmcode,Cubiccapacity,Kilowatts,NumberOfDoors,Cylinders,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
0,1,A,2023-01-01,2020-01-01,2019-01-01,1.0,cat,1.2,101,1000,50,2,4,10000,100,500,10
1,2,B,2023-03-01,2021-01-01,2020-01-01,2.0,dog,3.4,102,1500,75,4,4,20000,200,1000,20
2,4,D,2023-04-01,2023-01-01,2022-01-01,4.0,dog,7.8,104,2500,125,2,4,40000,400,2000,40



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Col1                      3 non-null      int64         
 1   Col2                      3 non-null      object        
 2   TransactionMonth          3 non-null      datetime64[ns]
 3   RegistrationYear          3 non-null      datetime64[ns]
 4   VehicleIntroDate          3 non-null      datetime64[ns]
 5   Partialemptycol           3 non-null      float64       
 6   Stringcol                 3 non-null      object        
 7   CapitalOutstanding        3 non-null      float64       
 8   Mmcode                    3 non-null      int64         
 9   Cubiccapacity             3 non-null      int64         
 10  Kilowatts                 3 non-null      int64         
 11  NumberOfDoors             3 non-null      int64         
 12  Cylinders

(3, 17)


DataFrame Description:


Unnamed: 0,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
count,3.0,3.0,3.0,3.0
mean,23333.333333,233.333333,1166.666667,23.333333
std,15275.252317,152.752523,763.762616,15.275252
min,10000.0,100.0,500.0,10.0
25%,15000.0,150.0,750.0,15.0
50%,20000.0,200.0,1000.0,20.0
75%,30000.0,300.0,1500.0,30.0
max,40000.0,400.0,2000.0,40.0


ok
test_load_df_sorts_by_transactionmonth (__main__.TestDataLoading.test_load_df_sorts_by_transactionmonth) ... 

File read successfully!

Coloumns are capitalised.

Empty indexes are filled with "NAN" values.

Coloumns ['EmptyCol'] are dropped.

Indexes with empty values are dropped.

DataFrame is sorted by TransactionMonth column.

DataFrame is preprocessed.

Prrocessed DataFrame Saved to: test_output\processed_insurance_data.csv

DataFrame Head:


Unnamed: 0,Col1,Col2,TransactionMonth,RegistrationYear,VehicleIntroDate,Partialemptycol,Stringcol,CapitalOutstanding,Mmcode,Cubiccapacity,Kilowatts,NumberOfDoors,Cylinders,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
0,1,A,2023-01-01,2020-01-01,2019-01-01,1.0,cat,1.2,101,1000,50,2,4,10000,100,500,10
1,2,B,2023-03-01,2021-01-01,2020-01-01,2.0,dog,3.4,102,1500,75,4,4,20000,200,1000,20
2,4,D,2023-04-01,2023-01-01,2022-01-01,4.0,dog,7.8,104,2500,125,2,4,40000,400,2000,40



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Col1                      3 non-null      int64         
 1   Col2                      3 non-null      object        
 2   TransactionMonth          3 non-null      datetime64[ns]
 3   RegistrationYear          3 non-null      datetime64[ns]
 4   VehicleIntroDate          3 non-null      datetime64[ns]
 5   Partialemptycol           3 non-null      float64       
 6   Stringcol                 3 non-null      object        
 7   CapitalOutstanding        3 non-null      float64       
 8   Mmcode                    3 non-null      int64         
 9   Cubiccapacity             3 non-null      int64         
 10  Kilowatts                 3 non-null      int64         
 11  NumberOfDoors             3 non-null      int64         
 12  Cylinders

(3, 17)


DataFrame Description:


Unnamed: 0,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
count,3.0,3.0,3.0,3.0
mean,23333.333333,233.333333,1166.666667,23.333333
std,15275.252317,152.752523,763.762616,15.275252
min,10000.0,100.0,500.0,10.0
25%,15000.0,150.0,750.0,15.0
50%,20000.0,200.0,1000.0,20.0
75%,30000.0,300.0,1500.0,30.0
max,40000.0,400.0,2000.0,40.0


ok
test_load_df_successful (__main__.TestDataLoading.test_load_df_successful) ... 

File read successfully!

Coloumns are capitalised.

Empty indexes are filled with "NAN" values.

Coloumns ['EmptyCol'] are dropped.

Indexes with empty values are dropped.

DataFrame is sorted by TransactionMonth column.

DataFrame is preprocessed.

Prrocessed DataFrame Saved to: test_output\processed_insurance_data.csv

DataFrame Head:


Unnamed: 0,Col1,Col2,TransactionMonth,RegistrationYear,VehicleIntroDate,Partialemptycol,Stringcol,CapitalOutstanding,Mmcode,Cubiccapacity,Kilowatts,NumberOfDoors,Cylinders,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
0,1,A,2023-01-01,2020-01-01,2019-01-01,1.0,cat,1.2,101,1000,50,2,4,10000,100,500,10
1,2,B,2023-03-01,2021-01-01,2020-01-01,2.0,dog,3.4,102,1500,75,4,4,20000,200,1000,20
2,4,D,2023-04-01,2023-01-01,2022-01-01,4.0,dog,7.8,104,2500,125,2,4,40000,400,2000,40



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Col1                      3 non-null      int64         
 1   Col2                      3 non-null      object        
 2   TransactionMonth          3 non-null      datetime64[ns]
 3   RegistrationYear          3 non-null      datetime64[ns]
 4   VehicleIntroDate          3 non-null      datetime64[ns]
 5   Partialemptycol           3 non-null      float64       
 6   Stringcol                 3 non-null      object        
 7   CapitalOutstanding        3 non-null      float64       
 8   Mmcode                    3 non-null      int64         
 9   Cubiccapacity             3 non-null      int64         
 10  Kilowatts                 3 non-null      int64         
 11  NumberOfDoors             3 non-null      int64         
 12  Cylinders

(3, 17)


DataFrame Description:


Unnamed: 0,SumInsured,CalculatedPremiumPerTerm,TotalPremium,TotalClaims
count,3.0,3.0,3.0,3.0
mean,23333.333333,233.333333,1166.666667,23.333333
std,15275.252317,152.752523,763.762616,15.275252
min,10000.0,100.0,500.0,10.0
25%,15000.0,150.0,750.0,15.0
50%,20000.0,200.0,1000.0,20.0
75%,30000.0,300.0,1500.0,30.0
max,40000.0,400.0,2000.0,40.0


ok

----------------------------------------------------------------------
Ran 10 tests in 0.575s

OK
