In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
train_zakaria = pd.read_csv('train2.csv')
test_zakaria = pd.read_csv('test2.csv')

In [3]:
plain_train = pd.read_csv('train.csv')
plain_test = pd.read_csv('test.csv')

In [4]:
# Copy the ID column over from the plain test frame, then remove the
# 'Unnamed: 0' column from the test frame in place.
test_zakaria['ID'] = plain_test['ID']
test_zakaria.drop(columns='Unnamed: 0', inplace=True)

In [5]:
manipulated_zakaria = train_zakaria.copy()

In [6]:
manipulated_zakaria.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         7000 non-null   int64  
 1   Brand              6983 non-null   object 
 2   VehicleModel       6745 non-null   object 
 3   ManufacturingYear  6459 non-null   float64
 4   Type               6871 non-null   object 
 5   rating             6593 non-null   float64
 6   color              6766 non-null   object 
 7   Duty               6743 non-null   float64
 8   fuel               6835 non-null   object 
 9   CylinderCount      6817 non-null   float64
 10  type of gear       6839 non-null   object 
 11  Odometer           7000 non-null   int64  
 12  #airbags           7000 non-null   int64  
 13  price              7000 non-null   int64  
 14  Engine Volume      7000 non-null   float64
 15  Engine Type        7000 non-null   object 
dtypes: float64(5), int64(4),

In [7]:
manipulated_zakaria.isna().sum()

Unnamed: 0             0
Brand                 17
VehicleModel         255
ManufacturingYear    541
Type                 129
rating               407
color                234
Duty                 257
fuel                 165
CylinderCount        183
type of gear         161
Odometer               0
#airbags               0
price                  0
Engine Volume          0
Engine Type            0
dtype: int64

In [8]:
# Cast every column of the working training frame to its string form.
manipulated_zakaria = manipulated_zakaria.astype(str)

In [9]:
def is_nan(column):
    """Count the entries of ``column`` that are missing.

    Real nulls (``isna``) are always counted. For ``object``-dtype columns,
    entries equal to a known textual placeholder (empty/blank strings and
    spellings of nan / na / n/a / null) are counted as well.

    Parameters
    ----------
    column : pandas.Series
        Column to inspect.

    Returns
    -------
    Integer count of null entries plus, for object columns, placeholder entries.
    """
    placeholders = ['', ' ', 'nan', 'NAN', 'NaN', 'na', 'NA', 'n/a', 'N/A', 'null', 'NULL']
    missing = column.isna().sum()
    if column.dtype == 'object':
        missing = missing + column.isin(placeholders).sum()
    return missing

In [10]:
for col in manipulated_zakaria.columns:
    print(f'{col} : {is_nan(manipulated_zakaria[col])}')

Unnamed: 0 : 0
Brand : 17
VehicleModel : 255
ManufacturingYear : 541
Type : 129
rating : 407
color : 234
Duty : 257
fuel : 165
CylinderCount : 183
type of gear : 161
Odometer : 0
#airbags : 0
price : 0
Engine Volume : 0
Engine Type : 0


In [11]:
for col in test_zakaria.columns:
    print(f'{col} : {is_nan(test_zakaria[col])}')

Brand : 15
VehicleModel : 146
ManufacturingYear : 305
Type : 62
rating : 191
color : 153
Duty : 143
fuel : 98
CylinderCount : 114
type of gear : 95
Odometer : 0
#airbags : 0
Engine Volume : 0
Engine Type : 0
ID : 0


## CylinderCount column

In [12]:
manipulated_zakaria['CylinderCount'].value_counts(dropna=False)

4.0     4694
6.0     1049
14.0     529
8.0      277
nan      183
16.0     109
5.0       50
3.0       30
18.0      26
2.0       13
12.0      13
1.0       10
15.0       7
10.0       4
11.0       2
7.0        2
9.0        1
13.0       1
Name: CylinderCount, dtype: int64

In [13]:
def manipulate_cylindercount(data, plain):
    """Rebuild ``data['CylinderCount']`` from the text column in ``plain``.

    The column is copied from ``plain`` (aligned on index), the first run of
    digits in each entry is extracted, converted to a number and stored as a
    categorical column. ``data`` is modified in place and returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``CylinderCount`` column is (re)written.
    plain : pandas.DataFrame
        Frame providing the source ``CylinderCount`` column; it must support
        the ``.str`` accessor.
    """
    data['CylinderCount'] = plain['CylinderCount']
    first_digits = data['CylinderCount'].str.extract(r'(\d+)', expand=False)
    data['CylinderCount'] = pd.to_numeric(first_digits).astype('category')
    return data

In [14]:
manipulated_zakaria = manipulate_cylindercount(manipulated_zakaria, plain_train)

In [15]:
for col in manipulated_zakaria.columns:
    print(f'{col} : {is_nan(manipulated_zakaria[col])}')

Unnamed: 0 : 0
Brand : 17
VehicleModel : 255
ManufacturingYear : 541
Type : 129
rating : 407
color : 234
Duty : 257
fuel : 165
CylinderCount : 0
type of gear : 161
Odometer : 0
#airbags : 0
price : 0
Engine Volume : 0
Engine Type : 0


## Brand column

In [16]:
manipulated_zakaria['Brand'].value_counts(dropna=False)

toyota           1396
hyundai          1353
mercedes-benz     711
chevrolet         394
ford              393
bmw               367
honda             357
lexus             310
nissan            235
volkswagen        215
ssangyong         173
kia               169
opel              145
subaru            113
mitsubishi        106
audi               91
mazda              67
jeep               47
daewoo             37
fiat               33
dodge              30
suzuki             27
nan                17
renault            15
jaguar             14
vaz                14
porsche            12
mini               12
lincoln            10
land rover          9
skoda               8
peugeot             8
volvo               7
buick               7
infiniti            7
Mercedes-Benz       6
daihatsu            5
uaz                 5
BMW                 5
gaz                 4
Toyota              4
gmc                 4
Nissan              4
chrysler            3
scion               3
Jeep      

In [17]:
def manipulate_brand(data):
    """Normalise the ``Brand`` column of ``data`` in place and return ``data``.

    Rows whose brand is exactly 'სხვა' are relabelled 'gac'; afterwards every
    brand is lower-cased.
    """
    needs_relabel = data['Brand'].eq('სხვა')
    data.loc[needs_relabel, 'Brand'] = 'gac'
    lowered = data['Brand'].str.lower()
    data['Brand'] = lowered
    return data

In [18]:
manipulated_zakaria = manipulate_brand(manipulated_zakaria)

In [19]:
manipulated_zakaria['Brand'].nunique()

57

In [20]:
manipulated_zakaria['Brand'].value_counts(dropna=False)

toyota           1400
hyundai          1354
mercedes-benz     717
ford              396
chevrolet         395
bmw               372
honda             359
lexus             312
nissan            239
volkswagen        216
ssangyong         173
kia               169
opel              145
subaru            114
mitsubishi        107
audi               93
mazda              68
jeep               50
daewoo             37
fiat               33
dodge              30
suzuki             27
nan                17
renault            15
vaz                14
jaguar             14
mini               12
porsche            12
lincoln            10
land rover          9
volvo               9
infiniti            9
peugeot             8
skoda               8
buick               8
daihatsu            5
uaz                 5
gmc                 4
gaz                 4
chrysler            3
cadillac            3
citroen             3
scion               3
alfa romeo          2
isuzu               2
acura     

In [21]:
# Fold the combined 'opel/suzuki' label into plain 'opel'.
opel_suzuki_rows = manipulated_zakaria['Brand'].eq('opel/suzuki')
manipulated_zakaria.loc[opel_suzuki_rows, 'Brand'] = 'opel'

In [22]:
is_nan(manipulated_zakaria['Brand'])

17

## Vehicle model column

In [23]:
manipulated_zakaria['VehicleModel'].value_counts(dropna=False)

prius                       406
sonata                      396
camry                       337
elantra                     330
nan                         255
fit                         181
e 350                       176
tucson                      166
h1                          160
santa fe                    136
aqua                        132
x5                          122
cruze                       119
fusion                      110
optima                       92
highlander                   91
jetta                        86
transit                      83
actyon                       77
ml 350                       75
rx 450                       68
rexton                       67
rav 4                        63
gx 460                       60
juke                         59
astra                        58
forester                     51
escape                       51
volt                         50
lacetti                      49
captiva                      47
civic   

In [24]:
def manipulate_vehiclemodel(data):
    """Normalise the free-text ``VehicleModel`` column in place.

    Steps, in order: lower-case, remove known filler phrases, remove dots,
    remove non-ASCII characters, trim surrounding whitespace, cast to ``str``
    and keep only the first space-separated token.

    Differences from the earlier implementation:

    * the literal replacements pass ``regex=False`` explicitly, so the result
      does not depend on the pandas default for ``str.replace`` (a bare ``'.'``
      interpreted as a regular expression matches every character);
    * the first-token reduction is applied to every row at once; the former
      per-brand loop skipped rows whose ``Brand`` was a real NaN, because
      ``data['Brand'] == nan`` never matches.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``VehicleModel`` column supporting the ``.str`` accessor.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``VehicleModel`` rewritten.
    """
    filler_phrases = [
        "i have the fastest model ever.. yes you got it right it's ",
        "my model which is ",
        " is the best",
        "i own ",
    ]

    model = data["VehicleModel"].str.lower()

    # Literal (non-regex) removal of the filler phrases and of dots.
    for phrase in filler_phrases:
        model = model.str.replace(phrase, "", regex=False)
    model = model.str.replace('.', "", regex=False)

    # Drop non-ASCII characters, then trim the whitespace left behind.
    model = model.str.replace(r'[^\x00-\x7F]', '', regex=True).str.strip()

    # Keep only the first token; single-token names are left unchanged.
    # Missing entries are represented however ``astype(str)`` renders them.
    data["VehicleModel"] = model.astype(str).str.split(" ").str[0]
    return data

In [25]:
manipulated_zakaria['VehicleModel'].nunique()

771

In [26]:
is_nan(manipulated_zakaria['VehicleModel'])

255

In [27]:
manipulated_zakaria = manipulate_vehiclemodel(manipulated_zakaria)

In [28]:
manipulated_zakaria['VehicleModel'].nunique()

430

In [29]:
is_nan(manipulated_zakaria['VehicleModel'])

255

In [30]:
def fill_model(data):
    """Impute missing ``VehicleModel`` values in place and return ``data``.

    The string 'nan' is first treated as missing. Within each
    (Brand, CylinderCount, Engine Type) group, missing models are replaced by
    the group's first mode; groups without any known model, and rows left
    unfilled by the grouped step, receive 'unknown'.
    """
    def _fill_with_group_mode(group):
        modes = group.mode()
        filler = 'unknown' if modes.empty else modes.iloc[0]
        return group.fillna(filler)

    data['VehicleModel'] = data['VehicleModel'].replace('nan', pd.NA)
    grouped_models = data.groupby(['Brand', 'CylinderCount', 'Engine Type'])['VehicleModel']
    data['VehicleModel'] = grouped_models.transform(_fill_with_group_mode).fillna('unknown')
    return data

In [31]:
manipulated_zakaria = fill_model(manipulated_zakaria)

In [32]:
manipulated_zakaria['VehicleModel'].nunique()

430

In [33]:
is_nan(manipulated_zakaria['VehicleModel'])

0

In [34]:
len(manipulated_zakaria.loc[manipulated_zakaria['VehicleModel'] == 'unknown', 'VehicleModel'])

27

In [35]:
def fill_missing_brand(data):
    """Impute missing ``Brand`` values in place and return ``data``.

    The string 'nan' is first treated as missing. Within each ``VehicleModel``
    group, missing brands take the group's first mode; groups with no known
    brand, and rows left unfilled by the grouped step, receive 'unknown'.
    """
    def _fill_with_group_mode(group):
        modes = group.mode()
        filler = 'unknown' if modes.empty else modes.iloc[0]
        return group.fillna(filler)

    data['Brand'] = data['Brand'].replace('nan', pd.NA)
    brands_by_model = data.groupby('VehicleModel')['Brand']
    data['Brand'] = brands_by_model.transform(_fill_with_group_mode).fillna('unknown')
    return data

In [36]:
manipulated_zakaria = fill_missing_brand(manipulated_zakaria)

In [37]:
is_nan(manipulated_zakaria['Brand'])

0

## ManufacturingYear column

In [38]:
manipulated_zakaria['ManufacturingYear'].nunique()

72

In [39]:
is_nan(manipulated_zakaria['ManufacturingYear'])

541

In [40]:
def check_years(year):
    """Repair the leading digits of a year given as a string.

    Rules, applied to the first characters of ``year``:

    * starts with '3'            -> the leading '3' becomes '2';
    * starts with '2', 2nd >= '8' -> first two characters become '19';
    * starts with '2', 2nd not '0' -> first two characters become '20';
    * starts with '20'           -> kept as '20..' when the 3rd character is
      <= '2', otherwise rewritten to '19..';
    * anything else is returned unchanged.

    Strings too short to carry the character a rule needs ('' / '2' / '20')
    are returned unchanged instead of raising ``IndexError``.

    Parameters
    ----------
    year : str
        Year text, e.g. '2012.0'.

    Returns
    -------
    str
        The repaired year text.
    """
    # str(pd.NA) is '<NA>'; empty strings are passed through as well.
    if year == str(pd.NA) or not year:
        return year

    first = year[0]
    if first == '3':
        return '2' + year[1:]
    if first != '2' or len(year) < 2:
        return year

    second = year[1]
    if second >= '8':
        return '19' + year[2:]
    if second != '0':
        return '20' + year[2:]

    # From here on the text starts with '20'; the third character decides.
    if len(year) < 3:
        return year
    prefix = '20' if year[2] <= '2' else '19'
    return prefix + year[2:]

In [41]:

def manipulate_years(data):
    """Repair and numerically convert ``ManufacturingYear`` in place.

    Each entry is stringified and passed through ``check_years``; the result
    is converted with ``pd.to_numeric(errors='coerce')``, so entries that are
    not numeric text become NaN. Returns the same ``data`` object.
    """
    def _repair(value):
        return check_years(str(value))

    repaired = data['ManufacturingYear'].apply(_repair)
    data['ManufacturingYear'] = pd.to_numeric(repaired, errors='coerce')
    return data


In [42]:
manipulated_zakaria = manipulate_years(manipulated_zakaria)

In [43]:
manipulated_zakaria['ManufacturingYear'].nunique()


67

In [44]:
is_nan(manipulated_zakaria['ManufacturingYear'])

541

In [45]:
manipulated_zakaria['ManufacturingYear'].value_counts(dropna=False).sort_index()

1893.0      1
1895.0      1
1896.0      3
1898.0      4
1900.0      5
1901.0      2
1902.0      8
1903.0      2
1904.0      6
1905.0      5
1906.0      2
1907.0      7
1908.0     12
1909.0      6
1910.0     21
1911.0     29
1912.0     22
1913.0     23
1914.0     21
1915.0     11
1916.0     15
1917.0      9
1918.0      5
1919.0      4
1920.0      1
1939.0      1
1953.0      1
1978.0      1
1982.0      1
1983.0      3
1984.0      2
1985.0      2
1986.0      1
1987.0      2
1988.0      3
1989.0      1
1990.0      8
1991.0      4
1992.0     16
1993.0      8
1994.0     11
1995.0     41
1996.0     31
1997.0     47
1998.0     69
1999.0     67
2000.0     93
2001.0     78
2002.0     99
2003.0    137
2004.0    124
2005.0    128
2006.0    119
2007.0    160
2008.0    249
2009.0    194
2010.0    518
2011.0    559
2012.0    715
2013.0    643
2014.0    702
2015.0    487
2016.0    418
2017.0    282
2018.0    128
2019.0     72
2020.0      9
NaN       541
Name: ManufacturingYear, dtype: int64

In [46]:
def fill_missing_year(data):
    """Impute missing ``ManufacturingYear`` values and convert to yearly periods.

    Missing years take the first mode of their (Brand, VehicleModel) group;
    when a group has no usable mode, and for rows left unfilled by the grouped
    step, the first mode of the whole column is used. The filled column is
    cast to ``int``, parsed with format '%Y' and stored as a yearly period
    column. ``data`` is modified in place and returned.
    """
    column_modes = data['ManufacturingYear'].mode()
    fallback_year = None if column_modes.empty else column_modes.iloc[0]

    def _fill_group(group):
        candidates = group.mode()
        # No usable group-level mode: fall back to the column-wide mode.
        if candidates.empty or pd.isna(candidates.iloc[0]):
            return group.fillna(fallback_year)
        return group.fillna(candidates.iloc[0])

    years_by_model = data.groupby(['Brand', 'VehicleModel'])['ManufacturingYear']
    filled_years = years_by_model.transform(_fill_group).fillna(fallback_year).astype(int)
    data['ManufacturingYear'] = pd.to_datetime(filled_years, format='%Y').dt.to_period('Y')
    return data

In [47]:
manipulated_zakaria = fill_missing_year(manipulated_zakaria)

In [48]:
is_nan(manipulated_zakaria['ManufacturingYear'])

0

## Type column

In [49]:
manipulated_zakaria['Type'].value_counts(dropna=False)

sedan          3131
jeep           1880
hatchback      1102
minivan         262
coupe           163
nan             129
microbus        110
universal       109
goods wagon      88
pickup           15
cabriolet         9
limousine         2
Name: Type, dtype: int64

In [50]:

def fill_missing_types(data):
    """Impute missing ``Type`` values in place and return ``data``.

    The string 'nan' is first treated as missing. Within each
    (Brand, VehicleModel, Engine Type) group, missing types take the group's
    first mode; groups with no known type, and rows left unfilled by the
    grouped step, receive 'unknown'.
    """
    def _fill_with_group_mode(group):
        modes = group.mode()
        filler = 'unknown' if modes.empty else modes.iloc[0]
        return group.fillna(filler)

    data['Type'] = data['Type'].replace('nan', pd.NA)
    grouped_types = data.groupby(['Brand', 'VehicleModel', 'Engine Type'])['Type']
    data['Type'] = grouped_types.transform(_fill_with_group_mode).fillna('unknown')
    return data

In [51]:
manipulated_zakaria = fill_missing_types(manipulated_zakaria)

In [52]:
is_nan(manipulated_zakaria['Type'])

0

## Rating column

In [53]:
manipulated_zakaria['rating'].value_counts(dropna=False)

2.0     1643
3.0     1493
1.0     1313
4.0     1069
5.0      651
nan      407
6.0      281
7.0      100
8.0       35
9.0        7
10.0       1
Name: rating, dtype: int64

In [54]:
def fill_missing_ratings(data):
    """Impute missing ``rating`` values and store the column as a categorical.

    The column is converted to floats first (text such as 'nan' becomes NaN),
    so the fill value 0 and the observed ratings share one numeric type.
    Previously a string-typed column was filled with the integer 0, which
    produced a categorical mixing strings like '2.0' with the int 0.

    Missing ratings take the first mode of their (Brand, VehicleModel, Type)
    group; groups with no known rating, and rows left unfilled by the grouped
    step, receive 0. ``data`` is modified in place and returned.
    """
    def _fill_with_group_mode(group):
        modes = group.mode()
        return group.fillna(modes.iloc[0] if not modes.empty else 0)

    # Numeric conversion up front keeps the categories homogeneous (floats).
    data['rating'] = pd.to_numeric(data['rating'], errors='coerce').astype(float)
    ratings_by_group = data.groupby(['Brand', 'VehicleModel', 'Type'])['rating']
    filled = ratings_by_group.transform(_fill_with_group_mode).fillna(0)
    data['rating'] = filled.astype(float).astype('category')
    return data

In [55]:
manipulated_zakaria = fill_missing_ratings(manipulated_zakaria)

In [56]:
is_nan(manipulated_zakaria['rating'])

0

In [57]:
manipulated_zakaria['rating'].value_counts(dropna=False)


2.0     1770
3.0     1584
1.0     1379
4.0     1092
5.0      659
6.0      282
7.0      100
0         91
8.0       35
9.0        7
10.0       1
Name: rating, dtype: int64

## Color column

In [58]:
manipulated_zakaria['color'].value_counts(dropna=False)

Jet Black        1887
Metallic         1579
Pearl White      1564
Charcoal          745
Sapphire Blue     396
nan               234
Crimson           185
Emerald Green      79
Tangerine          66
Gold               57
Chocolate          49
Ruby Red           45
Sand               37
Sky Blue           35
Lemon Yellow       30
Royal Purple        7
Rose Pink           5
Name: color, dtype: int64

In [59]:
def fill_manipulate_missing_color(data):
    """Lower-case and impute the ``color`` column, storing it as a categorical.

    The string 'nan' is treated as missing. Missing colors take the first mode
    of their (Brand, VehicleModel, Type) group; groups with no known color,
    and rows left unfilled by the grouped step, receive 'other'.

    The 'nan' replacement is assigned back to the column instead of calling
    ``data['color'].replace(..., inplace=True)``: that chained in-place call
    acts on a temporary object and leaves the frame untouched when pandas
    Copy-on-Write is active.

    ``data`` is modified in place and returned.
    """
    def _fill_with_group_mode(group):
        modes = group.mode()
        return group.fillna(modes.iloc[0] if not modes.empty else 'other')

    # Normalise case, then turn the textual placeholder into a real missing value.
    data['color'] = data['color'].str.lower().replace('nan', pd.NA)

    colors_by_group = data.groupby(['Brand', 'VehicleModel', 'Type'])['color']
    filled = colors_by_group.transform(_fill_with_group_mode).fillna('other')
    data['color'] = filled.astype('category')
    return data

In [60]:
manipulated_zakaria = fill_manipulate_missing_color(manipulated_zakaria)

In [61]:
manipulated_zakaria['color'].value_counts(dropna=False)

jet black        1967
pearl white      1610
metallic         1606
charcoal          752
sapphire blue     398
crimson           188
emerald green      80
other              66
tangerine          66
gold               57
chocolate          50
ruby red           45
sand               37
sky blue           36
lemon yellow       30
royal purple        7
rose pink           5
Name: color, dtype: int64

## Duty column

In [62]:
manipulated_zakaria['Duty'].value_counts(dropna=False)

765.0      550
585.0      294
nan        257
503.0      216
777.0      210
531.0      203
640.0      202
751.0      156
779.0      155
919.0      142
574.0      139
1172.0     133
891.0      121
707.0      120
1399.0     119
639.0      106
586.0       92
781.0       90
642.0       89
694.0       76
1017.0      74
836.0       66
583.0       61
831.0       59
475.0       59
761.0       58
730.0       56
502.0       55
1091.0      55
917.0       54
753.0       53
584.0       52
972.0       50
645.0       49
528.0       46
394.0       45
1750.0      44
1058.0      43
934.0       42
749.0       40
862.0       38
770.0       36
1055.0      36
1018.0      35
629.0       34
1811.0      33
1053.0      33
1249.0      33
530.0       32
697.0       31
382.0       30
308.0       29
1079.0      29
2455.0      28
1273.0      27
915.0       27
687.0       24
1246.0      24
1024.0      24
790.0       23
843.0       21
1111.0      21
418.0       21
922.0       21
1077.0      19
690.0       19
2410.0    

In [63]:
is_nan(manipulated_zakaria['Duty'])

257

In [64]:
def fill_missing_duty(data):
    """Convert ``Duty`` to floats and impute missing values hierarchically.

    Missing duties are filled, in order, with:

    1. the mean of their (Brand, VehicleModel, Type) group,
    2. the mean of the observed duties of their Brand,
    3. the mean of the column after steps 1-2.

    Fixes relative to the earlier implementation:

    * the brand-level fallback used to return a Brand-indexed Series from
      inside ``transform``; it could not line up with the group's rows, so
      step 2 never took effect;
    * the final fill is assigned back rather than done through the chained
      ``data['Duty'].fillna(..., inplace=True)``, which does not modify the
      frame when pandas Copy-on-Write is active.

    Text that is not numeric (including 'nan') is coerced to NaN, the same
    way ``manipulate_years`` converts its column. ``data`` is modified in
    place and returned.
    """
    data['Duty'] = pd.to_numeric(data['Duty'], errors='coerce')
    observed = data['Duty']

    # Row-aligned means, both computed from observed (non-imputed) values.
    group_mean = data.groupby(['Brand', 'VehicleModel', 'Type'])['Duty'].transform('mean')
    brand_mean = data.groupby('Brand')['Duty'].transform('mean')

    filled = observed.fillna(group_mean).fillna(brand_mean)
    data['Duty'] = filled.fillna(filled.mean()).astype(float)
    return data

In [65]:
manipulated_zakaria = fill_missing_duty(manipulated_zakaria)

In [66]:
is_nan(manipulated_zakaria['Duty'])

0

## Fuel column

In [67]:
manipulated_zakaria['fuel'].value_counts(dropna=False)

gas            3532
diesel         1410
hybrid         1397
others          333
nan             165
natural gas     163
Name: fuel, dtype: int64

In [68]:
def fill_manipulate_fuel(data):
    """Lower-case and impute the ``fuel`` column, storing it as a categorical.

    The string 'nan' is treated as missing. Missing fuels take the first mode
    of their (Brand, VehicleModel, Type) group; groups with no known fuel, and
    rows left unfilled by the grouped step, receive 'other'.

    The 'nan' replacement is assigned back to the column instead of calling
    ``data['fuel'].replace(..., inplace=True)``: that chained in-place call
    acts on a temporary object and leaves the frame untouched when pandas
    Copy-on-Write is active.

    ``data`` is modified in place and returned.
    """
    def _fill_with_group_mode(group):
        modes = group.mode()
        return group.fillna(modes.iloc[0] if not modes.empty else 'other')

    # Normalise case, then turn the textual placeholder into a real missing value.
    data['fuel'] = data['fuel'].str.lower().replace('nan', pd.NA)

    fuels_by_group = data.groupby(['Brand', 'VehicleModel', 'Type'])['fuel']
    # NOTE(review): the fill label is 'other' while the data shown in the
    # notebook also contains 'others' - confirm the two are meant to differ.
    filled = fuels_by_group.transform(_fill_with_group_mode).fillna('other')
    data['fuel'] = filled.astype('category')
    return data

In [69]:
manipulated_zakaria = fill_manipulate_fuel(manipulated_zakaria)

In [70]:
is_nan(manipulated_zakaria['fuel'])

0

## Type of gear column

In [71]:
manipulated_zakaria['type of gear'].value_counts(dropna = False)

auto      3889
others    1251
gear       656
r          643
manual     400
nan        161
Name: type of gear, dtype: int64

In [72]:
def fill_manipulate_type_of_gear(data):
    """Lower-case and impute ``type of gear``, storing it as a categorical.

    The string 'nan' is treated as missing. Missing values take the first mode
    of their (Brand, VehicleModel, Engine Type, Type) group; groups with no
    known value, and rows left unfilled by the grouped step, receive 'others'.

    The 'nan' replacement is assigned back to the column instead of calling
    ``data['type of gear'].replace(..., inplace=True)``: that chained in-place
    call acts on a temporary object and leaves the frame untouched when pandas
    Copy-on-Write is active.

    ``data`` is modified in place and returned.
    """
    def _fill_with_group_mode(group):
        modes = group.mode()
        return group.fillna(modes.iloc[0] if not modes.empty else 'others')

    # Normalise case, then turn the textual placeholder into a real missing value.
    data['type of gear'] = data['type of gear'].str.lower().replace('nan', pd.NA)

    gears_by_group = data.groupby(['Brand', 'VehicleModel', 'Engine Type', 'Type'])['type of gear']
    filled = gears_by_group.transform(_fill_with_group_mode).fillna('others')
    data['type of gear'] = filled.astype('category')
    return data
    

In [73]:
manipulated_zakaria = fill_manipulate_type_of_gear(manipulated_zakaria)

In [74]:
is_nan(manipulated_zakaria['type of gear'])


0

## Odometer column

In [75]:
manipulated_zakaria['Odometer'].value_counts(dropna=False).sort_index()

0             257
1000           34
10000          15
100000         42
100006          1
10003           1
100037          1
100184          1
100230          1
100247          1
100280          1
100300          1
100358          1
100403          1
100658          1
100747          6
100800          1
10084           1
100900          1
100940          1
100959          1
100963          1
10100           1
101000          6
101093          1
101280          1
101335          1
101345          1
101357          1
101387          1
101424          4
101537          1
101552          1
101682          1
101708          1
101717          1
101812          1
101816          5
101920          1
101928          4
102000         16
102082          1
102138          1
102156          1
102268          1
102379          1
102397          8
102400          2
102500          1
102506          1
102675          1
10270           1
102780          1
102788          1
102885          1
102907    

In [76]:
is_nan(manipulated_zakaria['Odometer'])

0

## Airbags column

In [77]:
def manipulate_airbag(data):
    """Rename '#airbags' to 'airbags' and store it as a categorical column.

    ``data`` is modified in place and returned.
    """
    column_renames = {'#airbags': 'airbags'}
    data.rename(columns=column_renames, inplace=True)
    airbags_as_category = data['airbags'].astype('category')
    data['airbags'] = airbags_as_category
    return data

In [78]:
manipulated_zakaria = manipulate_airbag(manipulated_zakaria)

In [79]:
manipulated_zakaria['airbags'].value_counts(dropna = False)

12    2586
4     1870
0      774
8      545
6      455
2      357
10     270
5       34
1       33
7       30
9       19
11      16
3       11
Name: airbags, dtype: int64

In [80]:
is_nan(manipulated_zakaria['airbags'])

0

In [81]:
manipulated_zakaria.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype        
---  ------             --------------  -----        
 0   Unnamed: 0         7000 non-null   object       
 1   Brand              7000 non-null   object       
 2   VehicleModel       7000 non-null   object       
 3   ManufacturingYear  7000 non-null   period[A-DEC]
 4   Type               7000 non-null   object       
 5   rating             7000 non-null   category     
 6   color              7000 non-null   category     
 7   Duty               7000 non-null   float64      
 8   fuel               7000 non-null   category     
 9   CylinderCount      7000 non-null   category     
 10  type of gear       7000 non-null   category     
 11  Odometer           7000 non-null   object       
 12  airbags            7000 non-null   category     
 13  price              7000 non-null   object       
 14  Engine Volume      7000 

In [82]:
# These columns were never cleaned as text; convert them back to numbers
# (entries that cannot be parsed become NaN).
for numeric_col in ('Odometer', 'price', 'Engine Volume'):
    as_text = manipulated_zakaria[numeric_col]
    manipulated_zakaria[numeric_col] = pd.to_numeric(as_text, errors='coerce')

In [83]:
manipulated_zakaria.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype        
---  ------             --------------  -----        
 0   Unnamed: 0         7000 non-null   object       
 1   Brand              7000 non-null   object       
 2   VehicleModel       7000 non-null   object       
 3   ManufacturingYear  7000 non-null   period[A-DEC]
 4   Type               7000 non-null   object       
 5   rating             7000 non-null   category     
 6   color              7000 non-null   category     
 7   Duty               7000 non-null   float64      
 8   fuel               7000 non-null   category     
 9   CylinderCount      7000 non-null   category     
 10  type of gear       7000 non-null   category     
 11  Odometer           7000 non-null   int64        
 12  airbags            7000 non-null   category     
 13  price              7000 non-null   int64        
 14  Engine Volume      7000 

## Test data

In [84]:
test_zakaria_manipulated = test_zakaria.copy()

In [85]:
test_zakaria_manipulated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Brand              2985 non-null   object 
 1   VehicleModel       2854 non-null   object 
 2   ManufacturingYear  2695 non-null   float64
 3   Type               2938 non-null   object 
 4   rating             2809 non-null   float64
 5   color              2847 non-null   object 
 6   Duty               2857 non-null   float64
 7   fuel               2902 non-null   object 
 8   CylinderCount      2886 non-null   float64
 9   type of gear       2905 non-null   object 
 10  Odometer           3000 non-null   int64  
 11  #airbags           3000 non-null   int64  
 12  Engine Volume      3000 non-null   float64
 13  Engine Type        3000 non-null   object 
 14  ID                 3000 non-null   int64  
dtypes: float64(5), int64(3), object(7)
memory usage: 351.7+ KB


In [86]:
for col in test_zakaria_manipulated.columns:
    print(f'{col} : {is_nan(test_zakaria_manipulated[col ])}')

Brand : 15
VehicleModel : 146
ManufacturingYear : 305
Type : 62
rating : 191
color : 153
Duty : 143
fuel : 98
CylinderCount : 114
type of gear : 95
Odometer : 0
#airbags : 0
Engine Volume : 0
Engine Type : 0
ID : 0


In [87]:
# Cast every column of the working test frame to its string form.
test_zakaria_manipulated = test_zakaria_manipulated.astype(str)

In [88]:
# Run the cleaning steps on the prepared working copy. Previously every step
# was handed the raw `test_zakaria` frame, which discarded the string-cast
# copy built in the cells above, mutated the raw frame in place, and sent the
# test rows down a different dtype path than the training rows.
test_zakaria_manipulated = manipulate_cylindercount(test_zakaria_manipulated, plain_test)

# Same order as the steps applied to the training frame.
cleaning_steps = [
    manipulate_brand,
    manipulate_vehiclemodel,
    fill_model,
    fill_missing_brand,
    manipulate_years,
    fill_missing_year,
    fill_missing_types,
    fill_missing_ratings,
    fill_manipulate_missing_color,
    fill_missing_duty,
    fill_manipulate_fuel,
    fill_manipulate_type_of_gear,
    manipulate_airbag,
]
for step in cleaning_steps:
    test_zakaria_manipulated = step(test_zakaria_manipulated)

# The working copy was cast to strings earlier; convert the columns that no
# cleaning step touches back to numbers, as is done for the training frame.
for col in ['Odometer', 'Engine Volume', 'ID']:
    test_zakaria_manipulated[col] = pd.to_numeric(test_zakaria_manipulated[col], errors='coerce')


In [89]:
for col in test_zakaria_manipulated.columns:
    print(f'{col} : {is_nan(test_zakaria_manipulated[col ])}')

Brand : 0
VehicleModel : 0
ManufacturingYear : 0
Type : 0
rating : 0
color : 0
Duty : 0
fuel : 0
CylinderCount : 0
type of gear : 0
Odometer : 0
airbags : 0
Engine Volume : 0
Engine Type : 0
ID : 0


In [90]:
test_zakaria_manipulated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype        
---  ------             --------------  -----        
 0   Brand              3000 non-null   object       
 1   VehicleModel       3000 non-null   object       
 2   ManufacturingYear  3000 non-null   period[A-DEC]
 3   Type               3000 non-null   object       
 4   rating             3000 non-null   category     
 5   color              3000 non-null   category     
 6   Duty               3000 non-null   float64      
 7   fuel               3000 non-null   category     
 8   CylinderCount      3000 non-null   category     
 9   type of gear       3000 non-null   category     
 10  Odometer           3000 non-null   int64        
 11  airbags            3000 non-null   category     
 12  Engine Volume      3000 non-null   float64      
 13  Engine Type        3000 non-null   object       
 14  ID                 3000 

In [91]:
manipulated_zakaria.to_csv('train_ready_zakaria.csv', index=False)

In [92]:
test_zakaria_manipulated.to_csv('test_ready_zakaria.csv', index=False)