# Primary Data Cleaning
## Data source: https://www.kaggle.com/kkhandekar/cheapest-electric-cars
### Main problems: <li> 'PriceinGermany', 'PriceinUK' aren't in numeric format. Converted them into integers. Other examples: 'FastChargeSpeed', 'Range'. </li><li> 'Battery' is inside 'Subtitle. Had to extract it with massive formatting. </li><li> 'PriceinUK' had a few null entries.


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('clean_comp.csv')

In [3]:
df.head()

Unnamed: 0,Name,Subtitle,Efficiency,FastChargeSpeed,Power,NumberofSeats,PriceinGermany,PriceinUK,Range (ml),Acceleration Per/Sec,Top Speed (KM/H)
0,Opel Ampera-e,Battery Electric Vehicle | 58 kWh,173 Wh/km,210 km/h,Front Wheel Drive,5,"€42,990",,335,7.3,150
1,Renault Kangoo Maxi ZE 33,Battery Electric Vehicle | 31 kWh,194 Wh/km,-,Front Wheel Drive,5,,"£31,680",160,22.4,130
2,Nissan Leaf,Battery Electric Vehicle | 36 kWh,164 Wh/km,230 km/h,Front Wheel Drive,5,"€29,990","£25,995",220,7.9,144
3,Audi e-tron Sportback 55 quattro,Battery Electric Vehicle | 86.5 kWh,231 Wh/km,600 km/h,All Wheel Drive,5,,"£79,900",375,5.7,200
4,Porsche Taycan Turbo S,Battery Electric Vehicle | 83.7 kWh,215 Wh/km,860 km/h,All Wheel Drive,4,"€186,336","£138,830",390,2.8,260


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Name                  180 non-null    object 
 1   Subtitle              180 non-null    object 
 2   Efficiency            180 non-null    object 
 3   FastChargeSpeed       180 non-null    object 
 4   Power                 180 non-null    object 
 5   NumberofSeats         180 non-null    int64  
 6   PriceinGermany        168 non-null    object 
 7   PriceinUK             136 non-null    object 
 8   Range (ml)            180 non-null    int64  
 9   Acceleration Per/Sec  180 non-null    float64
 10  Top Speed (KM/H)      180 non-null    int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 15.6+ KB


In [5]:
price = df.PriceinGermany
price

0       €42,990
1           NaN
2       €29,990
3           NaN
4      €186,336
         ...   
175     €40,000
176     €53,560
177         NaN
178     €50,900
179     €49,500
Name: PriceinGermany, Length: 180, dtype: object

In [6]:
df['PriceinGermany'] = df['PriceinGermany'].str.replace(r'€', '')
df['PriceinGermany'] = df['PriceinGermany'].str.replace(r',', '')
df

Unnamed: 0,Name,Subtitle,Efficiency,FastChargeSpeed,Power,NumberofSeats,PriceinGermany,PriceinUK,Range (ml),Acceleration Per/Sec,Top Speed (KM/H)
0,Opel Ampera-e,Battery Electric Vehicle | 58 kWh,173 Wh/km,210 km/h,Front Wheel Drive,5,42990,,335,7.3,150
1,Renault Kangoo Maxi ZE 33,Battery Electric Vehicle | 31 kWh,194 Wh/km,-,Front Wheel Drive,5,,"£31,680",160,22.4,130
2,Nissan Leaf,Battery Electric Vehicle | 36 kWh,164 Wh/km,230 km/h,Front Wheel Drive,5,29990,"£25,995",220,7.9,144
3,Audi e-tron Sportback 55 quattro,Battery Electric Vehicle | 86.5 kWh,231 Wh/km,600 km/h,All Wheel Drive,5,,"£79,900",375,5.7,200
4,Porsche Taycan Turbo S,Battery Electric Vehicle | 83.7 kWh,215 Wh/km,860 km/h,All Wheel Drive,4,186336,"£138,830",390,2.8,260
...,...,...,...,...,...,...,...,...,...,...,...
175,MG Marvel R,Battery Electric Vehicle | 65 kWh,191 Wh/km,390 km/h,Rear Wheel Drive,5,40000,,340,7.9,200
176,Tesla Model 3 Long Range Dual Motor,Battery Electric Vehicle | 76 kWh,155 Wh/km,820 km/h,All Wheel Drive,5,53560,,490,4.4,233
177,MG MG5 EV Long Range,Battery Electric Vehicle | 57 kWh,168 Wh/km,340 km/h,Front Wheel Drive,5,,"£26,495",340,7.7,185
178,Audi Q4 e-tron 45 quattro,Battery Electric Vehicle | 76.6 kWh,199 Wh/km,470 km/h,All Wheel Drive,5,50900,,385,6.9,180


In [7]:
df.shape

(180, 11)

In [8]:
df.PriceinGermany = df.PriceinGermany.fillna(0)
df

Unnamed: 0,Name,Subtitle,Efficiency,FastChargeSpeed,Power,NumberofSeats,PriceinGermany,PriceinUK,Range (ml),Acceleration Per/Sec,Top Speed (KM/H)
0,Opel Ampera-e,Battery Electric Vehicle | 58 kWh,173 Wh/km,210 km/h,Front Wheel Drive,5,42990,,335,7.3,150
1,Renault Kangoo Maxi ZE 33,Battery Electric Vehicle | 31 kWh,194 Wh/km,-,Front Wheel Drive,5,0,"£31,680",160,22.4,130
2,Nissan Leaf,Battery Electric Vehicle | 36 kWh,164 Wh/km,230 km/h,Front Wheel Drive,5,29990,"£25,995",220,7.9,144
3,Audi e-tron Sportback 55 quattro,Battery Electric Vehicle | 86.5 kWh,231 Wh/km,600 km/h,All Wheel Drive,5,0,"£79,900",375,5.7,200
4,Porsche Taycan Turbo S,Battery Electric Vehicle | 83.7 kWh,215 Wh/km,860 km/h,All Wheel Drive,4,186336,"£138,830",390,2.8,260
...,...,...,...,...,...,...,...,...,...,...,...
175,MG Marvel R,Battery Electric Vehicle | 65 kWh,191 Wh/km,390 km/h,Rear Wheel Drive,5,40000,,340,7.9,200
176,Tesla Model 3 Long Range Dual Motor,Battery Electric Vehicle | 76 kWh,155 Wh/km,820 km/h,All Wheel Drive,5,53560,,490,4.4,233
177,MG MG5 EV Long Range,Battery Electric Vehicle | 57 kWh,168 Wh/km,340 km/h,Front Wheel Drive,5,0,"£26,495",340,7.7,185
178,Audi Q4 e-tron 45 quattro,Battery Electric Vehicle | 76.6 kWh,199 Wh/km,470 km/h,All Wheel Drive,5,50900,,385,6.9,180


In [9]:
df['PriceinGermany'] = df['PriceinGermany'].astype(int)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Name                  180 non-null    object 
 1   Subtitle              180 non-null    object 
 2   Efficiency            180 non-null    object 
 3   FastChargeSpeed       180 non-null    object 
 4   Power                 180 non-null    object 
 5   NumberofSeats         180 non-null    int64  
 6   PriceinGermany        180 non-null    int64  
 7   PriceinUK             136 non-null    object 
 8   Range (ml)            180 non-null    int64  
 9   Acceleration Per/Sec  180 non-null    float64
 10  Top Speed (KM/H)      180 non-null    int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 15.6+ KB


In [11]:
# Converting this to USD

df['Price_USD'] = df['PriceinGermany'] * 1.16025163537

In [12]:
df.head()

Unnamed: 0,Name,Subtitle,Efficiency,FastChargeSpeed,Power,NumberofSeats,PriceinGermany,PriceinUK,Range (ml),Acceleration Per/Sec,Top Speed (KM/H),Price_USD
0,Opel Ampera-e,Battery Electric Vehicle | 58 kWh,173 Wh/km,210 km/h,Front Wheel Drive,5,42990,,335,7.3,150,49879.217805
1,Renault Kangoo Maxi ZE 33,Battery Electric Vehicle | 31 kWh,194 Wh/km,-,Front Wheel Drive,5,0,"£31,680",160,22.4,130,0.0
2,Nissan Leaf,Battery Electric Vehicle | 36 kWh,164 Wh/km,230 km/h,Front Wheel Drive,5,29990,"£25,995",220,7.9,144,34795.946545
3,Audi e-tron Sportback 55 quattro,Battery Electric Vehicle | 86.5 kWh,231 Wh/km,600 km/h,All Wheel Drive,5,0,"£79,900",375,5.7,200,0.0
4,Porsche Taycan Turbo S,Battery Electric Vehicle | 83.7 kWh,215 Wh/km,860 km/h,All Wheel Drive,4,186336,"£138,830",390,2.8,260,216196.648728


In [13]:
df.Price_USD = df['Price_USD'].round(2)

In [14]:
df.head()

Unnamed: 0,Name,Subtitle,Efficiency,FastChargeSpeed,Power,NumberofSeats,PriceinGermany,PriceinUK,Range (ml),Acceleration Per/Sec,Top Speed (KM/H),Price_USD
0,Opel Ampera-e,Battery Electric Vehicle | 58 kWh,173 Wh/km,210 km/h,Front Wheel Drive,5,42990,,335,7.3,150,49879.22
1,Renault Kangoo Maxi ZE 33,Battery Electric Vehicle | 31 kWh,194 Wh/km,-,Front Wheel Drive,5,0,"£31,680",160,22.4,130,0.0
2,Nissan Leaf,Battery Electric Vehicle | 36 kWh,164 Wh/km,230 km/h,Front Wheel Drive,5,29990,"£25,995",220,7.9,144,34795.95
3,Audi e-tron Sportback 55 quattro,Battery Electric Vehicle | 86.5 kWh,231 Wh/km,600 km/h,All Wheel Drive,5,0,"£79,900",375,5.7,200,0.0
4,Porsche Taycan Turbo S,Battery Electric Vehicle | 83.7 kWh,215 Wh/km,860 km/h,All Wheel Drive,4,186336,"£138,830",390,2.8,260,216196.65


In [15]:
# Front Wheel Drive

df.Power == 'Front Wheel Drive'

0       True
1       True
2       True
3      False
4      False
       ...  
175    False
176    False
177     True
178    False
179    False
Name: Power, Length: 180, dtype: bool

In [16]:
# Adding a Front Wheel Drive Column
df['Is_front_wheel_drive'] = df.Power == 'Front Wheel Drive'

In [17]:
df.head()

Unnamed: 0,Name,Subtitle,Efficiency,FastChargeSpeed,Power,NumberofSeats,PriceinGermany,PriceinUK,Range (ml),Acceleration Per/Sec,Top Speed (KM/H),Price_USD,Is_front_wheel_drive
0,Opel Ampera-e,Battery Electric Vehicle | 58 kWh,173 Wh/km,210 km/h,Front Wheel Drive,5,42990,,335,7.3,150,49879.22,True
1,Renault Kangoo Maxi ZE 33,Battery Electric Vehicle | 31 kWh,194 Wh/km,-,Front Wheel Drive,5,0,"£31,680",160,22.4,130,0.0,True
2,Nissan Leaf,Battery Electric Vehicle | 36 kWh,164 Wh/km,230 km/h,Front Wheel Drive,5,29990,"£25,995",220,7.9,144,34795.95,True
3,Audi e-tron Sportback 55 quattro,Battery Electric Vehicle | 86.5 kWh,231 Wh/km,600 km/h,All Wheel Drive,5,0,"£79,900",375,5.7,200,0.0,False
4,Porsche Taycan Turbo S,Battery Electric Vehicle | 83.7 kWh,215 Wh/km,860 km/h,All Wheel Drive,4,186336,"£138,830",390,2.8,260,216196.65,False


In [18]:
# All Wheel Drive filter

df.Power == 'All Wheel Drive'

0      False
1      False
2      False
3       True
4       True
       ...  
175    False
176     True
177    False
178     True
179    False
Name: Power, Length: 180, dtype: bool

In [19]:
df['Is_All_Wheel_Drive'] = df.Power == 'All Wheel Drive'

In [20]:
df.head()

Unnamed: 0,Name,Subtitle,Efficiency,FastChargeSpeed,Power,NumberofSeats,PriceinGermany,PriceinUK,Range (ml),Acceleration Per/Sec,Top Speed (KM/H),Price_USD,Is_front_wheel_drive,Is_All_Wheel_Drive
0,Opel Ampera-e,Battery Electric Vehicle | 58 kWh,173 Wh/km,210 km/h,Front Wheel Drive,5,42990,,335,7.3,150,49879.22,True,False
1,Renault Kangoo Maxi ZE 33,Battery Electric Vehicle | 31 kWh,194 Wh/km,-,Front Wheel Drive,5,0,"£31,680",160,22.4,130,0.0,True,False
2,Nissan Leaf,Battery Electric Vehicle | 36 kWh,164 Wh/km,230 km/h,Front Wheel Drive,5,29990,"£25,995",220,7.9,144,34795.95,True,False
3,Audi e-tron Sportback 55 quattro,Battery Electric Vehicle | 86.5 kWh,231 Wh/km,600 km/h,All Wheel Drive,5,0,"£79,900",375,5.7,200,0.0,False,True
4,Porsche Taycan Turbo S,Battery Electric Vehicle | 83.7 kWh,215 Wh/km,860 km/h,All Wheel Drive,4,186336,"£138,830",390,2.8,260,216196.65,False,True


In [21]:
# Rear Wheel Drive

df.Power == 'Rear Wheel Drive'

0      False
1      False
2      False
3      False
4      False
       ...  
175     True
176    False
177    False
178    False
179     True
Name: Power, Length: 180, dtype: bool

In [22]:
df['Is_Rear_Wheel_Drive'] = df.Power == 'Rear Wheel Drive'

In [23]:
df.head()

Unnamed: 0,Name,Subtitle,Efficiency,FastChargeSpeed,Power,NumberofSeats,PriceinGermany,PriceinUK,Range (ml),Acceleration Per/Sec,Top Speed (KM/H),Price_USD,Is_front_wheel_drive,Is_All_Wheel_Drive,Is_Rear_Wheel_Drive
0,Opel Ampera-e,Battery Electric Vehicle | 58 kWh,173 Wh/km,210 km/h,Front Wheel Drive,5,42990,,335,7.3,150,49879.22,True,False,False
1,Renault Kangoo Maxi ZE 33,Battery Electric Vehicle | 31 kWh,194 Wh/km,-,Front Wheel Drive,5,0,"£31,680",160,22.4,130,0.0,True,False,False
2,Nissan Leaf,Battery Electric Vehicle | 36 kWh,164 Wh/km,230 km/h,Front Wheel Drive,5,29990,"£25,995",220,7.9,144,34795.95,True,False,False
3,Audi e-tron Sportback 55 quattro,Battery Electric Vehicle | 86.5 kWh,231 Wh/km,600 km/h,All Wheel Drive,5,0,"£79,900",375,5.7,200,0.0,False,True,False
4,Porsche Taycan Turbo S,Battery Electric Vehicle | 83.7 kWh,215 Wh/km,860 km/h,All Wheel Drive,4,186336,"£138,830",390,2.8,260,216196.65,False,True,False


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Name                  180 non-null    object 
 1   Subtitle              180 non-null    object 
 2   Efficiency            180 non-null    object 
 3   FastChargeSpeed       180 non-null    object 
 4   Power                 180 non-null    object 
 5   NumberofSeats         180 non-null    int64  
 6   PriceinGermany        180 non-null    int64  
 7   PriceinUK             136 non-null    object 
 8   Range (ml)            180 non-null    int64  
 9   Acceleration Per/Sec  180 non-null    float64
 10  Top Speed (KM/H)      180 non-null    int64  
 11  Price_USD             180 non-null    float64
 12  Is_front_wheel_drive  180 non-null    bool   
 13  Is_All_Wheel_Drive    180 non-null    bool   
 14  Is_Rear_Wheel_Drive   180 non-null    bool   
dtypes: bool(3), float64(2),

In [25]:
df.FastChargeSpeed

0      210 km/h
1             -
2      230 km/h
3      600 km/h
4      860 km/h
         ...   
175    390 km/h
176    820 km/h
177    340 km/h
178    470 km/h
179    520 km/h
Name: FastChargeSpeed, Length: 180, dtype: object

In [26]:
# Converting FastChargeSpeed to Int

speed = df.FastChargeSpeed[0]
speed

'210 km/h'

In [27]:
speed_int = speed.split(' km/h')[0]
speed_int

'210'

In [28]:
def convert_speed(speed):
    speed_int = speed.split(' km/h')[0]
    return speed_int

In [29]:
df['Speed New'] = df.FastChargeSpeed.apply(convert_speed)

In [30]:
df.head()

Unnamed: 0,Name,Subtitle,Efficiency,FastChargeSpeed,Power,NumberofSeats,PriceinGermany,PriceinUK,Range (ml),Acceleration Per/Sec,Top Speed (KM/H),Price_USD,Is_front_wheel_drive,Is_All_Wheel_Drive,Is_Rear_Wheel_Drive,Speed New
0,Opel Ampera-e,Battery Electric Vehicle | 58 kWh,173 Wh/km,210 km/h,Front Wheel Drive,5,42990,,335,7.3,150,49879.22,True,False,False,210
1,Renault Kangoo Maxi ZE 33,Battery Electric Vehicle | 31 kWh,194 Wh/km,-,Front Wheel Drive,5,0,"£31,680",160,22.4,130,0.0,True,False,False,-
2,Nissan Leaf,Battery Electric Vehicle | 36 kWh,164 Wh/km,230 km/h,Front Wheel Drive,5,29990,"£25,995",220,7.9,144,34795.95,True,False,False,230
3,Audi e-tron Sportback 55 quattro,Battery Electric Vehicle | 86.5 kWh,231 Wh/km,600 km/h,All Wheel Drive,5,0,"£79,900",375,5.7,200,0.0,False,True,False,600
4,Porsche Taycan Turbo S,Battery Electric Vehicle | 83.7 kWh,215 Wh/km,860 km/h,All Wheel Drive,4,186336,"£138,830",390,2.8,260,216196.65,False,True,False,860


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Name                  180 non-null    object 
 1   Subtitle              180 non-null    object 
 2   Efficiency            180 non-null    object 
 3   FastChargeSpeed       180 non-null    object 
 4   Power                 180 non-null    object 
 5   NumberofSeats         180 non-null    int64  
 6   PriceinGermany        180 non-null    int64  
 7   PriceinUK             136 non-null    object 
 8   Range (ml)            180 non-null    int64  
 9   Acceleration Per/Sec  180 non-null    float64
 10  Top Speed (KM/H)      180 non-null    int64  
 11  Price_USD             180 non-null    float64
 12  Is_front_wheel_drive  180 non-null    bool   
 13  Is_All_Wheel_Drive    180 non-null    bool   
 14  Is_Rear_Wheel_Drive   180 non-null    bool   
 15  Speed New             1

In [32]:
# Cleaning Speed New and removing Null values

df['Speed New'] = df['Speed New'].str.replace(r'-', '0')
df['Speed New'] = df['Speed New'].fillna(0)

In [33]:
df.head()

Unnamed: 0,Name,Subtitle,Efficiency,FastChargeSpeed,Power,NumberofSeats,PriceinGermany,PriceinUK,Range (ml),Acceleration Per/Sec,Top Speed (KM/H),Price_USD,Is_front_wheel_drive,Is_All_Wheel_Drive,Is_Rear_Wheel_Drive,Speed New
0,Opel Ampera-e,Battery Electric Vehicle | 58 kWh,173 Wh/km,210 km/h,Front Wheel Drive,5,42990,,335,7.3,150,49879.22,True,False,False,210
1,Renault Kangoo Maxi ZE 33,Battery Electric Vehicle | 31 kWh,194 Wh/km,-,Front Wheel Drive,5,0,"£31,680",160,22.4,130,0.0,True,False,False,0
2,Nissan Leaf,Battery Electric Vehicle | 36 kWh,164 Wh/km,230 km/h,Front Wheel Drive,5,29990,"£25,995",220,7.9,144,34795.95,True,False,False,230
3,Audi e-tron Sportback 55 quattro,Battery Electric Vehicle | 86.5 kWh,231 Wh/km,600 km/h,All Wheel Drive,5,0,"£79,900",375,5.7,200,0.0,False,True,False,600
4,Porsche Taycan Turbo S,Battery Electric Vehicle | 83.7 kWh,215 Wh/km,860 km/h,All Wheel Drive,4,186336,"£138,830",390,2.8,260,216196.65,False,True,False,860


In [34]:
# Converting Speed from an object to an integer

df['Speed New'] = df['Speed New'].astype(int)

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Name                  180 non-null    object 
 1   Subtitle              180 non-null    object 
 2   Efficiency            180 non-null    object 
 3   FastChargeSpeed       180 non-null    object 
 4   Power                 180 non-null    object 
 5   NumberofSeats         180 non-null    int64  
 6   PriceinGermany        180 non-null    int64  
 7   PriceinUK             136 non-null    object 
 8   Range (ml)            180 non-null    int64  
 9   Acceleration Per/Sec  180 non-null    float64
 10  Top Speed (KM/H)      180 non-null    int64  
 11  Price_USD             180 non-null    float64
 12  Is_front_wheel_drive  180 non-null    bool   
 13  Is_All_Wheel_Drive    180 non-null    bool   
 14  Is_Rear_Wheel_Drive   180 non-null    bool   
 15  Speed New             1

In [36]:
# Converting KM/H to MPH: MPH = KM/H ÷ 1.609344

df['Speed New'] = df['Speed New'] / 1.609344

In [37]:
df.head()

Unnamed: 0,Name,Subtitle,Efficiency,FastChargeSpeed,Power,NumberofSeats,PriceinGermany,PriceinUK,Range (ml),Acceleration Per/Sec,Top Speed (KM/H),Price_USD,Is_front_wheel_drive,Is_All_Wheel_Drive,Is_Rear_Wheel_Drive,Speed New
0,Opel Ampera-e,Battery Electric Vehicle | 58 kWh,173 Wh/km,210 km/h,Front Wheel Drive,5,42990,,335,7.3,150,49879.22,True,False,False,130.48795
1,Renault Kangoo Maxi ZE 33,Battery Electric Vehicle | 31 kWh,194 Wh/km,-,Front Wheel Drive,5,0,"£31,680",160,22.4,130,0.0,True,False,False,0.0
2,Nissan Leaf,Battery Electric Vehicle | 36 kWh,164 Wh/km,230 km/h,Front Wheel Drive,5,29990,"£25,995",220,7.9,144,34795.95,True,False,False,142.915374
3,Audi e-tron Sportback 55 quattro,Battery Electric Vehicle | 86.5 kWh,231 Wh/km,600 km/h,All Wheel Drive,5,0,"£79,900",375,5.7,200,0.0,False,True,False,372.822715
4,Porsche Taycan Turbo S,Battery Electric Vehicle | 83.7 kWh,215 Wh/km,860 km/h,All Wheel Drive,4,186336,"£138,830",390,2.8,260,216196.65,False,True,False,534.379225


In [38]:
# rounding speed to 2 digits

df['Speed New'] = df['Speed New'].round(2)

In [39]:
df.head()

Unnamed: 0,Name,Subtitle,Efficiency,FastChargeSpeed,Power,NumberofSeats,PriceinGermany,PriceinUK,Range (ml),Acceleration Per/Sec,Top Speed (KM/H),Price_USD,Is_front_wheel_drive,Is_All_Wheel_Drive,Is_Rear_Wheel_Drive,Speed New
0,Opel Ampera-e,Battery Electric Vehicle | 58 kWh,173 Wh/km,210 km/h,Front Wheel Drive,5,42990,,335,7.3,150,49879.22,True,False,False,130.49
1,Renault Kangoo Maxi ZE 33,Battery Electric Vehicle | 31 kWh,194 Wh/km,-,Front Wheel Drive,5,0,"£31,680",160,22.4,130,0.0,True,False,False,0.0
2,Nissan Leaf,Battery Electric Vehicle | 36 kWh,164 Wh/km,230 km/h,Front Wheel Drive,5,29990,"£25,995",220,7.9,144,34795.95,True,False,False,142.92
3,Audi e-tron Sportback 55 quattro,Battery Electric Vehicle | 86.5 kWh,231 Wh/km,600 km/h,All Wheel Drive,5,0,"£79,900",375,5.7,200,0.0,False,True,False,372.82
4,Porsche Taycan Turbo S,Battery Electric Vehicle | 83.7 kWh,215 Wh/km,860 km/h,All Wheel Drive,4,186336,"£138,830",390,2.8,260,216196.65,False,True,False,534.38


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Name                  180 non-null    object 
 1   Subtitle              180 non-null    object 
 2   Efficiency            180 non-null    object 
 3   FastChargeSpeed       180 non-null    object 
 4   Power                 180 non-null    object 
 5   NumberofSeats         180 non-null    int64  
 6   PriceinGermany        180 non-null    int64  
 7   PriceinUK             136 non-null    object 
 8   Range (ml)            180 non-null    int64  
 9   Acceleration Per/Sec  180 non-null    float64
 10  Top Speed (KM/H)      180 non-null    int64  
 11  Price_USD             180 non-null    float64
 12  Is_front_wheel_drive  180 non-null    bool   
 13  Is_All_Wheel_Drive    180 non-null    bool   
 14  Is_Rear_Wheel_Drive   180 non-null    bool   
 15  Speed New             1

In [41]:
df.Subtitle

0       Battery Electric Vehicle |       58 kWh 
1       Battery Electric Vehicle |       31 kWh 
2       Battery Electric Vehicle |       36 kWh 
3      Battery Electric Vehicle |       86.5 kWh
4      Battery Electric Vehicle |       83.7 kWh
                         ...                    
175     Battery Electric Vehicle |       65 kWh 
176     Battery Electric Vehicle |       76 kWh 
177     Battery Electric Vehicle |       57 kWh 
178    Battery Electric Vehicle |       76.6 kWh
179    Battery Electric Vehicle |       76.6 kWh
Name: Subtitle, Length: 180, dtype: object

In [42]:
# extracting the battery spec
battery = df.Subtitle[0]
battery

'Battery Electric Vehicle |       58 kWh '

In [43]:
battery_int = battery.split('       Battery Electric Vehicle |       ')
battery_int

['Battery Electric Vehicle |       58 kWh ']

In [44]:
battery_int = battery.split(' kWh ')
battery_int

['Battery Electric Vehicle |       58', '']

In [45]:
battery_int = battery.split('Battery Electric Vehicle |       ',)[1]
battery_int

'58 kWh '

In [48]:
battery_int = battery.replace('Battery Electric Vehicle |       ', '')
battery_int

battery_int = battery.split(' kWh ')[0]
battery_int = battery.split('Battery Electric Vehicle |       ',)[1]
battery_int

battery_int = battery.split(' kWh ', n=1)[0]
battery_int

TypeError: 'n' is an invalid keyword argument for split()

In [49]:
new = df['Subtitle'].str.split('Battery Electric Vehicle |       ', n=1, expand=True)[1]
new

0       |       58 kWh 
1       |       31 kWh 
2       |       36 kWh 
3      |       86.5 kWh
4      |       83.7 kWh
             ...       
175     |       65 kWh 
176     |       76 kWh 
177     |       57 kWh 
178    |       76.6 kWh
179    |       76.6 kWh
Name: 1, Length: 180, dtype: object

In [50]:
one = new.str.split('       |       ', n=1, expand=True)[1]
one

0       58 kWh 
1       31 kWh 
2       36 kWh 
3      86.5 kWh
4      83.7 kWh
         ...   
175     65 kWh 
176     76 kWh 
177     57 kWh 
178    76.6 kWh
179    76.6 kWh
Name: 1, Length: 180, dtype: object

In [51]:
two = one.str.split(' kWh ', n=1, expand=True)[0]
two

0            58
1            31
2            36
3      86.5 kWh
4      83.7 kWh
         ...   
175          65
176          76
177          57
178    76.6 kWh
179    76.6 kWh
Name: 0, Length: 180, dtype: object

In [60]:
df['Battery_clean'] = two.str.split(' kWh', n=1, expand=True)[0]
df.tail()

Unnamed: 0,Name,Subtitle,Efficiency,FastChargeSpeed,Power,NumberofSeats,PriceinGermany,PriceinUK,Range (ml),Acceleration Per/Sec,Top Speed (KM/H),Price_USD,Is_front_wheel_drive,Is_All_Wheel_Drive,Is_Rear_Wheel_Drive,Speed New,Battery_clean
175,MG Marvel R,Battery Electric Vehicle | 65 kWh,191 Wh/km,390 km/h,Rear Wheel Drive,5,40000,,340,7.9,200,46410.07,False,False,True,242.33,65.0
176,Tesla Model 3 Long Range Dual Motor,Battery Electric Vehicle | 76 kWh,155 Wh/km,820 km/h,All Wheel Drive,5,53560,,490,4.4,233,62143.08,False,True,False,509.52,76.0
177,MG MG5 EV Long Range,Battery Electric Vehicle | 57 kWh,168 Wh/km,340 km/h,Front Wheel Drive,5,0,"£26,495",340,7.7,185,0.0,True,False,False,211.27,57.0
178,Audi Q4 e-tron 45 quattro,Battery Electric Vehicle | 76.6 kWh,199 Wh/km,470 km/h,All Wheel Drive,5,50900,,385,6.9,180,59056.81,False,True,False,292.04,76.6
179,Audi Q4 Sportback e-tron 40,Battery Electric Vehicle | 76.6 kWh,180 Wh/km,520 km/h,Rear Wheel Drive,5,49500,,425,8.5,160,57432.46,False,False,True,323.11,76.6


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Name                  180 non-null    object 
 1   Subtitle              180 non-null    object 
 2   Efficiency            180 non-null    object 
 3   FastChargeSpeed       180 non-null    object 
 4   Power                 180 non-null    object 
 5   NumberofSeats         180 non-null    int64  
 6   PriceinGermany        180 non-null    int64  
 7   PriceinUK             136 non-null    object 
 8   Range (ml)            180 non-null    int64  
 9   Acceleration Per/Sec  180 non-null    float64
 10  Top Speed (KM/H)      180 non-null    int64  
 11  Price_USD             180 non-null    float64
 12  Is_front_wheel_drive  180 non-null    bool   
 13  Is_All_Wheel_Drive    180 non-null    bool   
 14  Is_Rear_Wheel_Drive   180 non-null    bool   
 15  Speed New             1

In [64]:
# Convert battery_clean to int

df['Battery_clean'] = df['Battery_clean'].astype(float)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Name                  180 non-null    object 
 1   Subtitle              180 non-null    object 
 2   Efficiency            180 non-null    object 
 3   FastChargeSpeed       180 non-null    object 
 4   Power                 180 non-null    object 
 5   NumberofSeats         180 non-null    int64  
 6   PriceinGermany        180 non-null    int64  
 7   PriceinUK             136 non-null    object 
 8   Range (ml)            180 non-null    int64  
 9   Acceleration Per/Sec  180 non-null    float64
 10  Top Speed (KM/H)      180 non-null    int64  
 11  Price_USD             180 non-null    float64
 12  Is_front_wheel_drive  180 non-null    bool   
 13  Is_All_Wheel_Drive    180 non-null    bool   
 14  Is_Rear_Wheel_Drive   180 non-null    bool   
 15  Speed New             1

In [65]:
df.to_csv('clean_data_latest.csv', index=False)