In [265]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [266]:
import warnings
warnings.filterwarnings('ignore')
warnings.warn("this will not show")

In [267]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_info_columns', 500)
pd.set_option('display.max_info_rows', 2000)
pd.set_option('display.expand_frame_repr', True)
pd.set_option('display.width', 2000)

In [268]:
pd.set_option('display.max_columns', None)

In [269]:
scout_raw = pd.read_json('scout_car.json', lines=True)
scout = pd.read_json('scout_car.json', lines=True)

In [270]:
scout.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15919 entries, 0 to 15918
Data columns (total 54 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   url                            object 
 1   make_model                     object 
 2   short_description              object 
 3   body_type                      object 
 4   price                          int64  
 5   vat                            object 
 6   km                             object 
 7   registration                   object 
 8   prev_owner                     object 
 9   kW                             float64
 10  hp                             object 
 11  Type                           object 
 12  Previous Owners                object 
 13  Next Inspection                object 
 14  Inspection new                 object 
 15  Warranty                       object 
 16  Full Service                   object 
 17  Non-smoking Vehicle            object 
 18  null  

In [271]:
scout.shape

(15919, 54)

In [272]:
scout.describe()

Unnamed: 0,price,kW
count,15919.0,0.0
mean,18019.896727,
std,7386.169409,
min,13.0,
25%,12850.0,
50%,16900.0,
75%,21900.0,
max,74600.0,


## Missing Value Check

First of all let's examine the percentage of missing values.

In [273]:
(100 - scout.isnull().sum()*100/scout.shape[0]).sort_values()

kW                                 0.000000
Last Timing Belt Service Date      0.100509
Electricity consumption            0.860607
Available from                     1.708650
Last Service Date                  3.555500
Availability                       3.988944
Other Fuel Types                   5.527985
Next Inspection                   22.206169
Inspection new                    24.700044
Emission Label                    25.032979
Model Code                        31.270808
Non-smoking Vehicle               45.084490
Country version                   47.653747
Full Service                      51.605000
Weight                            56.190715
Drive chain                       56.919404
prev_owner                        57.107859
Previous Owners                   58.288837
Paint Type                        63.741441
Cylinders                         64.319367
Warranty                          65.952635
Gears                             70.400151
vat                             

## Dealing With Insufficient & Irrelevant Columns

We are going to consantrate on the columns that have high percentage of missing values. To do so we assume columns that have more than 35% missing values as columns of interest. 

In [274]:
def columns_interest(data, limit):
    interest = data.isnull().sum()*100/data.shape[0]
    return (interest.loc[lambda x : x > limit]).sort_values()
columns_interest(scout, 35)

Cylinders                         35.680633
Paint Type                        36.258559
Previous Owners                   41.711163
prev_owner                        42.892141
Drive chain                       43.080596
Weight                            43.809285
Full Service                      48.395000
Country version                   52.346253
Non-smoking Vehicle               54.915510
Model Code                        68.729192
Emission Label                    74.967021
Inspection new                    75.299956
Next Inspection                   77.793831
Other Fuel Types                  94.472015
Availability                      96.011056
Last Service Date                 96.444500
Available from                    98.291350
Electricity consumption           99.139393
Last Timing Belt Service Date     99.899491
kW                               100.000000
dtype: float64

- We can drop columns that have 90% or more missing values and investigate the rest of the columns of interest for the further phases of project.

- 'Other Fuel Types', 'Availability', 'Last Service Date', 'Available from', 'Electricity consumption', 'Last Timing Belt Service Date' and 'kW' will be dropped.

In [275]:
drop_lst_interest = ['Other Fuel Types', 'Availability', 'Last Service Date', 'Available from', 'Electricity consumption', 'Last Timing Belt Service Date', 'kW']

In [276]:
scout.drop(columns = drop_lst_interest, axis = 1, inplace = True)

Now we are going to investigate each column of interest.

### scout['Cylinders']

In [277]:
columns_interest(scout, 35).Cylinders

35.680633205603364

In [278]:
scout.Cylinders.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n4\n]    8105
NaN        5680
[\n3\n]    2104
[\n5\n]      22
[\n6\n]       3
[\n2\n]       2
[\n8\n]       2
[\n1\n]       1
Name: Cylinders, dtype: int64

In [279]:
scout.Cylinders.str[0].str.strip().value_counts(dropna = False)

4      8105
NaN    5680
3      2104
5        22
6         3
8         2
2         2
1         1
Name: Cylinders, dtype: int64

In [280]:
scout['Cylinders'] = scout.Cylinders.str[0].str.strip()

We are going to keep the column and deal with the missing values in further phases of the project

### scout['Paint Type']

In [281]:
columns_interest(scout, 35)['Paint Type']

36.258558954708214

In [282]:
scout['Paint Type'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\nMetallic\n]       9794
NaN                  5772
[\nUni/basic\n]       347
[\nPerl effect\n]       6
Name: Paint Type, dtype: int64

In [283]:
scout['Paint Type'].str[0].str.strip().value_counts(dropna = False)

Metallic       9794
NaN            5772
Uni/basic       347
Perl effect       6
Name: Paint Type, dtype: int64

In [284]:
scout['Paint_Type'] = scout['Paint Type'].str[0].str.strip()
scout.drop(columns = 'Paint Type', axis = 1, inplace = True)

### scout['Previous Owners'] & scout['prev_owner']

These two columns seems to contain similar data we are going to assess them together.

In [285]:
columns_interest(scout, 35)['Previous Owners']

41.71116276147999

In [286]:
columns_interest(scout, 35)['prev_owner']

42.8921414661725

In [287]:
scout['Previous Owners'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


\n1\n                                                                                        8101
NaN                                                                                          6640
\n2\n                                                                                         766
\n0\n                                                                                         163
\n3\n                                                                                          17
                                                                                             ... 
[\n1\n, \n181 g CO2/km (comb)\n]                                                                1
[\n1\n, \n, 6.1 l/100 km (comb), \n, 7.7 l/100 km (city), \n, 5.2 l/100 km (country), \n]       1
[\n1\n, \nEuro 6\n]                                                                             1
[\n1\n, \n, 5.9 l/100 km (comb), \n, 7.6 l/100 km (city), \n, 4.9 l/100 km (country), \n]       1
[\n1\n, \n102 g CO2/

#### scout['Previous Owners'

In [288]:
scout['Previous Owners'].str.strip().value_counts(dropna = False)

1      8101
NaN    6870
2       766
0       163
3        17
4         2
Name: Previous Owners, dtype: int64

In [289]:
scout['Previous Owners'].str[0].str[1].value_counts(dropna = False)

NaN    15689
1        193
0         25
2         12
Name: Previous Owners, dtype: int64

In [290]:
scout['Previous_Owners'] = scout['Previous Owners'].str.strip()

In [291]:
scout['Previous_Owners_add'] = scout['Previous Owners'].str[0].str[1]

- Now that we found additional useful previous owner values we are going to fill main column with the additional values.

In [292]:
scout.Previous_Owners.fillna(scout.Previous_Owners_add, inplace = True)

In [293]:
scout.Previous_Owners.value_counts(dropna = False)

1      8294
NaN    6640
2       778
0       188
3        17
4         2
Name: Previous_Owners, dtype: int64

In [294]:
scout['prev_owner'].value_counts(dropna = False)

1 previous owner     8294
NaN                  6828
2 previous owners     778
3 previous owners      17
4 previous owners       2
Name: prev_owner, dtype: int64

In [295]:
scout['prev_owner'].str[0].value_counts(dropna = False)

1      8294
NaN    6828
2       778
3        17
4         2
Name: prev_owner, dtype: int64

In [296]:
scout['prev_owner'] = scout['prev_owner'].str[0].value_counts(dropna = False)

In [297]:
scout.drop(columns = ['prev_owner', 'Previous Owners', 'Previous_Owners_add'], inplace = True)

As 'Previous Owners' contains same values including 188 first owner (0) we dropped 'prev_owner' and also 'Previous Owners' and 'Previous_Owners_add' are no longer needed to be kept.

### scout['Drive chain']

In [298]:
columns_interest(scout, 35)['Drive chain']

43.08059551479364

In [299]:
scout['Drive chain'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\nfront\n]    8886
NaN            6858
[\n4WD\n]       171
[\nrear\n]        4
Name: Drive chain, dtype: int64

In [300]:
scout['Drive chain'].str[0].str.strip().value_counts(dropna = False)

front    8886
NaN      6858
4WD       171
rear        4
Name: Drive chain, dtype: int64

In [301]:
scout['drive_chain'] = scout['Drive chain'].str[0].str.strip()

In [302]:
scout.drop(columns = 'Drive chain', inplace = True)

### scout['Weight']

In [303]:
columns_interest(scout, 35).Weight

43.8092845027954

In [304]:
scout.Weight.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN               6974
[\n1,163 kg\n]     574
[\n1,360 kg\n]     356
[\n1,165 kg\n]     301
[\n1,335 kg\n]     242
                  ... 
[\n1,030 kg\n]       1
[\n1,206 kg\n]       1
[\n1,492 kg\n]       1
[\n1,057 kg\n]       1
[\n1,939 kg\n]       1
Name: Weight, Length: 435, dtype: int64

In [305]:
scout.Weight.str[0].str.strip().str.extract('(\d,*\d*)')[0].str.replace(',', '').value_counts(dropna = False)

NaN     6974
1163     574
1360     356
1165     301
1335     242
        ... 
1648       1
1397       1
1792       1
2044       1
1523       1
Name: 0, Length: 435, dtype: int64

In [306]:
scout['Weight'] = scout.Weight.str[0].str.strip().str.extract('(\d,*\d*)')[0].str.replace(',', '')

Regarding its high Null value percentage and intuitively being not price related factor we will drop 'Weight' column.

In [307]:
scout.drop(columns = 'Weight', inplace = True)

# TO BE FILLED

# Step-1: Dropping irrelevent/insufficient columns

In [308]:
scout['Model'].value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, A3, \n]          3097
[\n, A1, \n]          2614
[\n, Insignia, \n]    2598
[\n, Astra, \n]       2526
[\n, Corsa, \n]       2219
[\n, Clio, \n]        1839
[\n, Espace, \n]       991
[\n, Duster, \n]        34
[\n, A2, \n]             1
Name: Model, dtype: int64

In [309]:
scout.drop(['kW','url','null','Offer Number','Model Code',\
            'Electricity consumption','Other Fuel Types','Availability',\
            'Last Timing Belt Service Date','Available from'], axis=1, inplace=True)

KeyError: "['kW' 'Electricity consumption' 'Other Fuel Types' 'Availability'\n 'Last Timing Belt Service Date' 'Available from'] not found in axis"

In [None]:
scout.shape

# Step-2: Column by column data cleaning

In [None]:
scout.iloc[:, 0:5].info()

## 1- scout['make_model']

In [None]:
scout['make_model'].value_counts()

## 2- scout['short_description']

### 2.1- scout['__cc']

In [None]:
scout['make_model'].describe()

Column-2: scout['short_description']

In [None]:
scout['short_description'].value_counts()

In [None]:
def single_list(x):
    return [i for i in x]

In [None]:
def split_jungle(x):
    return [w for sublist in (i.split(" ") if i is not None else '' for i in x) for w in sublist]    

In [None]:
def split_sublist(x):
    return [i.split(" ") if i is not None else '' for i in x]  

In [None]:
def re_description(x):
    return [j for j in (re.findall('.*([1-3]{1}\.\d{1}).*',i) if i is not None else '' for i in x)]

In [None]:
cc=scout['short_description'].str.extract('\D*([0-3]{1}\.\d{1})[\D ]')
scout['__cc']=cc[0]

In [None]:
scout['__cc'].value_counts()

### 2.2- scout['__xx']

In [None]:
scout['short_description'].value_counts()

In [None]:
#xx=scout['short_description'].str.extract('')

## 3- scout['body_type']

In [None]:
scout['body_type'].value_counts()

In [None]:
body_null = scout[scout['body_type'].isnull()]

In [None]:
body_group = scout.groupby('make_model')['body_type'].apply(pd.DataFrame)
body_group['Audi A3'].value_counts()

In [None]:
body_null.groupby('make_model')['short_description'].apply(pd.DataFrame)

## 4- scout['price']

In [None]:
scout['price'].isna().sum()

In [None]:
scout['price'].dtypes

## 5- scout['vat']

In [None]:
scout['vat'].isna().sum()

In [None]:
scout['vat'].value_counts()

In [None]:
#cats = ['', '']
#cat_dtype = pd.api.types.CategoricalDtype(categories=cats, ordered=True)
#weather['rating'] = weather['rating'].astype(cat_dtype)

## 6- scout['km']

In [None]:
scout['km'].value_counts()

In [None]:
def numerization(x):
    return [re.sub('\D', '', i) if i is not None else '' for i in x]

In [None]:
scout['km']=numerization(scout['km'])
scout['km']=pd.to_numeric(scout['km'])

In [None]:
scout['km'].dtypes

## 7- scout['registration']

In [None]:
scout['registration'].head()

In [None]:
dates = []
wrong_dates = []
for i in scout['registration'].unique():
    try: 
        date = pd.to_datetime(i)
        dates.append(i)
    except:
        wrong_dates.append(i)

In [None]:
wrong_dates

In [None]:
scout['registration'] = pd.to_datetime(scout['registration'], errors='coerce')

In [None]:
scout['registration'].dtypes

In [None]:
scout['registration'].value_counts().sum()

## 8- scout['prev_owner']

In [None]:
scout['prev_owner'].value_counts()

In [None]:
scout['prev_owner']=numerization(scout['prev_owner'])

In [None]:
scout['prev_owner']=pd.to_numeric(scout['prev_owner'])

## 9- scout['hp']

In [None]:
scout['hp'].isnull().sum()

In [None]:
scout['hp'].value_counts()

In [None]:
scout['hp']=numerization(scout['hp'])

In [None]:
scout['hp']=pd.to_numeric(scout['hp'])

In [None]:
scout['hp'].dtypes

In [None]:
scout[scout['hp'] < 30]['hp'].value_counts()

## 10- scout['Type']

In [None]:
scout['Type'].isnull().sum()

In [None]:
scout['Type'].value_counts()

In [None]:
def explosion(df,column,new_column_prefix):
    df_exploded = df[column].apply(pd.Series)
    df_exploded = df_exploded.rename(columns = lambda x : new_column_prefix + str(x))
    df = pd.concat([df[:], df_exploded[:]], axis=1)    
    return df

### 10.1- scout['__status']

In [None]:
scout = explosion(scout,'Type','Type_')

In [None]:
scout.drop(columns=['Type','Type_0','Type_2'], inplace=True)

In [None]:
scout.head(2)

In [None]:
scout.rename(columns={'Type_1':'__status'}, inplace=True)

In [None]:
scout['__status'].dtypes

### 10.2- scout['fuel_from_type']

In [None]:
scout.rename(columns={'Type_3':'__fuel_from_type'}, inplace=True)

In [None]:
scout['__fuel_from_type'].dtypes

In [None]:
scout['__fuel_from_type'].value_counts()

In [None]:
scout['Fuel'].value_counts()

## 11- scout['Previous Owners']

In [None]:
scout['Previous Owners'].value_counts()

In [None]:
scout = explosion(scout,'Previous Owners','Prev_')

In [None]:
scout.tail()

In [None]:
scout.drop(columns=['Previous Owners','Prev_1','Prev_2','Prev_3','Prev_4','Prev_5','Prev_6','Prev_7'], inplace=True)

In [None]:
def newline_erase(df,column):
    column = df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True)[column]
    return column

In [None]:
scout['Prev_0'] = scout['Prev_0'].str.extract('(\d)')

In [None]:
scout.rename(columns={'Prev_0':'__prev_owner'}, inplace=True)
scout = scout.astype({'__prev_owner' : 'int'}, copy = False, errors='ignore')
scout.drop(columns=['prev_owner'], inplace=True)

## 12- scout['Next Inspection']

In [None]:
scout['Next Inspection'].value_counts()

In [None]:
scout = explosion(scout,'Next Inspection','_next_insp')

In [None]:
scout.drop(columns=['Next Inspection', '_next_insp1', '_next_insp2', '_next_insp3', '_next_insp4', '_next_insp5', '_next_insp6', '_next_insp7'], inplace=True)

In [None]:
scout['_next_insp0'] = scout['_next_insp0'].str.extract('(\d{2}/\d{4})')

In [None]:
scout['_next_insp0'].value_counts()

In [None]:
scout = scout.astype({'_next_insp0' : 'int'}, copy = False, errors='ignore')
scout.rename(columns={'_next_insp0':'__next_insp'}, inplace=True)

## 13- scout['Inspection new']

In [None]:
scout['Inspection new'].value_counts()

In [None]:
scout = explosion(scout, 'Inspection new', '__Insp_New_')

In [None]:
scout['__Insp_New_0'].value_counts()

In [None]:
scout.drop(columns=['Inspection new', '__Insp_New_1', '__Insp_New_2', '__Insp_New_3', '__Insp_New_4', '__Insp_New_5', '__Insp_New_6', '__Insp_New_7'], inplace=True)

In [None]:
scout['__Insp_New_0'] = scout['__Insp_New_0'].str.extract('(Yes)')

In [None]:
scout = scout.astype({'__Insp_New_0' : 'bool'}, copy = False, errors='ignore')

In [None]:
scout.rename(columns={'__Insp_New_0':'__Insp_New'}, inplace=True)

In [None]:
scout['__Insp_New']

# FIX ABOVE

## 14. scout['Warranty']

In [310]:
scout.Warranty.isnull().sum()/scout.Warranty.shape[0]*100

34.047364784220115

In [311]:
scout.Warranty.isnull().sum()

5420

In [312]:
scout.Warranty.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                         5420
[\n, \n, \nEuro 6\n]                        1868
\n12 months\n                               1177
\n                                           979
\n24 months\n                                566
                                            ... 
[\n24 months\n, \n128 g CO2/km (comb)\n]       1
[\n60 months\n, \n98 g CO2/km (comb)\n]        1
[\n20 months\n, \n139 g CO2/km (comb)\n]       1
[\n10 months\n, \n104 g CO2/km (comb)\n]       1
[\n16 months\n, \n116 g CO2/km (comb)\n]       1
Name: Warranty, Length: 516, dtype: int64

In [313]:
scout.Warranty.value_counts(dropna = False).sample(50)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n12 months\n, \n14,457 g CO2/km (comb)\n]                                                             1
[\n6 months\n, \n108 g CO2/km (comb)\n]                                                                 1
\n12 months\n                                                                                        1177
[\n60 months\n, \n, 6.2 l/100 km (comb), \n, 7.6 l/100 km (city), \n, 5.4 l/100 km (country), \n]       3
[\n36 months\n, \n4 (Green)\n]                                                                         30
[\n6 months\n, \n98 g CO2/km (comb)\n]                                                                  1
[\n60 months\n, \n137 g CO2/km (comb)\n]                                                                3
[\n, \n, \n151 g CO2/km (comb)\n]                                                                       5
[\n11 months\n, \n117 g CO2/km (comb)\n]                                                                1
[\n72 months\n, \n, 6.1 l/100 km (comb), \n, 7

In [314]:
warranty_lst = [''.join(item).strip() if isinstance(item,list) else item for item in scout['Warranty']]

In [315]:
scout['warranty_mnth'] = pd.DataFrame(warranty_lst)

In [316]:
scout['warranty_mnth'] = scout.warranty_mnth.str.extract('(\d{1,3}) months')

In [317]:
scout['warranty_mnth'].value_counts(dropna = 0)

NaN    11066
12      2594
24      1118
60       401
36       279
48       149
6        125
72        59
3         33
23        11
18        10
20         7
25         6
2          5
26         4
50         4
16         4
34         3
4          3
13         3
1          3
19         3
14         2
28         2
22         2
45         2
9          2
46         2
11         2
21         2
17         2
49         1
10         1
7          1
65         1
15         1
33         1
47         1
56         1
40         1
30         1
8          1
Name: warranty_mnth, dtype: int64

In [318]:
scout['warranty_mnth'].isnull().sum()/scout['warranty_mnth'].shape[0]*100

69.51441673471952

Since 69% is null we will drop 'Warranty' and 'warranty_mnth'. We also drop non related 'url' column.

In [319]:
scout.drop(columns = ['warranty_mnth', 'Warranty', 'url'], inplace = True)

## 17. scout['Make']

In [320]:
scout['Make'].value_counts(dropna = False)

\nOpel\n       7343
\nAudi\n       5712
\nRenault\n    2864
Name: Make, dtype: int64

In [321]:
scout['Make'] = scout['Make'].str.strip()

In [322]:
scout['Make'].value_counts()

Opel       7343
Audi       5712
Renault    2864
Name: Make, dtype: int64

## 18-scout['Model']

In [323]:
scout['Model'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, A3, \n]          3097
[\n, A1, \n]          2614
[\n, Insignia, \n]    2598
[\n, Astra, \n]       2526
[\n, Corsa, \n]       2219
[\n, Clio, \n]        1839
[\n, Espace, \n]       991
[\n, Duster, \n]        34
[\n, A2, \n]             1
Name: Model, dtype: int64

In [324]:
scout['Model'] = scout.Model.str[1]

## scout['Body Color']

In [325]:
scout['Body Color'].value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, Black, \n]     3745
[\n, Grey, \n]      3505
[\n, White, \n]     3406
[\n, Silver, \n]    1647
[\n, Blue, \n]      1431
[\n, Red, \n]        957
[\n, Brown, \n]      289
[\n, Green, \n]      154
[\n, Beige, \n]      108
[\n, Yellow, \n]      51
[\n, Violet, \n]      18
[\n, Bronze, \n]       6
[\n, Orange, \n]       3
[\n, Gold, \n]         2
Name: Body Color, dtype: int64

In [326]:
scout['body_color'] = scout['Body Color'].str[1].str.strip()

In [327]:
scout.body_color.fillna(method = 'bfill', inplace = True)

In [328]:
scout.body_color.value_counts(dropna = False)

Black     3888
Grey      3638
White     3540
Silver    1687
Blue      1524
Red        989
Brown      299
Green      163
Beige      108
Yellow      53
Violet      18
Bronze       7
Orange       3
Gold         2
Name: body_color, dtype: int64

## scout['Body Color Original']

In [331]:
scout['Body Color Original'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                             3759
[\nOnyx Schwarz\n]               338
[\nBianco\n]                     282
[\nMythosschwarz Metallic\n]     238
[\nBrillantschwarz\n]            216
                                ... 
[\nBianca - Tetto Nero\n]          1
[\ndezir rot\n]                    1
[\nnero/tetto argento met\n]       1
[\nPython Yellow Metallic\n]       1
[\nkarbongrau\n]                   1
Name: Body Color Original, Length: 1928, dtype: int64

In [332]:
scout.drop(columns='Body Color Original', inplace = True)

## 23-scout['Upholstery']

In [333]:
scout['Upholstery'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\nCloth, Black\n]           5821
NaN                          3720
[\nPart leather, Black\n]    1121
[\nCloth\n]                  1005
[\nCloth, Grey\n]             891
[\nCloth, Other\n]            639
[\nFull leather, Black\n]     575
[\nBlack\n]                   491
[\nGrey\n]                    273
[\nOther, Other\n]            182
[\nPart leather\n]            140
[\nFull leather\n]            139
[\nFull leather, Brown\n]     116
[\nPart leather, Grey\n]      116
[\nOther, Black\n]            110
[\nFull leather, Other\n]      72
[\nFull leather, Grey\n]       67
[\nPart leather, Other\n]      65
[\nOther\n]                    56
[\nPart leather, Brown\n]      50
[\nalcantara, Black\n]         47
[\nVelour, Black\n]            36
[\nFull leather, Beige\n]      36
[\nCloth, Brown\n]             28
[\nVelour\n]                   16
[\nOther, Grey\n]              15
[\nCloth, Beige\n]             13
[\nCloth, Blue\n]              12
[\nBrown\n]                    12
[\nVelour, Gre

In [335]:
scout['Upholstery'].isnull().sum()/scout['Upholstery'].shape[0]*100

23.368302029021923

In [339]:
scout['Upholstery'] = scout.Upholstery.str[0].str.strip()

In [None]:
## scout['Nr. of Doors']

In [340]:
scout['Nr. of Doors'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n5\n]    11575
[\n4\n]     3079
[\n3\n]      832
[\n2\n]      219
NaN          212
[\n7\n]        1
[\n1\n]        1
Name: Nr. of Doors, dtype: int64

In [343]:
scout['door_number'] = scout['Nr. of Doors'].str[0].str.strip()

In [344]:
scout.drop(columns = 'Nr. of Doors', inplace = True)

## scout['Nr. of Seats']

In [345]:
scout['Nr. of Seats'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n5\n]    13336
[\n4\n]     1125
NaN          977
[\n7\n]      362
[\n2\n]      116
[\n6\n]        2
[\n3\n]        1
Name: Nr. of Seats, dtype: int64

In [347]:
scout['seat_number'] = scout['Nr. of Seats'].str[0].str.strip()

In [350]:
scout.drop(columns = 'Nr. of Seats', inplace = True)

## scout['Gearing Type']

In [351]:
scout['Gearing Type'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, Manual, \n]            8153
[\n, Automatic, \n]         7297
[\n, Semi-automatic, \n]     469
Name: Gearing Type, dtype: int64

In [354]:
scout['gear_type'] = scout['Gearing Type'].str[1]

In [355]:
scout.drop(columns = 'Gearing Type', inplace = True)

## scout['Displacement']

In [356]:
scout['Displacement'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n1,598 cc\n]     4761
[\n999 cc\n]       2438
[\n1,398 cc\n]     1314
[\n1,399 cc\n]      749
[\n1,229 cc\n]      677
                   ... 
[\n1,800 cc\n]        1
[\n140 cc\n]          1
[\n15,898 cc\n]       1
[\n1,686 cc\n]        1
[\n1,368 cc\n]        1
Name: Displacement, Length: 78, dtype: int64

In [359]:
scout['Displacement'] = scout.Displacement.str[0].str.strip()

In [364]:
scout.Displacement.replace({',':'', ' cc':''}, regex = True, inplace = True)

In [None]:
oto['Displacement']