In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("capstone_data_clean.csv")
df["Adv_Year"] = df["Adv_Year"].astype("category")
df["Adv_Month"] = df["Adv_Month"].astype("category")
df["Gearbox_Type"] = df["Gearbox_Type"].astype("category")
df["Fuel_Type"] = df["Fuel_Type"].astype("category")

In [3]:
df.dtypes

Maker             object
Genmodel          object
Genmodel_ID       object
Adv_ID            object
Adv_Year        category
Adv_Month       category
Color             object
Body_Type         object
Gearbox_Type    category
Fuel_Type       category
Reg_Year         float64
Mileage          float64
Engine_Size      float64
Price            float64
Engine_Power     float64
Annual_Tax       float64
Wheelbase        float64
Height           float64
Width            float64
Length           float64
Average_Mpg      float64
Top_Speed        float64
Seat_Num         float64
Door_Num         float64
dtype: object

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268252 entries, 0 to 268251
Data columns (total 24 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   Maker         268252 non-null  object  
 1   Genmodel      268252 non-null  object  
 2   Genmodel_ID   268252 non-null  object  
 3   Adv_ID        268252 non-null  object  
 4   Adv_Year      268252 non-null  category
 5   Adv_Month     268252 non-null  category
 6   Color         246377 non-null  object  
 7   Body_Type     267298 non-null  object  
 8   Gearbox_Type  268085 non-null  category
 9   Fuel_Type     267843 non-null  category
 10  Reg_Year      268245 non-null  float64 
 11  Mileage       267197 non-null  float64 
 12  Engine_Size   266188 non-null  float64 
 13  Price         267107 non-null  float64 
 14  Engine_Power  236442 non-null  float64 
 15  Annual_Tax    220716 non-null  float64 
 16  Wheelbase     232831 non-null  float64 
 17  Height        239507 non-null

In [7]:
df.head()

Unnamed: 0,Maker,Genmodel,Genmodel_ID,Adv_ID,Adv_Year,Adv_Month,Color,Body_Type,Gearbox_Type,Fuel_Type,...,Engine_Power,Annual_Tax,Wheelbase,Height,Width,Length,Average_Mpg,Top_Speed,Seat_Num,Door_Num
0,Bentley,Arnage,10_1,10_1_1,2018,Apr,Silver,Saloon,Automatic,Petrol,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0
1,Bentley,Arnage,10_1,10_1_2,2018,Jun,Grey,Saloon,Automatic,Petrol,...,450.0,315.0,3116.0,1515.0,2125.0,5390.0,13.7,179.0,5.0,4.0
2,Bentley,Arnage,10_1,10_1_3,2017,Nov,Blue,Saloon,Automatic,Petrol,...,400.0,315.0,3116.0,1515.0,2125.0,5390.0,14.7,155.0,5.0,4.0
3,Bentley,Arnage,10_1,10_1_4,2018,Apr,Green,Saloon,Automatic,Petrol,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0
4,Bentley,Arnage,10_1,10_1_5,2017,Nov,Grey,Saloon,Automatic,Petrol,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0


In [8]:
df["Price"].describe()

count    2.671070e+05
mean     1.475596e+04
std      3.232048e+04
min      1.000000e+02
25%      4.990000e+03
50%      9.299000e+03
75%      1.715000e+04
max      9.999999e+06
Name: Price, dtype: float64

In [9]:
df["Price"].mode()

0    3995.0
Name: Price, dtype: float64

In [10]:
df["Price"].skew()

125.96446025762475

### Price normal distribution test:
> H0: the variable is normally distributed
> H1: the variable is not normally distributed

In [11]:
stats.normaltest(df["Price"], nan_policy="omit")

NormaltestResult(statistic=1141533.6021175836, pvalue=0.0)

There's a reason to reject H0 in favor of H1.

## Numeric features' influence on Price

### Spearman correlations
Non-parametric rank-based test. No assumptions about the distributions or data, except that variables should be at least ordinal. 

In [12]:
spearman = df.corr(method="spearman", numeric_only=True)

weak_positive = spearman["Price"].between(0.1, 0.38, inclusive="left")
weak_negative = spearman["Price"].between(-0.38, -0.1, inclusive="right")

mod_positive = spearman["Price"].between(0.38, 0.68, inclusive="left")
mod_negative = spearman["Price"].between(-0.68, -0.38, inclusive="right")

str_positive = spearman["Price"].between(0.68, 0.89, inclusive="left")
str_negative = spearman["Price"].between(-0.89, -0.68, inclusive="right")

v_str_positive = spearman["Price"] >= 0.89
v_str_negative = spearman["Price"] <= -0.89

In [13]:
spearman_weak = spearman[weak_positive | weak_negative]["Price"]
spearman_mod = spearman[mod_positive | mod_negative]["Price"]
spearman_str = spearman[str_positive | str_negative]["Price"]
spearman_v_str = spearman[v_str_positive | v_str_negative]["Price"]

In [14]:
spearman_str

Reg_Year    0.735516
Name: Price, dtype: float64

In [15]:
spearman_mod.sort_values(ascending=False)

Engine_Power    0.580741
Length          0.482357
Top_Speed       0.458331
Width           0.454906
Wheelbase       0.441040
Engine_Size     0.439399
Mileage        -0.665604
Name: Price, dtype: float64

#### Correlation significance test

In [16]:
stats.spearmanr(df["Price"], df["Reg_Year"], nan_policy="omit")

SignificanceResult(statistic=0.7355158764918075, pvalue=0.0)

In [17]:
stats.spearmanr(df["Price"], df["Mileage"], nan_policy="omit")

SignificanceResult(statistic=-0.6656039321682224, pvalue=0.0)

In [18]:
stats.spearmanr(df["Price"], df["Engine_Power"], nan_policy="omit")

SignificanceResult(statistic=0.5807410645830855, pvalue=0.0)

In [19]:
stats.spearmanr(df["Price"], df["Length"], nan_policy="omit")

SignificanceResult(statistic=0.482357341044973, pvalue=0.0)

In [20]:
stats.spearmanr(df["Price"], df["Top_Speed"], nan_policy="omit")

SignificanceResult(statistic=0.4583309002586669, pvalue=0.0)

In [21]:
stats.spearmanr(df["Price"], df["Width"], nan_policy="omit")

SignificanceResult(statistic=0.4549064360385548, pvalue=0.0)

In [22]:
stats.spearmanr(df["Price"], df["Wheelbase"], nan_policy="omit")

SignificanceResult(statistic=0.44104003589368684, pvalue=0.0)

In [23]:
stats.spearmanr(df["Price"], df["Engine_Size"], nan_policy="omit")

SignificanceResult(statistic=0.4393991133398473, pvalue=0.0)

## Categorical features' influence on Price

### Kruskal-Wallis H-test due to not-normally distributed Price variable

In [24]:
def write_names(category):
    values = df[category].unique()
    names = {}
    numbers = {}
    cat_nr = 1
    for name in values:
        if name != np.nan:
            numbers[name] = cat_nr
            names[cat_nr] = name
            cat_nr += 1      
    return numbers, names


def swap(category, col_dict):
    return df[category].apply(lambda x : col_dict[x])

Gearbox_Type

In [59]:
to_num, to_cat = write_names("Gearbox_Type")

In [60]:
df["Gearbox_Type"] = swap("Gearbox_Type", to_num)

In [61]:
stats.kruskal(df["Price"], df["Gearbox_Type"], nan_policy="omit")

KruskalResult(statistic=417463.18889256084, pvalue=0.0)

In [63]:
to_num

{'Automatic': 1, 'Semi-Automatic': 2, 'Manual': 3, nan: 4}

In [64]:
a = df[df["Gearbox_Type"] == 1]
sa = df[df["Gearbox_Type"] == 2]
m = df[df["Gearbox_Type"] == 3]

In [65]:
stats.kruskal(a["Price"], sa["Price"], m["Price"], nan_policy="omit")

KruskalResult(statistic=61427.93397669031, pvalue=0.0)

In [67]:
stats.kruskal(a["Price"], m["Price"], nan_policy="omit")

KruskalResult(statistic=61232.713394955135, pvalue=0.0)

In [66]:
stats.kruskal(a["Price"], sa["Price"], nan_policy="omit")

KruskalResult(statistic=124.10416413688691, pvalue=7.993527227514507e-29)

In [28]:
df["Gearbox_Type"] = swap("Gearbox_Type", to_cat)

Fuel Type

In [29]:
to_num, to_cat = write_names("Fuel_Type")

In [30]:
df["Fuel_Type"] = swap("Fuel_Type", to_num)

In [31]:
stats.kruskal(df["Price"], df["Fuel_Type"], nan_policy="omit")

KruskalResult(statistic=413107.40276462416, pvalue=0.0)

In [32]:
df["Fuel_Type"] = swap("Fuel_Type", to_cat)

Maker

In [33]:
to_num, to_cat = write_names("Maker")

In [34]:
df["Maker"] = swap("Maker", to_num)

In [35]:
stats.kruskal(df["Price"], df["Maker"], nan_policy="omit")

KruskalResult(statistic=401676.6562261765, pvalue=0.0)

In [36]:
df["Maker"] = swap("Maker", to_cat)

Genmodel

In [37]:
to_num, to_cat = write_names("Genmodel")

In [38]:
df["Genmodel"] = swap("Genmodel", to_num)

In [39]:
stats.kruskal(df["Price"], df["Genmodel"], nan_policy="omit")

KruskalResult(statistic=391945.5574246298, pvalue=0.0)

In [40]:
df["Genmodel"] = swap("Genmodel", to_cat)

Adv_Year

In [41]:
to_num, to_cat = write_names("Adv_Year")

In [42]:
df["Adv_Year"] = swap("Adv_Year", to_num)

In [43]:
stats.kruskal(df["Price"], df["Adv_Year"], nan_policy="omit")

KruskalResult(statistic=441273.48909218324, pvalue=0.0)

In [44]:
df["Adv_Year"] = swap("Adv_Year", to_cat)

Adv_Month

In [45]:
to_num, to_cat = write_names("Adv_Month")

In [46]:
df["Adv_Month"] = swap("Adv_Month", to_num)

In [47]:
stats.kruskal(df["Price"], df["Adv_Month"], nan_policy="omit")

KruskalResult(statistic=403033.34923676885, pvalue=0.0)

In [48]:
df["Adv_Month"] = swap("Adv_Month", to_cat)

Color

In [49]:
to_num, to_cat = write_names("Color")

In [50]:
df["Color"] = swap("Color", to_num)

In [51]:
stats.kruskal(df["Price"], df["Color"], nan_policy="omit")

KruskalResult(statistic=402459.53175692825, pvalue=0.0)

In [52]:
df["Color"] = swap("Color", to_cat)

Body_Type

In [53]:
to_num, to_cat = write_names("Body_Type")

In [54]:
df["Body_Type"] = swap("Body_Type", to_num)

In [55]:
stats.kruskal(df["Price"], df["Body_Type"], nan_policy="omit")

KruskalResult(statistic=405266.1024670215, pvalue=0.0)

In [56]:
df["Body_Type"] = swap("Body_Type", to_cat)

In [57]:
df.head()

Unnamed: 0,Maker,Genmodel,Genmodel_ID,Adv_ID,Adv_Year,Adv_Month,Color,Body_Type,Gearbox_Type,Fuel_Type,...,Engine_Power,Annual_Tax,Wheelbase,Height,Width,Length,Average_Mpg,Top_Speed,Seat_Num,Door_Num
0,Bentley,Arnage,10_1,10_1_1,2018,Apr,Silver,Saloon,Automatic,Petrol,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0
1,Bentley,Arnage,10_1,10_1_2,2018,Jun,Grey,Saloon,Automatic,Petrol,...,450.0,315.0,3116.0,1515.0,2125.0,5390.0,13.7,179.0,5.0,4.0
2,Bentley,Arnage,10_1,10_1_3,2017,Nov,Blue,Saloon,Automatic,Petrol,...,400.0,315.0,3116.0,1515.0,2125.0,5390.0,14.7,155.0,5.0,4.0
3,Bentley,Arnage,10_1,10_1_4,2018,Apr,Green,Saloon,Automatic,Petrol,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0
4,Bentley,Arnage,10_1,10_1_5,2017,Nov,Grey,Saloon,Automatic,Petrol,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0


## ANOVA

In [58]:
stats.f_oneway(df["Price"], df["Adv_Month"])

ValueError: could not convert string to float: 'Apr'

### Kendall correlations
Non-parametric rank-based test. No assumptions about distributions, but variables should have the same sample sizes (N).

In [56]:
kendall = df.corr(method="kendall", numeric_only=True)

weak_positive = kendall["Price"].between(0.06, 0.26, inclusive="left")
weak_negative = kendall["Price"].between(-0.26, -0.06, inclusive="right")

mod_positive = kendall["Price"].between(0.26, 0.49, inclusive="left")
mod_negative = kendall["Price"].between(-0.49, -0.26, inclusive="right")

str_positive = kendall["Price"].between(0.49, 0.1, inclusive="left")
str_negative = kendall["Price"].between(-0.71, -0.49, inclusive="right")

v_str_positive = kendall["Price"] >= 0.71
v_str_negative = kendall["Price"] <= -0.71

In [57]:
kendall_weak = kendall[weak_positive | weak_negative]["Price"]
kendall_mod = kendall[mod_positive | mod_negative]["Price"]
kendall_str = kendall[str_positive | str_negative]["Price"]
kendall_v_str = kendall[v_str_positive | v_str_negative]["Price"]

In [58]:
kendall_mod

Mileage        -0.471981
Engine_Size     0.295248
Engine_Power    0.416304
Wheelbase       0.305761
Width           0.316817
Length          0.333532
Top_Speed       0.324813
Name: Price, dtype: float64

### Pearson correlations
Sensitive to outliers, requires normal distribution of the variables

In [59]:
pearson = df.corr(method="pearson", numeric_only=True)

weak_positive = pearson["Price"].between(0.1, 0.4, inclusive="left")
weak_negative = pearson["Price"].between(-0.4, -0.1, inclusive="right")

mod_positive = pearson["Price"].between(0.4, 0.7, inclusive="left")
mod_negative = pearson["Price"].between(-0.7, -0.4, inclusive="right")

str_positive = pearson["Price"].between(0.7, 0.9, inclusive="left")
str_negative = pearson["Price"].between(-0.9, -0.7, inclusive="right")

v_str_positive = pearson["Price"] >= 0.9
v_str_negative = pearson["Price"] <= -0.9

In [60]:
pearson_weak = pearson[weak_positive | weak_negative]["Price"]
pearson_mod = pearson[mod_positive | mod_negative]["Price"]
pearson_str = pearson[str_positive | str_negative]["Price"]
pearson_v_str = pearson[v_str_positive | v_str_negative]["Price"]

In [61]:
pearson_mod

Engine_Power    0.475262
Name: Price, dtype: float64