In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("capstone_data_clean.csv")
df["Adv_Year"] = df["Adv_Year"].astype("category")
df["Adv_Month"] = df["Adv_Month"].astype("category")
df["Gearbox_Type"] = df["Gearbox_Type"].astype("category")
df["Fuel_Type"] = df["Fuel_Type"].astype("category")

In [3]:
df.dtypes

Maker             object
Genmodel          object
Genmodel_ID       object
Adv_ID            object
Adv_Year        category
Adv_Month       category
Color             object
Body_Type         object
Gearbox_Type    category
Fuel_Type       category
Reg_Year         float64
Mileage          float64
Engine_Size      float64
Price            float64
Engine_Power     float64
Annual_Tax       float64
Wheelbase        float64
Height           float64
Width            float64
Length           float64
Average_Mpg      float64
Top_Speed        float64
Seat_Num         float64
Door_Num         float64
dtype: object

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268250 entries, 0 to 268249
Data columns (total 24 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   Maker         268250 non-null  object  
 1   Genmodel      268250 non-null  object  
 2   Genmodel_ID   268250 non-null  object  
 3   Adv_ID        268250 non-null  object  
 4   Adv_Year      268250 non-null  category
 5   Adv_Month     268250 non-null  category
 6   Color         246375 non-null  object  
 7   Body_Type     267296 non-null  object  
 8   Gearbox_Type  268083 non-null  category
 9   Fuel_Type     267841 non-null  category
 10  Reg_Year      268243 non-null  float64 
 11  Mileage       267195 non-null  float64 
 12  Engine_Size   266186 non-null  float64 
 13  Price         267105 non-null  float64 
 14  Engine_Power  236441 non-null  float64 
 15  Annual_Tax    220715 non-null  float64 
 16  Wheelbase     232830 non-null  float64 
 17  Height        239506 non-null

In [5]:
df.head()

Unnamed: 0,Maker,Genmodel,Genmodel_ID,Adv_ID,Adv_Year,Adv_Month,Color,Body_Type,Gearbox_Type,Fuel_Type,...,Engine_Power,Annual_Tax,Wheelbase,Height,Width,Length,Average_Mpg,Top_Speed,Seat_Num,Door_Num
0,Bentley,Arnage,10_1,10_1_1,2018,Apr,Silver,Saloon,Automatic,Petrol,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0
1,Bentley,Arnage,10_1,10_1_2,2018,Jun,Grey,Saloon,Automatic,Petrol,...,450.0,315.0,3116.0,1515.0,2125.0,5390.0,13.7,179.0,5.0,4.0
2,Bentley,Arnage,10_1,10_1_3,2017,Nov,Blue,Saloon,Automatic,Petrol,...,400.0,315.0,3116.0,1515.0,2125.0,5390.0,14.7,155.0,5.0,4.0
3,Bentley,Arnage,10_1,10_1_4,2018,Apr,Green,Saloon,Automatic,Petrol,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0
4,Bentley,Arnage,10_1,10_1_5,2017,Nov,Grey,Saloon,Automatic,Petrol,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0


# Characteristic of the Price variable

In [6]:
df["Price"].describe()

count    2.671050e+05
mean     1.471860e+04
std      2.591019e+04
min      1.000000e+02
25%      4.990000e+03
50%      9.299000e+03
75%      1.715000e+04
max      2.599990e+06
Name: Price, dtype: float64

In [7]:
df["Price"].mode()

0    3995.0
Name: Price, dtype: float64

In [8]:
df["Price"].skew()

30.218340246645827

### Price normal distribution test:
> H0: the variable is normally distributed
> H1: the variable is not normally distributed

In [9]:
stats.normaltest(df["Price"], nan_policy="omit")

NormaltestResult(statistic=698863.9256911167, pvalue=0.0)

### Conclusion
1. There's a reason to reject H0 in favor of H1 and assume that the distribution of prices is different than normal.
2. Because of that I have to use non-parametric tests for calculations.

## Numeric features' influence on Price

### Spearman correlations
I choose a Spearman R correlation, because:
1. It's a non-parametric, rank-based test. 
2. there's no assumptions about the distributions or data, except that variables should be at least ordinal. 

In [10]:
spearman = df.corr(method="spearman", numeric_only=True)

weak_positive = spearman["Price"].between(0.1, 0.38, inclusive="left")
weak_negative = spearman["Price"].between(-0.38, -0.1, inclusive="right")

mod_positive = spearman["Price"].between(0.38, 0.68, inclusive="left")
mod_negative = spearman["Price"].between(-0.68, -0.38, inclusive="right")

str_positive = spearman["Price"].between(0.68, 0.89, inclusive="left")
str_negative = spearman["Price"].between(-0.89, -0.68, inclusive="right")

v_str_positive = spearman["Price"] >= 0.89
v_str_negative = spearman["Price"] <= -0.89

In [11]:
spearman_weak = spearman[weak_positive | weak_negative]["Price"]
spearman_mod = spearman[mod_positive | mod_negative]["Price"]
spearman_str = spearman[str_positive | str_negative]["Price"]
spearman_v_str = spearman[v_str_positive | v_str_negative]["Price"]

In [54]:
def spearman_asess(data_frame, correlate_with):
    table = pd.DataFrame()
    names = []
    strength = []
    correlation = []
    pvalue = []
    
# Calculating Spearman R for all numeric variables
    for column in data_frame.columns:
        if data_frame[column].dtype.kind in 'if':
            if column != correlate_with:
                test = stats.spearmanr(data_frame[correlate_with], data_frame[column], nan_policy="omit")
                
# Preparing series for a summary table       
                names.append(column)
                correlation.append(test[0])
                pvalue.append(test[1])
                if test[0] >= 0.89 or test[0] <= -0.89:
                    strength.append("Very Strong")
                elif test[0] >= 0.68 or test[0] <= -0.68:
                    strength.append("Strong")
                elif test[0] >= 0.38 or test[0] <= -0.38:
                    strength.append("Moderate")
                elif test[0] >= 0.1 or test[0] <= -0.1:
                    strength.append("Weak")
                else:
                    strength.append("None")

# Creating a summary table                    
    table["Feature"] = names
    table["Spearman_R"] = correlation
    table["P_Value"] = pvalue
    table["Strength"] = strength
        
    return table.reindex(table["Spearman_R"].abs().sort_values(ascending=False).index).reset_index(drop=True)

In [55]:
correlation_strength = spearman_asess(df, "Price")

In [56]:
correlation_strength

Unnamed: 0,Feature,Spearman_R,P_Value,Strength
0,Reg_Year,0.735519,0.0,Strong
1,Mileage,-0.665604,0.0,Moderate
2,Engine_Power,0.580736,0.0,Moderate
3,Length,0.482357,0.0,Moderate
4,Top_Speed,0.458324,0.0,Moderate
5,Width,0.454901,0.0,Moderate
6,Wheelbase,0.441041,0.0,Moderate
7,Engine_Size,0.439394,0.0,Moderate
8,Height,0.156925,0.0,Weak
9,Seat_Num,0.04139,3.3513150000000003e-99,


In [51]:
correlation_strength = correlation_strength.reindex(correlation_strength["Spearman_R"].abs().sort_values(ascending=False).index).reset_index(drop=True)

if correlation_strength["P_Value"] < 0.05:
    correlation_strength["Significance"] = "True"
else:
    correlation_strength["Significance"] = "False"
    
correlation_strength

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

#### Strong correlations

In [12]:
spearman_str

Reg_Year    0.735519
Name: Price, dtype: float64

#### Moderate correlations

In [13]:
spearman_mod.sort_values(ascending=False)

Engine_Power    0.580736
Length          0.482357
Top_Speed       0.458324
Width           0.454901
Wheelbase       0.441041
Engine_Size     0.439394
Mileage        -0.665604
Name: Price, dtype: float64

#### Correlation significance tests

In [14]:
stats.spearmanr(df["Price"], df["Reg_Year"], nan_policy="omit")

SignificanceResult(statistic=0.7355191646581417, pvalue=0.0)

In [15]:
stats.spearmanr(df["Price"], df["Mileage"], nan_policy="omit")

SignificanceResult(statistic=-0.6656038269117949, pvalue=0.0)

In [16]:
stats.spearmanr(df["Price"], df["Engine_Power"], nan_policy="omit")

SignificanceResult(statistic=0.5807357440750907, pvalue=0.0)

In [17]:
stats.spearmanr(df["Price"], df["Length"], nan_policy="omit")

SignificanceResult(statistic=0.4823571221575504, pvalue=0.0)

In [18]:
stats.spearmanr(df["Price"], df["Top_Speed"], nan_policy="omit")

SignificanceResult(statistic=0.4583236637721382, pvalue=0.0)

In [19]:
stats.spearmanr(df["Price"], df["Width"], nan_policy="omit")

SignificanceResult(statistic=0.4549006097806938, pvalue=0.0)

In [20]:
stats.spearmanr(df["Price"], df["Wheelbase"], nan_policy="omit")

SignificanceResult(statistic=0.4410410522242223, pvalue=0.0)

In [21]:
stats.spearmanr(df["Price"], df["Engine_Size"], nan_policy="omit")

SignificanceResult(statistic=0.43939377944495484, pvalue=0.0)

## Categorical features' influence on Price

### Due to the not-normally distributed price, I decided to use Kruskal-Wallis H-test for categorical features.

Implementation of that test in Python requires the categories to be in a numerical format, so I created a function to convert values.

In [22]:
def write_names(category):
    values = df[category].unique()
    names = {}
    numbers = {}
    cat_nr = 1
    for name in values:
        if name != np.nan:
            numbers[name] = cat_nr
            names[cat_nr] = name
            cat_nr += 1      
    return numbers, names


def swap(category, col_dict):
    return df[category].apply(lambda x : col_dict[x])

#### Gearbox_Type

In [23]:
to_num, to_cat = write_names("Gearbox_Type")

In [24]:
df["Gearbox_Type"] = swap("Gearbox_Type", to_num)

In [25]:
stats.kruskal(df["Price"], df["Gearbox_Type"], nan_policy="omit")

KruskalResult(statistic=417460.1154289276, pvalue=0.0)

In [26]:
df["Gearbox_Type"] = swap("Gearbox_Type", to_cat)

#### Fuel Type

In [27]:
to_num, to_cat = write_names("Fuel_Type")

In [28]:
df["Fuel_Type"] = swap("Fuel_Type", to_num)

In [29]:
stats.kruskal(df["Price"], df["Fuel_Type"], nan_policy="omit")

KruskalResult(statistic=413104.31808755535, pvalue=0.0)

In [30]:
df["Fuel_Type"] = swap("Fuel_Type", to_cat)

#### Maker

In [31]:
to_num, to_cat = write_names("Maker")

In [32]:
df["Maker"] = swap("Maker", to_num)

In [33]:
stats.kruskal(df["Price"], df["Maker"], nan_policy="omit")

KruskalResult(statistic=401673.6554066382, pvalue=0.0)

In [34]:
df["Maker"] = swap("Maker", to_cat)

#### Genmodel

In [35]:
to_num, to_cat = write_names("Genmodel")

In [36]:
df["Genmodel"] = swap("Genmodel", to_num)

In [37]:
stats.kruskal(df["Price"], df["Genmodel"], nan_policy="omit")

KruskalResult(statistic=391942.59451188263, pvalue=0.0)

In [38]:
df["Genmodel"] = swap("Genmodel", to_cat)

#### Adv_Year

In [39]:
to_num, to_cat = write_names("Adv_Year")

In [40]:
df["Adv_Year"] = swap("Adv_Year", to_num)

In [41]:
stats.kruskal(df["Price"], df["Adv_Year"], nan_policy="omit")

KruskalResult(statistic=441270.6241519115, pvalue=0.0)

In [42]:
df["Adv_Year"] = swap("Adv_Year", to_cat)

#### Adv_Month

In [43]:
to_num, to_cat = write_names("Adv_Month")

In [44]:
df["Adv_Month"] = swap("Adv_Month", to_num)

In [45]:
stats.kruskal(df["Price"], df["Adv_Month"], nan_policy="omit")

KruskalResult(statistic=403030.33119141846, pvalue=0.0)

In [46]:
df["Adv_Month"] = swap("Adv_Month", to_cat)

#### Color

In [47]:
to_num, to_cat = write_names("Color")

In [48]:
df["Color"] = swap("Color", to_num)

In [49]:
stats.kruskal(df["Price"], df["Color"], nan_policy="omit")

KruskalResult(statistic=402456.5219254176, pvalue=0.0)

In [50]:
df["Color"] = swap("Color", to_cat)

#### Body_Type

In [51]:
to_num, to_cat = write_names("Body_Type")

In [52]:
df["Body_Type"] = swap("Body_Type", to_num)

In [53]:
stats.kruskal(df["Price"], df["Body_Type"], nan_policy="omit")

KruskalResult(statistic=405263.07081098575, pvalue=0.0)

In [54]:
df["Body_Type"] = swap("Body_Type", to_cat)

## Conclusions

According to the tests, all the categorical features affect prices, and the relationships are statistically significant.
As for the numerical features, there's 1 strong, statistically significant correlation, and 7 of moderate strength.

I need to select only 5 of them for this project, so I decide to go with 2 categorical and 3 numerical features:
1. Maker
2. Fuel Type
3. Registration Year
4. Mileage
5. Engine Power

In case of categorical variables, the decision is rather arbitrary, but the numeric variables I select based on the correlation strength.