In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Easy

In [2]:
df = pd.read_csv('../data/audi.csv')
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0


In [3]:
df = df.drop(columns=['transmission', 'fuelType', 'model']) # работаем только с числовыми признаками
df.head()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize
0,2017,12500,15735,150,55.4,1.4
1,2016,16500,36203,20,64.2,2.0
2,2016,11000,29946,30,55.4,1.4
3,2017,16800,25952,145,67.3,2.0
4,2019,17300,1998,145,49.6,1.0


In [4]:
X = df.drop(columns=['price'])
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
X.head()

Unnamed: 0,year,mileage,tax,mpg,engineSize
0,2017,15735,150,55.4,1.4
1,2016,36203,20,64.2,2.0
2,2016,29946,30,55.4,1.4
3,2017,25952,145,67.3,2.0
4,2019,1998,145,49.6,1.0


In [6]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(r2_score(y_pred, y_test))

0.8732648459361659


In [7]:
model.feature_importances_

array([0.06407231, 0.64896675, 0.06824801, 0.18643184, 0.0322811 ])

Наиболее значительный признак - это `mileage`. `mpg` также немного влияет на результат, остальные незначительны.

# Medium

In [8]:
df_audi = pd.read_csv('../data/audi.csv')
df_bmw = pd.read_csv('../data/bmw.csv')
df_cclass = pd.read_csv('../data/cclass.csv')
df_focus = pd.read_csv('../data/focus.csv')
df_ford = pd.read_csv('../data/ford.csv')
df_hyundi = pd.read_csv('../data/hyundi.csv')
df_merc = pd.read_csv('../data/merc.csv')
df_skoda = pd.read_csv('../data/skoda.csv')
df_toyota = pd.read_csv('../data/toyota.csv')
df_vauxhall = pd.read_csv('../data/vauxhall.csv')
df_vw = pd.read_csv('../data/vw.csv')

In [9]:
df = pd.concat([df_audi, df_bmw, df_cclass, df_focus, df_ford, df_hyundi, df_merc, 
                df_skoda, df_toyota, df_vauxhall, df_vw], ignore_index=True)

In [10]:
df.shape

(108540, 10)

In [11]:
print(set(df['transmission']))
print(set(df['fuelType']))
print(set(df['model']))

{'Other', 'Semi-Auto', 'Automatic', 'Manual'}
{'Diesel', 'Hybrid', 'Petrol', 'Other', 'Electric'}
{' Accent', ' GT86', ' Zafira Tourer', ' Fox', ' X6', ' Scirocco', ' Tucson', ' Z4', ' Mokka X', ' Getz', ' Zafira', ' Polo', ' Astra', ' Q8', ' A3', ' Insignia', ' C-MAX', ' Mondeo', ' KA', ' 3 Series', ' SLK', ' Prius', ' S5', ' M5', ' CLA Class', ' Z3', ' I30', ' Yaris', ' TT', ' IQ', ' A8', ' IX20', ' Beetle', ' Mustang', ' Kuga', ' V Class', ' RS7', ' I10', ' Viva', ' Meriva', ' M3', ' M2', ' EcoSport', ' Ka+', ' CC', ' 6 Series', ' Cascada', ' GLA Class', ' CL Class', ' 4 Series', ' Edge', '200', ' Superb', ' Kodiaq', ' T-Cross', ' Tiguan', ' IX35', ' X4', ' Veloster', ' Focus', ' Caddy', ' A6', ' i3', ' Grand C-MAX', ' Tigra', ' A2', ' Fiesta', ' B Class', ' Corolla', ' Urban Cruiser', ' Combo Life', ' RS3', ' Vivaro', ' SQ7', ' A4', ' Crossland X', ' CLC Class', ' Ioniq', ' Sharan', ' Roomster', ' GLE Class', ' Tiguan Allspace', ' Mokka', ' Golf', ' Q7', ' A Class', '230', ' Yeti',

`transmission`, `fuelType` и `model` заменим на последовательные числа от 0 до n-1, где n - количество различных значений признака.

In [12]:
features_to_change = {'transmission' : list(set(df['transmission'])),
            'fuelType' : list(set(df['fuelType'])),
            'model' : list(set(df['model']))}

def change_feature(df, feature, feature_list):
    for i in df.index:
        df.loc[i, feature] = feature_list.index(df.loc[i, feature])
    return df

In [13]:
for feature in features_to_change.keys():
    df = change_feature(df, feature, features_to_change[feature])
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,tax(£)
0,117,2017,12500,3,15735,2,150.0,55.4,1.4,
1,61,2016,16500,2,36203,0,20.0,64.2,2.0,
2,117,2016,11000,3,29946,2,30.0,55.4,1.4,
3,74,2017,16800,2,25952,0,145.0,67.3,2.0,
4,14,2019,17300,3,1998,2,145.0,49.6,1.0,


In [14]:
params_decision_tree = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [i for i in range(3, 21)], 
    'max_features': ['sqrt', 'log2', None]
}

params_random_forest = {
    'n_estimators': [5, 25, 50, 100, 500],
    'criterion': ['gini', 'entropy'],
    'max_depth': [i for i in range(3, 21)] #log2(10^5) is close to 16, but let's consider numbers up to 20 too, since each class has very few members
}

X = df.drop(columns=['price'])
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
for i in params_decision_tree['max_depth']:
    model = DecisionTreeClassifier(max_depth=i)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(i, r2_score(y_pred, y_test))

3 0.06310688828449962
4 0.11527947430197583
5 0.5232373697124747
6 0.5635249725141452
7 0.581122403071725
8 -1.7606722344538421
9 0.10390476902314294
10 0.6284331846761142
11 0.2662003170726608
12 0.6572484837437562
13 0.6233573933977508
14 0.6760347145633858
15 0.6479482066590762
16 0.6888234355083926
17 0.6807593397048644
18 0.6606080795637566
19 0.7378370390618892
20 0.7293701388598062


In [16]:
model = DecisionTreeClassifier(criterion='gini')
model.fit(X_train, y_train)
y_pred1 = model.predict(X_test)

model = DecisionTreeClassifier(criterion='entropy')
model.fit(X_train, y_train)
y_pred2 = model.predict(X_test)
ttest_ind(y_pred1, y_pred2, alternative='less')

TtestResult(statistic=4.544463771542613, pvalue=0.9999972388213494, df=43414.0)

In [17]:
model = DecisionTreeClassifier(max_features='log2')
model.fit(X_train, y_train)
y_pred1 = model.predict(X_test)

model = DecisionTreeClassifier(max_features='sqrt')
model.fit(X_train, y_train)
y_pred2 = model.predict(X_test)
print(ttest_ind(y_pred1, y_pred2, alternative='less'))

model = DecisionTreeClassifier(max_features=None)
model.fit(X_train, y_train)
y_pred3 = model.predict(X_test)
print(ttest_ind(y_pred2, y_pred3, alternative='less'))

TtestResult(statistic=-10.281269670571875, pvalue=4.569091281592875e-25, df=43414.0)
TtestResult(statistic=14.458903451706686, pvalue=1.0, df=43414.0)


Сравним модель с лучшими гиперпараметрами с дефолтной

In [22]:
model_tree_custom = DecisionTreeClassifier(max_features=None, criterion='entropy', max_depth=19) # на этих гиперпараметрах результаты наилучшие
model_tree_custom.fit(X_train, y_train)
y_pred1 = model_tree_custom.predict(X_test)

model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)
y_pred2 = model_tree.predict(X_test)
ttest_ind(y_pred1, y_pred2, alternative='less')

TtestResult(statistic=-2.236899405344602, pvalue=0.012648985594206697, df=43414.0)

Дефолтная модель выиграла

К сожалению, не получается запустить код того же содержания для random forest'а, потому что каждый раз, когда я запускаю его с дефолтными параметрами, у меня умиарет ядро.

In [18]:
'''for i in params_random_forest['max_depth']:
    model = RandomForestClassifier(max_depth=i)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(i, r2_score(y_pred, y_test))''';

In [19]:
'''for i in params_random_forest['n_estimators']:
    model = RandomForestClassifier(max_depth=i)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(i, r2_score(y_pred, y_test))''';

In [20]:
'''model = RandomForestClassifier(criterion='gini')
model.fit(X_train, y_train)
y_pred1 = model.predict(X_test)

model = RandomForestClassifier(criterion='entropy')
model.fit(X_train, y_train)
y_pred2 = model.predict(X_test)
ttest_ind(y_pred1, y_pred2, alternative='less')''';

In [21]:
'''model_forest_custom = RandomForestClassifier(n_estimators=, criterion='', max_depth=)
model_forest_custom.fit(X_train, y_train)
y_pred1 = model_forest_custom.predict(X_test)

model_forest = RandomForestClassifier()
model_forest.fit(X_train, y_train)
y_pred2 = model_forest.predict(X_test)
ttest_ind(y_pred1, y_pred2, alternative='less')''';

# Hard (TODO)