In [1]:
import pandas as pd
import re
import timeit as ti
import datetime as dt
import numpy as np
np.random.seed(4999)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle

In [3]:
from catboost import CatBoostRegressor
from category_encoders.target_encoder import TargetEncoder

####  Load and purge data

In [4]:
class DataSet:
    def __init__(self,path = None, dsets=None,teacher=None):
        if path is not None:
            self.fe = pd.read_csv(path)
            if 'sellingprice' in self.fe.columns:
                self.ta = self.fe['sellingprice']
                self.fe = self.fe.drop(['sellingprice'],axis=1) 
            self.vin = self.fe['vin']
            self.fe = self.fe.drop(['vin'],axis=1) 
            self.fe = self.fe.rename(columns={'year':'prod_year'})
        elif dsets is not None:
            self.fe = dsets[0].copy()
            self.ta = dsets[1].copy()
        else:
            raise 'Wrong DataSet constructor , all values are None'
        self.teacher = teacher    


        self.price_cols = ['sellingprice','pr','pe']
        self.make_cols = ['make','model','trim']
        self.wear_cols = ['prod_year','condition','odometer']

In [5]:
%%time
def fillna(self):
    self.fe[['make','model','trim']] = self.fe[['make','model','trim']].fillna('UNKNOWN')
    self.fe[['color','interior']] = self.fe[['color','interior']].fillna('—')
    self.fe['transmission'] = self.fe['transmission'].fillna('automatic')
    cond_mean = self.fe.groupby('prod_year').condition.mean()
    idx_na= self.fe.condition.isna()
    self.fe.loc[idx_na,'condition'] = self.fe[idx_na].prod_year.apply(lambda s: cond_mean[s])
    run_mean = self.fe.groupby('prod_year').odometer.mean()
    idx_na=self.fe.odometer.isna() 
    self.fe.loc[idx_na,'odometer'] = self.fe[idx_na].prod_year.apply(lambda s: run_mean[s])
    return(self)
    
DataSet.fillna = fillna

CPU times: total: 0 ns
Wall time: 0 ns


In [6]:
def normalize(self,rounding):
    start = ti.default_timer()
    cols_to_upper= ['make','model','trim','body']
    cols_dt = ['year','month','day','hour','minute','second','weekday','yearday','dl']
    cols_abbr = ['trim','seller']
    cols_trash = ['saledate','trim','abbr_seller','seller','second','yearday','dl']
    def transform_row(r,cols_to_upper,cols_dt,cols_abbr):
        def abbr(s,prefix_size=5 ):
            s = s.strip().upper()
            if len(s) <= prefix_size:
                return(s)
            s = s[:prefix_size].replace(' ','-')+s[prefix_size:]
            i = s.find(' ')
            if i > 0:
                s = s[:i]        
            return(s)

        t =  dt.datetime.strptime(r['saledate'].split('GMT')[0]  ,"%a %b %d %Y %H:%M:%S ").timetuple()
        dc = dict(zip(cols_dt ,t))
        for col in cols_abbr:
            dc['abbr_'+col] = abbr(r[col])

        for col in cols_to_upper:
            dc[col] = str(r[col]).upper()
        dc['odometer']= round(r['odometer']/rounding['odometer']) 
        dc['condition']= round(r['condition'],rounding['condition']) 
        return dc

    transformed = self.fe.apply(transform_row, axis=1,result_type='expand',
                                         cols_to_upper=cols_to_upper,cols_dt=cols_dt,cols_abbr=cols_abbr)
    self.fe[transformed.columns] = transformed   
    
    self.fe = self.fe.drop(cols_trash ,axis=1)
#    print(f"normalize: {ti.default_timer()-start} s " )
    return self

DataSet.normalize = normalize

In [7]:
def split_(self,rate):
    fe0,fe1,ta0,ta1 = train_test_split(self.fe,self.ta,test_size=rate )
    return DataSet(dsets = [fe0,ta0]),DataSet( dsets = [fe1,ta1])
DataSet.split = split_

In [8]:
def skew(self,threshold,mult):
    ta0 = self.ta[self.ta>threshold]
    ta1 = self.ta[self.ta<threshold]
    fe0 = self.fe[self.ta>threshold]
    fe1 = self.fe[self.ta<threshold]
    if mult >=1:
         ta_new = pd.concat( [ta0]+mult*[ta1] ,axis=0).copy()
         fe_new = pd.concat( [fe0]+mult*[fe1] ,axis=0).copy()
    else:
        idx = ta0.sample(frac=mult, replace=True).index
        ta_new = pd.concat( [ta0[idx]]+[ta1] ,axis=0).copy()
        fe_new = pd.concat( [fe0.loc[idx]]+[fe1] ,axis=0).copy()        
    self.fe,self.ta =  shuffle(fe_new,ta_new)
    return self

DataSet.skew = skew
# fe5,ta5 = swing_price(fe_tr,ta_tr,5000,.1)
# print(fe5.shape,ta5.shape)

In [9]:
def encode(self):
    start= ti.default_timer()
    if self.teacher is None:
#        self.enc= OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1)
        self.enc= TargetEncoder(handle_unknown='value')
        self.enc.fit(self.fe,self.ta)
    else:
        self.enc= self.teacher.enc
    self.fe = pd.DataFrame( self.enc.transform(self.fe),columns = self.enc.get_feature_names_out() )

    print(f"encode: {ti.default_timer()-start} s " )
    return self

DataSet.encode = encode

#### Run

In [10]:
%%time
#ds = DataSet(path='datasets/train.csv').fillna().normalize().encode().skew(threshold=3000,mult = 5)
#dst = DataSet(path='datasets/test.csv',teacher=ds).fillna().normalize().encode()
print(f"rounding,\t threshold,\t mult, \t score, \t time \t start")
for rounding in [3000]:
    for threshold in [4000]:
        for mult in [4]:
            start = ti.default_timer()
            norm_dc = {'condition':1,'odometer':rounding}
            ds=DataSet('datasets/train.csv').fillna().normalize(norm_dc).encode().skew(threshold=threshold,mult=mult)
            dst=DataSet(path='datasets/test.csv',teacher=ds).fillna().normalize(norm_dc).encode()
            md = RandomForestRegressor(criterion='poisson',
 #                                  max_depth=200, n_estimators=500,
                                   max_depth=50, n_estimators=200,                                       
                                   min_samples_split=2, min_samples_leaf=2, max_features=16,
                                   warm_start=True)
            md.fit(ds.fe,ds.ta)
            prt = md.predict(dst.fe)
            stop = ti.default_timer()
            print(f"{rounding},\t{threshold},\t{mult},\t{mape(prt,dst.ta)},\t {round(stop-start)},\t {start}")

rounding,	 threshold,	 mult, 	 score, 	 time 	 start
encode: 8.800801000000007 s 
encode: 0.897882899999999 s 


AttributeError: 'DataSet' object has no attribute 'ta'

In [11]:
print(f"{rounding},\t{threshold},\t{mult},\t{mape(prt,dst.ta)},\t {round(stop-start)},\t {start}")

AttributeError: 'DataSet' object has no attribute 'ta'

In [None]:
ds.fe.head()

In [None]:
dst.fe.head()

In [None]:
for c in ds.fe.columns:
    if sum(ds.fe[c].isna())>0:
        print(c, sum(ds.fe[c].isna()))
        

In [None]:
prt = pd.Series( md.predict(dst.fe),index=dst.fe.index )

In [None]:
mape(dst.ta,prt)

In [12]:
prv = pd.DataFrame({'vin': dst.vin,'sellingprice':prt} )
prv.to_csv('datasets/result.csv',index=False)

In [None]:
raise

In [None]:
pd.DataFrame({'real':dst.ta,'predict':prt}).head()

In [None]:
for c in dst.fe.columns:
    print( c, sum( dst.fe[c]==-1 ) )
dst.fe.shape    

In [None]:
raise

In [None]:
for c in ds.columns:
    print(f"{c} : ----- { ds[c].value_counts().size } ============> NaNs: {sum(ds[c].isna()) }----")
    print(dict( ds[c].value_counts().head(10)) )
    print(dict( ds[c].value_counts().tail()) )
    print()

In [None]:
ds.head()

#### Split dataset

In [None]:
ds_vin = ds.vin
ds_ta = ds.sellingprice
trash_cols = ['saledate','trim','abbr_seller','seller','second','yearday','dl']+['year','hour','minute','weekday']
ds_fe = ds.drop(['vin','sellingprice']+trash_cols,axis=1)

In [None]:
enc= OrdinalEncoder()
ds_fe = pd.DataFrame( enc.fit_transform(ds_fe),columns = enc.get_feature_names_out() )

In [None]:
fe_tr,fe_va,ta_tr,ta_va = train_test_split(ds_fe,ds_ta,test_size = .25)
print( fe_tr.shape,fe_va.shape,ta_tr.shape,ta_va.shape)
fe_tr.head()

#### Validation

In [None]:
%%time
for n_est in [200]:
    for mss in [2]:
        for msl in [2]:
            for mf in [24]:
                for cri in ['poisson']:  
#            min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=1.0
                    start = ti.default_timer()  
                    md = RandomForestRegressor(criterion=cri,max_depth=100, n_estimators=n_est,
       #                  random_state = 5555,                      
                         min_samples_split=mss, min_samples_leaf=msl, max_features=mf,
                         warm_start=True)
                    md.fit(fe_tr,ta_tr)
                    prt = pd.Series( md.predict(fe_va),index=fe_va.index )
                    print(f"est:{n_est} mss:{mss} msl:{msl} mf:{mf} cri:{cri}=> {mape(ta_va,prt)} | {ti.default_timer()-start}")

In [None]:
pr = prt

In [None]:
mape(ta_va,pr)

In [None]:
RandomForestRegressor().get_params()

In [None]:
# %%time
# md = DecisionTreeRegressor(max_depth=333)
# md.fit(fe_tr,ta_tr)
# prt = pd.Series( md.predict(fe_va),index=fe_va.index )
# mape(ta_va,prt)               

#### Final fit

In [None]:
pe = abs(pr-ta_va)/(pr+ta_va)
pe.hist(figsize=(20,6),bins=100);

In [None]:
ds['pr'] = pr
ds['pe'] = pe

In [None]:
print( ds.sellingprice[ds.pe < 10_000_000 ].shape,pr.shape,ta_va.shape,(pr-ta_va).shape )
pr.head().index,ta_va.head().index,fe_va.head().index,(pr - ta_va).head().index

In [None]:
for th in [1000,2000,3500,5000]:
    print(f" {th}:\t { mape(pr[pr>th],ta_va[pr>th])} \t {pr[pr<th].shape} ")

In [None]:
print(mape(ta_va[ds.pe<.25],pr[ds.pe<.25]),mape(ta_va[ds.pe<.5],pr[ds.pe<.5]),mape(ta_va[ds.pe<.75],pr[ds.pe<.75]))
#diffs_50_plus = 

In [None]:
%%time

def swing_price(fe,ta,th,k):
    ta0 = ta[ta>th]
    ta1 = ta[ta<th]
    fe0 = fe[ta>th]
    fe1 = fe[ta<th]
    if k >=1:
        ta_new = pd.concat( [ta0]+k*[ta1] ,axis=0).copy()
        fe_new = pd.concat( [fe0]+k*[fe1] ,axis=0).copy()
    else:
        idx = ta0.sample(frac=k).index
        ta_new = pd.concat( [ta0[idx]]+[ta1] ,axis=0).copy()
        fe_new = pd.concat( [fe0.loc[idx]]+[fe1] ,axis=0).copy()        
    return shuffle(fe_new,ta_new)
fe5,ta5 = swing_price(fe_tr,ta_tr,5000,.1)
print(fe5.shape,ta5.shape)

In [None]:
%%time
for th in (3000,6000):
    for mlt in (5,10):
        start = ti.default_timer()
        fe,ta = swing_price(fe_tr,ta_tr,th,mlt)
        md = RandomForestRegressor(criterion='poisson',max_depth=100, n_estimators=200,
                          random_state = 5555,                      
             min_samples_split=2, min_samples_leaf=2, max_features=24,
             warm_start=True)
        md.fit(fe,ta)
        prt = pd.Series( md.predict(fe_va),index=fe_va.index )
        print(f" {th} {mlt} --> {mape(ta_va,prt)} | {ti.default_timer()-start} {ti.default_timer()} ")

In [None]:
print( ds.sellingprice.mean(),pr.mean(),rel_diffs.sellingprice.mean(),rel_diffs.pr.mean() )
rel_diffs[price_cols+make_cols+wear_cols+['seller']+['abbr_power']+['q25']+['state']]

In [None]:
pd.DataFrame( {
'd50': diffs_50.q25.value_counts().head(30),
'd66': diffs_66.q25.value_counts().head(30),
'ds': ds.q75.value_counts().head(30)
}    ).sort_values(by = 'd50',ascending=False)    

In [None]:
raise

In [None]:
#print( rel_diffs.seller.value_counts() )
#ds.seller.value_counts()

In [None]:
cnd = ds.pivot_table(
    index='seller',values=['sellingprice','condition'],aggfunc=['mean','count']
)
cnd[ cnd[('count','condition')]>10000].sort_values(by=('mean','condition') )

In [None]:
cheapsellers = list( ds[ds.sellingprice<1000].seller.value_counts().head(60).index )
print(ds[ds.seller.isin( cheapsellers )].groupby(ds.seller).sellingprice.quantile(.25) )
ds.groupby(ds.seller).condition.quantile(.25).head(60)

In [None]:
ta_va[ta_va<2000].hist(bins=10,figsize = (20,8),alpha=.5);
pr[pr<2000].hist(bins=10,figsize = (20,8),alpha=.5);

In [None]:
ds[ds.seller=='credit acceptance corp/vrs/southfield'].sellingprice.hist(bins=50,figsize = (20,8),alpha=0.5);
ds[ds.seller.str[:12]=='purple heart'].sellingprice.hist(bins=50,figsize = (20,8),alpha=0.5);

In [None]:
ds[ds['abbr_power']=='PURPLE'].pr.quantile([.1,.25,.5,.75])

In [None]:
ds[ds['abbr_power']=='QUALITY'].pr.quantile([.1,.25,.5,.75,.9])