https://www.kaggle.com/competitions/playground-series-s4e8

In [1]:
from fastkaggle import *
from pathlib import Path
import zipfile
import pandas as pd
import polars as pl
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [2]:
comp_name = "playground-series-s4e8"

In [3]:
setup_comp(comp_name)



Path('playground-series-s4e8')

In [4]:
datapath = Path(comp_name)

In [5]:
df_train = pl.read_csv(f'{datapath}/train.csv')

In [6]:
df_train.head()

id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
i64,str,f64,str,str,str,str,str,str,str,f64,f64,str,str,str,str,str,str,str,str,str,str
0,"""e""",8.8,"""f""","""s""","""u""","""f""","""a""","""c""","""w""",4.51,15.39,,,"""w""",,,"""f""","""f""",,"""d""","""a"""
1,"""p""",4.51,"""x""","""h""","""o""","""f""","""a""","""c""","""n""",4.79,6.48,,"""y""","""o""",,,"""t""","""z""",,"""d""","""w"""
2,"""e""",6.94,"""f""","""s""","""b""","""f""","""x""","""c""","""w""",6.85,9.93,,"""s""","""n""",,,"""f""","""f""",,"""l""","""w"""
3,"""e""",3.88,"""f""","""y""","""g""","""f""","""s""",,"""g""",4.16,6.53,,,"""w""",,,"""f""","""f""",,"""d""","""u"""
4,"""e""",5.85,"""x""","""l""","""w""","""f""","""d""",,"""w""",3.37,8.36,,,"""w""",,,"""f""","""f""",,"""g""","""a"""


In [7]:
df_train.describe()

statistic,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
str,f64,str,f64,str,str,str,str,str,str,str,f64,f64,str,str,str,str,str,str,str,str,str,str
"""count""",3116945.0,"""3116945""",3116941.0,"""3116905""","""2445922""","""3116933""","""3116937""","""2593009""","""1858510""","""3116888""",3116945.0,3116945.0,"""359922""","""1136084""","""3116907""","""159452""","""375998""","""3116921""","""2988065""","""267263""","""3116900""","""3116945"""
"""null_count""",0.0,"""0""",4.0,"""40""","""671023""","""12""","""8""","""523936""","""1258435""","""57""",0.0,0.0,"""2757023""","""1980861""","""38""","""2957493""","""2740947""","""24""","""128880""","""2849682""","""45""","""0"""
"""mean""",1558472.0,,6.309848,,,,,,,,6.348333,11.153785,,,,,,,,,,
"""std""",899784.661737,,4.657931,,,,,,,,2.699755,8.095477,,,,,,,,,,
"""min""",0.0,"""e""",0.03,"""0""","""0""","""1""","""2""","""0""","""0""","""0""",0.0,0.0,"""1""","""0""","""1""","""21""","""2""","""10""","""1""","""10 None""","""1""","""a"""
"""25%""",779236.0,,3.32,,,,,,,,4.67,4.97,,,,,,,,,,
"""50%""",1558472.0,,5.75,,,,,,,,5.88,9.65,,,,,,,,,,
"""75%""",2337708.0,,8.24,,,,,,,,7.41,15.63,,,,,,,,,,
"""max""",3116944.0,"""p""",80.67,"""z""","""z""","""z""","""z""","""z""","""y""","""z""",88.72,102.9,"""z""","""z""","""z""","""y""","""z""","""z""","""z""","""y""","""z""","""w"""


In [8]:
df_train.dtypes

[Int64,
 String,
 Float64,
 String,
 String,
 String,
 String,
 String,
 String,
 String,
 Float64,
 Float64,
 String,
 String,
 String,
 String,
 String,
 String,
 String,
 String,
 String,
 String]

In [9]:
df_train.select(pl.all().approx_n_unique())

id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
3083554,2,3884,75,84,79,27,78,49,64,2736,5828,39,61,60,23,25,24,41,33,53,4


In [10]:
df_train.columns

['id',
 'class',
 'cap-diameter',
 'cap-shape',
 'cap-surface',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-attachment',
 'gill-spacing',
 'gill-color',
 'stem-height',
 'stem-width',
 'stem-root',
 'stem-surface',
 'stem-color',
 'veil-type',
 'veil-color',
 'has-ring',
 'ring-type',
 'spore-print-color',
 'habitat',
 'season']

Take a look at the value counts for the target 

In [11]:
df_train['class'].value_counts()

class,count
str,u32
"""e""",1411549
"""p""",1705396


Split the training set into data and target

In [12]:
x = df_train.drop('class')
y = df_train['class']

For a less ordinal data we could use `One Hot Encoding`, but since the prediction is binary we would can use a basic mapping or sklearn's `LabelEncoder`

In [13]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [14]:
cat_cols = [col for col, dtype in x.schema.items() if dtype == pl.Utf8]
num_cols = [col for col in x.columns if col not in cat_cols]

In [15]:
cat_cols

['cap-shape',
 'cap-surface',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-attachment',
 'gill-spacing',
 'gill-color',
 'stem-root',
 'stem-surface',
 'stem-color',
 'veil-type',
 'veil-color',
 'has-ring',
 'ring-type',
 'spore-print-color',
 'habitat',
 'season']

In [16]:
num_cols

['id', 'cap-diameter', 'stem-height', 'stem-width']

Create categorical transformer, handle NaN vals

In [17]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

Create numeric transformer, handle NaN vals

In [18]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')) 
])

In [19]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

Split to training and test sets

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

In [21]:
y_train[:10]

array([1, 1, 1, 1, 1, 1, 1, 0, 0, 1])

Create a classifier

In [22]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(n_estimators=40, max_samples=200_000, random_state=42, verbose=2)),
    ]
)

Run fit 

In [23]:
x_train_pandas = x_train.to_pandas()

In [24]:
pipeline.fit(x_train_pandas, y_train)

building tree 1 of 40
building tree 2 of 40
building tree 3 of 40
building tree 4 of 40
building tree 5 of 40
building tree 6 of 40
building tree 7 of 40
building tree 8 of 40
building tree 9 of 40
building tree 10 of 40
building tree 11 of 40
building tree 12 of 40
building tree 13 of 40
building tree 14 of 40
building tree 15 of 40
building tree 16 of 40
building tree 17 of 40
building tree 18 of 40
building tree 19 of 40
building tree 20 of 40
building tree 21 of 40
building tree 22 of 40
building tree 23 of 40
building tree 24 of 40
building tree 25 of 40
building tree 26 of 40
building tree 27 of 40
building tree 28 of 40
building tree 29 of 40
building tree 30 of 40
building tree 31 of 40
building tree 32 of 40
building tree 33 of 40
building tree 34 of 40
building tree 35 of 40
building tree 36 of 40
building tree 37 of 40
building tree 38 of 40
building tree 39 of 40
building tree 40 of 40


[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:  4.5min
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:  4.5min


In [25]:
y_pred = pipeline.predict(x_test)

[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    2.3s
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    2.3s


In [26]:
y_pred.shape

(623389,)

In [27]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9918943067651178

In [28]:
X_test = pl.read_csv(f'{datapath}/test.csv')

In [29]:
preds = pipeline.predict(X_test)

[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    7.4s
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    7.4s


In [30]:
label_encoder.inverse_transform(preds)

array(['e', 'p', 'p', ..., 'p', 'e', 'e'], dtype='<U1')

In [31]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [32]:
sample_df.head()

Unnamed: 0,id,class
0,3116945,e
1,3116946,e
2,3116947,e
3,3116948,e
4,3116949,e


In [33]:
decoded_preds = label_encoder.inverse_transform(preds)

In [34]:
sample_df['class'] = decoded_preds

In [35]:
sample_df

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e
...,...,...
2077959,5194904,p
2077960,5194905,p
2077961,5194906,p
2077962,5194907,e


In [36]:
sample_df.to_csv('submission.csv', index=False)


In [37]:
submit = True

In [38]:
if not iskaggle and submit:
    from kaggle import api
    api.competition_submit_cli(file_name='submission.csv', message="baseline model", competition=comp_name)

100%|██████████| 19.8M/19.8M [00:08<00:00, 2.40MB/s]
