In [1]:
import numpy as np
import pandas as pd
from dcor import distance_correlation
from autogluon.tabular import TabularDataset, TabularPredictor

In [2]:
train_data = TabularDataset('train.csv')
test_data = TabularDataset('test.csv')

INFER_MISSING = False

In [3]:
# Train a number of models to predict missing values. We will train the models on the data set which does not have any NAN.
if INFER_MISSING:
    train_data_dropped = train_data.dropna()
    for cx in train_data_dropped:
        if train_data[cx].isna().sum() > 0:
            if train_data_dropped[cx].nunique() <= 5:
                print(f'Infering missing values for feature {cx} as classification')
                tmp = TabularPredictor(label=cx).fit(train_data.dropna(subset=cx).drop('rainfall', axis=1), verbosity=1)
    
            elif pd.api.types.is_numeric_dtype(train_data[cx]):
                #print(f'Infering missing values for feature {cx} as regression')
                #tmp = TabularPredictor(label=cx, problem_type='regression').fit(train_data.dropna(subset=cx), verbosity=1)
                print(f'Infering missing values for feature {cx} as setting all to zero')
                train_data[cx].fillna(0.0, inplace=True)
                continue
    
            else:
                print(f'WARNING: Skipping feature {cx}')
                
            train_data.loc[train_data[cx].isna(), cx] = tmp.predict(train_data[train_data[cx].isna()])
            test_data.loc[test_data[cx].isna(), cx] = tmp.predict(test_data[test_data[cx].isna()])

    train_data.to_csv('train_inferred.csv')
    test_data.to_csv('test_inferred.csv')

else:
    
    train_data.to_csv('train_manual.csv')
    test_data.to_csv('test_manual.csv')

In [4]:
train_data.isna().mean()

id               0.0
day              0.0
pressure         0.0
maxtemp          0.0
temparature      0.0
mintemp          0.0
dewpoint         0.0
humidity         0.0
cloud            0.0
sunshine         0.0
winddirection    0.0
windspeed        0.0
rainfall         0.0
dtype: float64

In [5]:
predictor = TabularPredictor(label='rainfall').fit(train_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20250306_093721"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #144-Ubuntu SMP Fri Feb 7 20:47:38 UTC 2025
CPU Count:          8
Memory Avail:       16.99 GB / 23.10 GB (73.5%)
Disk Space Avail:   271.25 GB / 937.33 GB (28.9%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy wi

In [6]:
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.89726,accuracy,0.025004,3.119911,0.000611,0.080359,2,True,14
1,CatBoost,0.892694,accuracy,0.001162,0.874949,0.001162,0.874949,1,True,7
2,NeuralNetFastAI,0.892694,accuracy,0.020406,1.914589,0.020406,1.914589,1,True,10
3,XGBoost,0.888128,accuracy,0.002826,0.250015,0.002826,0.250015,1,True,11
4,LightGBMXT,0.888128,accuracy,0.003071,0.556095,0.003071,0.556095,1,True,3
5,NeuralNetTorch,0.888128,accuracy,0.005156,3.344795,0.005156,3.344795,1,True,12
6,LightGBM,0.885845,accuracy,0.000899,0.30992,0.000899,0.30992,1,True,4
7,RandomForestEntr,0.883562,accuracy,0.043135,0.570728,0.043135,0.570728,1,True,6
8,ExtraTreesEntr,0.883562,accuracy,0.045076,0.472141,0.045076,0.472141,1,True,9
9,LightGBMLarge,0.878995,accuracy,0.005696,0.884417,0.005696,0.884417,1,True,13


In [7]:
predictor.fit_summary(show_plot=True)

*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val eval_metric  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2   0.897260    accuracy       0.025004  3.119911                0.000611           0.080359            2       True         14
1              CatBoost   0.892694    accuracy       0.001162  0.874949                0.001162           0.874949            1       True          7
2       NeuralNetFastAI   0.892694    accuracy       0.020406  1.914589                0.020406           1.914589            1       True         10
3               XGBoost   0.888128    accuracy       0.002826  0.250015                0.002826           0.250015            1       True         11
4            LightGBMXT   0.888128    accuracy       0.003071  0.556095                0.003071           0.556095            1       True          3
5        NeuralNetTorch   0.888128    

{'model_types': {'KNeighborsUnif': 'KNNModel',
  'KNeighborsDist': 'KNNModel',
  'LightGBMXT': 'LGBModel',
  'LightGBM': 'LGBModel',
  'RandomForestGini': 'RFModel',
  'RandomForestEntr': 'RFModel',
  'CatBoost': 'CatBoostModel',
  'ExtraTreesGini': 'XTModel',
  'ExtraTreesEntr': 'XTModel',
  'NeuralNetFastAI': 'NNFastAiTabularModel',
  'XGBoost': 'XGBoostModel',
  'NeuralNetTorch': 'TabularNeuralNetTorchModel',
  'LightGBMLarge': 'LGBModel',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel'},
 'model_performance': {'KNeighborsUnif': 0.817351598173516,
  'KNeighborsDist': 0.821917808219178,
  'LightGBMXT': 0.8881278538812786,
  'LightGBM': 0.8858447488584474,
  'RandomForestGini': 0.8789954337899544,
  'RandomForestEntr': 0.8835616438356164,
  'CatBoost': 0.8926940639269406,
  'ExtraTreesGini': 0.8698630136986302,
  'ExtraTreesEntr': 0.8835616438356164,
  'NeuralNetFastAI': 0.8926940639269406,
  'XGBoost': 0.8881278538812786,
  'NeuralNetTorch': 0.8881278538812786,
  'LightGBMLarge': 0.

In [8]:
predictor.feature_importance(train_data)

Computing feature importance via permutation shuffling for 12 features using 2190 rows with 5 shuffle sets...
	3.65s	= Expected runtime (0.73s per shuffle set)


Wird in einer aktuellen Browsersitzung geöffnet.


	1.53s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
cloud,0.196621,0.005654,8.193904e-08,5,0.208262,0.18498
sunshine,0.028584,0.002615,8.313815e-06,5,0.033969,0.0232
dewpoint,0.020822,0.003516,9.399296e-05,5,0.028062,0.013582
id,0.015799,0.001786,1.92706e-05,5,0.019477,0.012122
humidity,0.011872,0.000969,5.270696e-06,5,0.013867,0.009878
windspeed,0.010137,0.001494,5.496275e-05,5,0.013212,0.007062
maxtemp,0.009772,0.000829,6.171531e-06,5,0.01148,0.008064
pressure,0.009132,0.001826,0.0001821741,5,0.012893,0.005372
winddirection,0.008219,0.000854,1.380377e-05,5,0.009978,0.00646
day,0.008219,0.001407,9.925395e-05,5,0.011117,0.005321


In [9]:
predictor.predict(test_data, model=predictor.model_best).to_frame().join(test_data.id).set_index('id').to_csv('submission_autogluon.csv')