In [1]:
# install fastkaggle if not available
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

[0m

## Getting set up

In [2]:
comp = 'godaddy-microbusiness-density-forecasting'

path = setup_comp(comp, install='fastai "timm>=0.6.2.dev0"')

In [3]:
!ls godaddy-microbusiness-density-forecasting

census_starter.csv  sample_submission.csv  test.csv  train.csv


Now we can import the stuff we'll need from fastai and widen NumPy's print output before checking what's in the data. Note that the cell below does not actually set a random seed, even though a fixed seed is a convenient way to keep a notebook's outputs stable while writing it (I don't recommend relying on a fixed seed in your own analysis, however):

In [4]:
from fastai.imports import *
np.set_printoptions(linewidth=130)

## Looking at the data

In [5]:
df = pd.read_csv(path/'train.csv')

In [6]:
df['first_day_of_month'] =  pd.to_datetime(df['first_day_of_month'])

In [7]:
# One row per county: train.csv repeats each cfips once per month, so without
# drop_duplicates the index would be non-unique and every later join on cfips
# would multiply rows (one copy per month).
df_state_info = df[['cfips','county','state']].drop_duplicates('cfips').set_index('cfips')

In [8]:
df = df.drop(['county','state'],axis=1)

In [9]:
census = pd.read_csv(path/'census_starter.csv').set_index('cfips').join(df_state_info)

## Feature engineering

In [10]:
dep = 'microbusiness_density'

def xs_y(df, drop_cols=None):
    """Split a frame into model inputs and the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may or may not contain the target column named by ``dep``.
    drop_cols : iterable of str, optional
        Column names to exclude from the inputs. When omitted, the
        notebook-level ``cols_to_remove`` is used, which must be defined
        before calling (a ``NameError`` is raised otherwise).

    Returns
    -------
    (xs, y) : tuple
        ``xs`` is ``df`` without the target and the excluded columns, with
        missing values replaced by 0. ``y`` is the target column, or ``None``
        when ``df`` has no such column (e.g. the Kaggle test set).
    """
    if drop_cols is None:
        drop_cols = cols_to_remove
    excluded = set(drop_cols) | {dep}
    # Select with an ordered list rather than a set: pandas >= 2.0 rejects set
    # indexers, and keeping df's own column order makes the feature layout
    # deterministic across the train, validation and test frames.
    feature_cols = [col for col in df.columns if col not in excluded]
    xs = df[feature_cols].fillna(0)
    return xs, (df[dep] if dep in df else None)

# trn_xs,trn_y = xs_y(trn_df)
# val_xs,val_y = xs_y(val_df)

In [11]:
def _select_cols(df):
    """Return only the columns of ``df`` whose names begin with a known prefix.

    Columns are emitted grouped by prefix (in the order the prefixes are
    listed) and, within each prefix, in the order they appear in ``df``.
    """
    prefixes = ('pct_college', 'pct_foreign_born', 'pct_it_workers_',
                'first_day_of_month', dep, 'county', 'state')
    picked = []
    for prefix in prefixes:
        picked.extend(name for name in df.columns if name.startswith(prefix))
    return df[picked]

def feature_engineering(df):
    """Index ``df`` by ``cfips`` and attach the census columns to it.

    The notebook-level ``census`` frame is joined on the ``cfips`` index.
    Narrowing the result with ``_select_cols`` is currently disabled, so
    every joined column is returned.
    """
    by_county = df.set_index('cfips')
    return by_county.join(census)



df_ft = feature_engineering(df)
# proc_data(tst_df)

In [12]:
df.head(2)

Unnamed: 0,row_id,cfips,first_day_of_month,microbusiness_density,active
0,1001_2019-08-01,1001,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,2019-09-01,2.88487,1198


In [None]:
_select_cols(df_ft)

In [None]:
df_ft[['first_day_of_month','microbusiness_density'
]].head(5)

In [None]:
# m = DecisionTreeClassifier(min_samples_leaf=50)
# m.fit(trn_xs, trn_y)
# draw_tree(m, trn_xs, size=25)

## Our first model

In [None]:
df_ft.dtypes

In [None]:
# The three latest distinct months in the training data, used below to carve
# out the validation frame (time-based hold-out).
# NOTE(review): despite its name, `test_set` holds validation dates, not the
# Kaggle test data loaded later from test.csv.
test_set = list(df_ft.first_day_of_month.drop_duplicates().sort_values().tail(3))

In [None]:
from sklearn.model_selection import train_test_split
# A random split was tried and left disabled in favour of the date-based split below:
# trn_df,val_df = train_test_split(df_ft, test_size=0.25)
# Rows whose month is NOT in `test_set` form the training frame; rows whose
# month IS in `test_set` form the validation frame. The date column itself is
# dropped from both frames after it has been used for the split.
trn_df = df_ft[~df_ft.first_day_of_month.isin(test_set)].drop('first_day_of_month',axis=1)
val_df = df_ft[df_ft.first_day_of_month.isin(test_set)].drop('first_day_of_month',axis=1)

In [None]:
trn_xs,trn_y = xs_y(trn_df)
val_xs,val_y = xs_y(val_df)

In [None]:
trn_y.head(5)

In [None]:
trn_xs.head(5)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 100 trees with at least 5 samples per leaf. The fixed random_state makes the
# bootstrap samples (and therefore the metrics, feature importances and the
# submission) identical on every Restart & Run All.
rf = RandomForestRegressor(100, min_samples_leaf=5, random_state=42)
rf.fit(trn_xs, trn_y);

In [None]:
def calculate_smape(actual, predicted) -> float:
    """Symmetric mean absolute percentage error, in percent, rounded to 2 places.

    Each term is ``|predicted - actual| / ((|predicted| + |actual|) / 2)``.
    When both values are exactly zero the denominator is zero; such a term is
    counted as 0 error (a perfect prediction) rather than producing NaN and
    turning the whole score into NaN.

    Parameters
    ----------
    actual, predicted : array-like of numbers
        Ground-truth and predicted values; shapes must be broadcast-compatible.

    Returns
    -------
    float
        SMAPE in the range 0..200.
    """
    # Convert both inputs to float arrays if they are not already.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    abs_err = np.abs(predicted - actual)
    scale = (np.abs(predicted) + np.abs(actual)) / 2
    # Only divide where the denominator is non-zero; the pre-filled zeros are
    # kept for positions where both actual and predicted are 0.
    ratio = np.divide(abs_err, scale, out=np.zeros_like(abs_err), where=scale != 0)
    return float(round(np.mean(ratio) * 100, 2))

In [None]:
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error
print(mean_absolute_error(val_y, rf.predict(val_xs)))
print(calculate_smape(val_y, rf.predict(val_xs)))

In [None]:
pd.DataFrame(dict(cols=trn_xs.columns, imp=rf.feature_importances_)).sort_values('imp')

In [None]:
pd.DataFrame(dict(cols=trn_xs.columns, imp=rf.feature_importances_)).plot('cols', 'imp', 'barh');

## Submitting to Kaggle

In [None]:
test_df = pd.read_csv(path/'test.csv')

In [None]:
test_df

In [None]:
test_df_ft = feature_engineering(test_df).drop('first_day_of_month',axis=1)

In [None]:
test_xs = xs_y(test_df_ft)[0]

In [None]:
test_xs

In [None]:
test_df['microbusiness_density'] = rf.predict(test_xs)

In [None]:
result = test_df[['row_id','microbusiness_density']]

In [None]:
result.to_csv('subm.csv', index=False)
!head subm.csv

Let's submit this to Kaggle. We can do it from the notebook if we're running on Kaggle; otherwise we can use the API:

In [None]:
if not iskaggle:
    from kaggle import api
    api.competition_submit_cli('subm.csv', 'updated model', comp)

Success! We successfully created a submission.

## Conclusion

## Addendum

`fastkaggle` also provides a function that pushes a notebook to Kaggle Notebooks. I wrote this notebook on my own machine, and pushed it to Kaggle from there -- here's the command I used:

In [None]:
if not iskaggle:
    push_notebook('madhaowagh', 'rf model updated',
                  title='starter notebook ',
                  file='rf_model_v1.ipynb',
                  competition=comp, private=False, gpu=False)