I ran this query in Athena:

```
select m.aa_authority_type__code, m.ac_award_crit__code, m.iso_country__value, m.nc_contract_nature__code, m.original_cpv_code, m.pr_proc__code, m.rp_regulation__code, m.td_document_type__code, m.ty_type_bid__code, m.ma_main_activities__code, m.main_ma_main_activities__code, m.contracting_body__address_contracting_body__country__value,m.object_contract__title, m.object_contract__type_contract__ctype, m.object_contract__cpv_main__cpv_code__code, m.object_contract__short_descr, m.object_contract__object_descr__n2016_nuts__code, m.legal_basis__value, m.year, m_1.value_eur,
m_1.total_tenders_received
from merged as m
inner join merged as m_1 on m_1.ref_no = m.no_doc_ojs
where m_1.td_document_type = 'Contract award notice'
and m.td_document_type = 'Contract notice'
and m_1.TOTAL_TENDERS_RECEIVED != 0
and m.value_eur >= 0
limit 100000
```


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# from google.colab import files
# uploaded = files.upload()


In [None]:
# import io
# data = pd.read_csv(io.BytesIO(uploaded['num_tenders_data.csv']))
# data.head()

In [None]:
# Load the tender dataset from its public S3 location and preview the first rows.
# NOTE(review): presumably this CSV is the export of the Athena query at the top
# of the notebook - confirm.
DATA_URL = "https://s3.eu-west-3.amazonaws.com/deep.skoo.ch/cca/num_tenders_data.csv"
data = pd.read_csv(DATA_URL)
data.head()

In [None]:
# [min, max] range that model predictions are clipped to in the evaluation
# cells; the upper bound is also the cut-off for outlier targets below.
clip_values = [0, 1000]

# Keep only rows that
#   * have a value_eur (NaN already fails the range checks; notna() just makes
#     the intent explicit),
#   * received at most clip_values[1] tenders,
#   * have a value strictly between 10,000 and 1e10 EUR.
# NOTE(review): the earlier comments quoted thresholds of 2000 tenders and
# 1,000 EUR, which did not match the code; the code's thresholds are kept.
value_eur = data['value_eur']
keep = (
    value_eur.notna()
    & (data['total_tenders_received'] <= clip_values[1])
    & (value_eur > 10000)
    & (value_eur < 1e10)
)

# .copy() gives an independent frame, so the column assignments in later cells
# do not write into a slice of the loaded data (avoids SettingWithCopyWarning).
data = data[keep].copy()

# Regression target, taken before the column is dropped from the features.
y = data['total_tenders_received'].values

In [None]:
def list_to_item(x):
    """Return the first element of a stringified list such as ``"[a, b]"``.

    Square brackets are removed, the text is split on commas, and the first
    piece is returned with surrounding whitespace stripped.

    Input that cannot be processed as text (for example ``None`` or the float
    NaN used for missing cells) yields an empty string.
    """
    try:
        cleaned = x.replace("[", "").replace("]", "")
    except (AttributeError, TypeError):
        # Not a str: no ``replace`` method (None, float) or one that rejects
        # str arguments (bytes). Anything else is a real error and propagates.
        return ""
    return cleaned.split(",")[0].strip()

In [None]:
# transformations
# Parse the first CPV code once with list_to_item. The helper maps missing
# (non-string) values to "" instead of raising, which matches how the other
# list-valued columns below are handled.
first_cpv_code = data['original_cpv_code'].map(list_to_item)
data['main_cpv_code'] = first_cpv_code.str[:2]      # 2-character prefix
data['expanded_cpv_code'] = first_cpv_code.str[:4]  # 4-character prefix

# First entry of each list-valued column.
# NOTE: main_nuts_code is removed again by the column-drop cell that follows.
data['main_nuts_code'] = data['object_contract__object_descr__n2016_nuts__code'].map(list_to_item)
data['ma_main_activities__code'] = data['ma_main_activities__code'].map(list_to_item)

In [None]:
# Columns excluded from the feature matrix. This includes the target column
# total_tenders_received, whose values were already captured in `y`.
unused_columns = [
    "year",
    "legal_basis__value",
    "original_cpv_code",
    "contracting_body__address_contracting_body__country__value",
    "td_document_type__code",
    "main_nuts_code",
    "main_ma_main_activities__code",
    "ty_type_bid__code",
    "object_contract__cpv_main__cpv_code__code",
    "object_contract__title",
    "object_contract__short_descr",
    "total_tenders_received",
    "object_contract__object_descr__n2016_nuts__code",
]
data.drop(columns=unused_columns, inplace=True)

In [None]:
# Preview the columns that remain after the drop.
data.head()

In [None]:
# Categorical columns that get one-hot encoded before modelling.
cols_to_ohe = [
    'aa_authority_type__code',
    'ac_award_crit__code',
    'iso_country__value',
    'pr_proc__code',
    'rp_regulation__code',
    'ma_main_activities__code',
    'object_contract__type_contract__ctype',
    'main_cpv_code',
    'expanded_cpv_code',
]

In [None]:
# One-hot encode the categorical columns listed in cols_to_ohe; every other
# column is passed through unchanged.
ohe_data = pd.get_dummies(data=data, columns=cols_to_ohe)

In [None]:
# Preview the one-hot encoded feature matrix.
ohe_data.head()

In [None]:
# split the data
# `y` was extracted after the row filters and only columns have changed since,
# so it lines up row-for-row with ohe_data. The fixed random_state keeps the
# split the same across runs.
X_tr, X_te, y_tr, y_te = train_test_split(ohe_data, y, random_state=10)

In [None]:
# Sanity-check the shapes of the four splits.
split_shapes = (
    ("X_tr:", X_tr),
    ("X_te:", X_te),
    ("y_tr:", y_tr),
    ("y_te:", y_te),
)
for split_label, split_part in split_shapes:
    print(split_label, split_part.shape)

In [None]:
# Histogram of the training target with bin edges at 0, 50, ..., 450.
tender_bin_edges = np.arange(0, 500, 50)
plt.hist(y_tr, bins=tender_bin_edges)
plt.show()

## Fit Some Models

In [None]:
# Baseline: predict the training-set mean for every test row and report how
# that constant predictor scores on the test targets.
y_mean = np.mean(y_tr)
baseline_preds = np.full(len(y_te), y_mean)
baseline_mse = mean_squared_error(y_te, baseline_preds)
baseline_rmse = np.sqrt(baseline_mse)
baseline_r2 = r2_score(y_te, baseline_preds)

print("Target Mean:", y_mean)
print("Target Std:", np.std(y_tr))
print("Baseline RMSE:", baseline_rmse)
print("Baseline R^2:", baseline_r2)

### RandomForest

Note: run this entire section once, then come back and run it again, skipping the cell directly below (it resets `rfr_drop_features` to an empty list). The first pass identifies which columns the model never uses; the second pass drops those columns, which improves performance.

In [None]:
# Column names excluded from the random-forest fit and evaluation cells.
# Starts empty; the feature-importance cell further down repopulates it.
rfr_drop_features = []

In [None]:
# Fit the random forest on the training features, minus any columns currently
# listed in rfr_drop_features.
rfr = RandomForestRegressor(
    n_estimators=350,
    max_depth=38,
    n_jobs=-1,
)
rfr_train_X = X_tr.drop(columns=rfr_drop_features)
rfr.fit(rfr_train_X, y_tr)

In [None]:
# depth 35 - all features - max 1000
# NOTE(review): the label presumably describes the run whose output was saved;
# the code evaluates whatever `rfr` and `clip_values` currently are.
rfr_test_X = X_te.drop(columns=rfr_drop_features)
rfr_preds = rfr.predict(rfr_test_X).clip(*clip_values)
rfr_acc = mean_squared_error(y_te, rfr_preds)
rfr_rmse = np.sqrt(rfr_acc)
print("Test RMSE:", rfr_rmse)
print("Test R^2:", rfr.score(rfr_test_X, y_te))

In [None]:
# depth 38 - all features - max 1000
# Test-set RMSE on the clipped predictions, plus the model's own R^2 score.
rfr_test_X = X_te.drop(columns=rfr_drop_features)
rfr_preds = rfr.predict(rfr_test_X).clip(*clip_values)
rfr_acc = mean_squared_error(y_te, rfr_preds)
rfr_rmse = np.sqrt(rfr_acc)
print("Test RMSE:", rfr_rmse)
print("Test R^2:", rfr.score(rfr_test_X, y_te))

In [None]:
# depth 35 - used features - max 1000
# NOTE(review): the label presumably describes the run whose output was saved;
# the code evaluates whatever `rfr` and `clip_values` currently are.
rfr_test_X = X_te.drop(columns=rfr_drop_features)
rfr_preds = rfr.predict(rfr_test_X).clip(*clip_values)
rfr_acc = mean_squared_error(y_te, rfr_preds)
rfr_rmse = np.sqrt(rfr_acc)
print("Test RMSE:", rfr_rmse)
print("Test R^2:", rfr.score(rfr_test_X, y_te))

In [None]:
# depth 35 - all features - max 500
# NOTE(review): the label presumably describes the run whose output was saved;
# the code evaluates whatever `rfr` and `clip_values` currently are.
rfr_test_X = X_te.drop(columns=rfr_drop_features)
rfr_preds = rfr.predict(rfr_test_X).clip(*clip_values)
rfr_acc = mean_squared_error(y_te, rfr_preds)
rfr_rmse = np.sqrt(rfr_acc)
print("Test RMSE:", rfr_rmse)
print("Test R^2:", rfr.score(rfr_test_X, y_te))

In [None]:
# depth 32 - all features - max 2000
# NOTE(review): the label presumably describes the run whose output was saved;
# the code evaluates whatever `rfr` and `clip_values` currently are.
rfr_test_X = X_te.drop(columns=rfr_drop_features)
rfr_preds = rfr.predict(rfr_test_X).clip(*clip_values)
rfr_acc = mean_squared_error(y_te, rfr_preds)
rfr_rmse = np.sqrt(rfr_acc)
print("Test RMSE:", rfr_rmse)
print("Test R^2:", rfr.score(rfr_test_X, y_te))

In [None]:
# all features
# Training-set fit quality (unclipped predictions), for comparison with the
# test metrics.
rfr_train_X = X_tr.drop(columns=rfr_drop_features)
rfr_tr_preds = rfr.predict(rfr_train_X)
rfr_train_rmse = np.sqrt(mean_squared_error(y_tr, rfr_tr_preds))
print("Train RMSE:", rfr_train_rmse)
print("Train R^2:", rfr.score(rfr_train_X, y_tr))

In [None]:
# Summary statistics of the random-forest test predictions (from whichever
# evaluation cell ran last) next to those of the true test targets.
rfr_pred_summary = (
    ("Pred Mean:", np.mean),
    ("Pred Std:", np.std),
    ("Pred Min:", np.min),
    ("Pred Max:", np.max),
)
for stat_label, stat_fn in rfr_pred_summary:
    print(stat_label, stat_fn(rfr_preds))

print()

true_summary = (
    ("True Mean:", np.mean),
    ("True Std:", np.std),
    ("True Min:", np.min),
    ("True Max:", np.max),
)
for stat_label, stat_fn in true_summary:
    print(stat_label, stat_fn(y_te))

In [None]:
# depth 35 - all features - max 1000
# Predicted vs. true tender counts on the test set.
rfr_ax = plt.gca()
rfr_ax.scatter(rfr_preds, y_te, alpha=0.5)
rfr_ax.set_title("Random Forest Predictions vs Ground Truth")
rfr_ax.set_xlabel("Predictions")
rfr_ax.set_ylabel("Ground Truth")
plt.show()

In [None]:
# depth 35 - used features - max 1000
# Predicted vs. true tender counts on the test set.
rfr_ax = plt.gca()
rfr_ax.scatter(rfr_preds, y_te, alpha=0.5)
rfr_ax.set_title("Random Forest Predictions vs Ground Truth")
rfr_ax.set_xlabel("Predictions")
rfr_ax.set_ylabel("Ground Truth")
plt.show()

In [None]:
# depth 35 - all features
# Predicted vs. true tender counts on the test set.
rfr_ax = plt.gca()
rfr_ax.scatter(rfr_preds, y_te, alpha=0.5)
rfr_ax.set_title("Random Forest Predictions vs Ground Truth")
rfr_ax.set_xlabel("Predictions")
rfr_ax.set_ylabel("Ground Truth")
plt.show()

In [None]:
# feature importances
# The forest was fitted on X_tr minus the columns in rfr_drop_features, so its
# importances must be paired with exactly those columns. Pairing them with
# X_tr.columns mislabels (and silently truncates) the ranking once anything
# has been dropped. Prefer the column names the estimator recorded at fit
# time; fall back to reconstructing them on scikit-learn versions without
# `feature_names_in_`.
rfr_fitted_columns = getattr(rfr, "feature_names_in_", None)
if rfr_fitted_columns is None:
    rfr_fitted_columns = X_tr.columns.drop(rfr_drop_features)
if len(rfr_fitted_columns) != len(rfr.feature_importances_):
    raise ValueError(
        "rfr was fitted on a different set of columns; re-run the fit cell first"
    )

# Most important first; `ranked` keeps its (column, importance) layout.
ranked = [
    (col, imp)
    for imp, col in sorted(zip(rfr.feature_importances_, rfr_fitted_columns), reverse=True)
]
for col, imp in ranked:
    print(col, imp)

# Columns dropped on an earlier pass stay dropped; add the ones this fit never
# used (importance exactly 0). dict.fromkeys de-duplicates and keeps order, so
# re-running this cell without refitting is harmless.
rfr_unused_now = [col for col, imp in ranked if imp == 0]
rfr_drop_features = list(dict.fromkeys([*rfr_drop_features, *rfr_unused_now]))

In [None]:
# How many columns the next random-forest pass will exclude.
rfr_drop_count = len(rfr_drop_features)
print("Features to drop:", rfr_drop_count)

### ExtraTrees

In [None]:
# Column names excluded from the extra-trees fit and evaluation cells.
# Starts empty; the feature-importance cell further down repopulates it.
et_drop_features = []

In [None]:
# Fit the extra-trees ensemble on the training features, minus any columns
# currently listed in et_drop_features.
et = ExtraTreesRegressor(
    n_estimators=350,
    max_depth=35,
    n_jobs=-1,
)
et_train_X = X_tr.drop(columns=et_drop_features)
et.fit(et_train_X, y_tr)

In [None]:
# all features
# Test-set RMSE on the clipped predictions, plus the model's own R^2 score.
et_test_X = X_te.drop(columns=et_drop_features)
et_preds = et.predict(et_test_X).clip(*clip_values)
et_acc = mean_squared_error(y_te, et_preds)
et_rmse = np.sqrt(et_acc)
print("Test RMSE:", et_rmse)
print("Test R^2:", et.score(et_test_X, y_te))

In [None]:
# used features
# Test-set RMSE on the clipped predictions, plus the model's own R^2 score.
et_test_X = X_te.drop(columns=et_drop_features)
et_preds = et.predict(et_test_X).clip(*clip_values)
et_acc = mean_squared_error(y_te, et_preds)
et_rmse = np.sqrt(et_acc)
print("Test RMSE:", et_rmse)
print("Test R^2:", et.score(et_test_X, y_te))

In [None]:
# Training-set fit quality (unclipped predictions), for comparison with the
# test metrics.
et_train_X = X_tr.drop(columns=et_drop_features)
et_tr_preds = et.predict(et_train_X)
# The printed value is the square root of the MSE, so it is labelled RMSE
# (the label used to read "Train MSE:").
print("Train RMSE:", np.sqrt(mean_squared_error(y_tr, et_tr_preds)))
print("Train R^2:", et.score(et_train_X, y_tr))

In [None]:
# Summary statistics of the extra-trees test predictions (from whichever
# evaluation cell ran last) next to those of the true test targets.
et_pred_summary = (
    ("Pred Mean:", np.mean),
    ("Pred Std:", np.std),
    ("Pred Min:", np.min),
    ("Pred Max:", np.max),
)
for stat_label, stat_fn in et_pred_summary:
    print(stat_label, stat_fn(et_preds))

print()

true_summary = (
    ("True Mean:", np.mean),
    ("True Std:", np.std),
    ("True Min:", np.min),
    ("True Max:", np.max),
)
for stat_label, stat_fn in true_summary:
    print(stat_label, stat_fn(y_te))

In [None]:
# Predicted vs. true tender counts on the test set for the extra-trees model.
et_ax = plt.gca()
et_ax.scatter(et_preds, y_te, alpha=0.5)
et_ax.set_title("Extra Trees Predictions vs Ground Truth")
et_ax.set_xlabel("Predictions")
et_ax.set_ylabel("Ground Truth")
plt.show()

In [None]:
# feature importances
# The ensemble was fitted on X_tr minus the columns in et_drop_features, so its
# importances must be paired with exactly those columns. Pairing them with
# X_tr.columns mislabels (and silently truncates) the ranking once anything
# has been dropped. Prefer the column names the estimator recorded at fit
# time; fall back to reconstructing them on scikit-learn versions without
# `feature_names_in_`.
et_fitted_columns = getattr(et, "feature_names_in_", None)
if et_fitted_columns is None:
    et_fitted_columns = X_tr.columns.drop(et_drop_features)
if len(et_fitted_columns) != len(et.feature_importances_):
    raise ValueError(
        "et was fitted on a different set of columns; re-run the fit cell first"
    )

# Most important first; `ranked` keeps its (column, importance) layout.
ranked = [
    (col, imp)
    for imp, col in sorted(zip(et.feature_importances_, et_fitted_columns), reverse=True)
]
for col, imp in ranked:
    print(col, imp)

# Columns dropped on an earlier pass stay dropped; add the ones this fit never
# used (importance exactly 0). dict.fromkeys de-duplicates and keeps order, so
# re-running this cell without refitting is harmless.
et_unused_now = [col for col, imp in ranked if imp == 0]
et_drop_features = list(dict.fromkeys([*et_drop_features, *et_unused_now]))

In [None]:
# How many columns the next extra-trees pass will exclude.
et_drop_count = len(et_drop_features)
print("ET Features to Drop:", et_drop_count)

### LinearRegression

In [None]:
# Ordinary least squares on the full one-hot feature matrix; test predictions
# are clipped to clip_values before the RMSE is computed.
lr = LinearRegression().fit(X_tr, y_tr)
lr_raw_preds = lr.predict(X_te)
lr_preds = np.clip(lr_raw_preds, *clip_values)
lr_acc = mean_squared_error(y_te, lr_preds)
lr_rmse = np.sqrt(lr_acc)
print("Test RMSE:", lr_rmse)

In [None]:
# Training-set RMSE of the linear model (unclipped predictions).
lr_tr_preds = lr.predict(X_tr)
lr_train_mse = mean_squared_error(y_tr, lr_tr_preds)
print("Train RMSE:", np.sqrt(lr_train_mse))

In [None]:
# Summary statistics of the clipped linear-regression test predictions next to
# those of the true test targets.
lr_pred_summary = (
    ("Pred Mean:", np.mean),
    ("Pred Std:", np.std),
    ("Pred Min:", np.min),
    ("Pred Max:", np.max),
)
for stat_label, stat_fn in lr_pred_summary:
    print(stat_label, stat_fn(lr_preds))

print()

true_summary = (
    ("True Mean:", np.mean),
    ("True Std:", np.std),
    ("True Min:", np.min),
    ("True Max:", np.max),
)
for stat_label, stat_fn in true_summary:
    print(stat_label, stat_fn(y_te))

In [None]:
# Predicted vs. true tender counts on the test set for the linear model.
lr_ax = plt.gca()
lr_ax.scatter(lr_preds, y_te, alpha=0.5)
lr_ax.set_title("LR Preds vs True")
lr_ax.set_xlabel("Predictions")
lr_ax.set_ylabel("Ground Truth")
plt.show()