In [1]:
import csv
import random
from sklearn import naive_bayes
from sklearn.ensemble.forest import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree.tree import DecisionTreeRegressor
from sklearn.linear_model.ridge import Ridge
import sklearn.metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors.regression import KNeighborsRegressor
from sklearn import cross_validation
from sklearn import datasets
from sklearn.cross_validation import StratifiedKFold
from pprint import pprint
import xgboost as xgb

In [2]:
def load_dataset(file, columns, value_filter=lambda x: x):
    """Read a ';'-delimited CSV file into a list of row dictionaries.

    The first line of the file is treated as a header and discarded; the keys
    of the returned dictionaries come from ``columns`` instead.

    Args:
        file: Path of the CSV file to read.
        columns: Field names assigned, in order, to the values of each row.
        value_filter: Callable applied to every cell value (identity by
            default). Rows shorter than ``columns`` are padded with ``None``
            by ``csv.DictReader``, so the callable may receive ``None``.

    Returns:
        A list with one ``{column: value_filter(value)}`` dict per data row,
        or an empty list when the file contains no lines at all.
    """
    # newline="" hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are not rewritten by universal-newline mode.
    with open(file, "r", newline="") as fd:
        csv_reader = csv.DictReader(fd, fieldnames=columns, delimiter=";")
        # Drop the header row; the default keeps an empty file from raising
        # StopIteration out of this function.
        next(csv_reader, None)
        print("Loading dataset ...")
        return [
            {key: value_filter(value) for key, value in row.items()}
            for row in csv_reader
        ]


def filter_dataset(dataset, features, transform=lambda x: x):
    """Project every row onto ``features`` and convert the selected values.

    Args:
        dataset: Iterable of mappings (one per row) indexable by feature name.
        features: Names of the values to keep, in the desired output order.
        transform: Callable applied to each selected value (identity by
            default).

    Returns:
        A list of lists: one inner list per row, holding the transformed
        values in the order given by ``features``.
    """
    projected = []
    for record in dataset:
        selected = [transform(record[name]) for name in features]
        projected.append(selected)
    return projected

In [3]:
def cleanup_data(value):
    """Replace missing-value markers with the numeric sentinel ``-1``.

    The markers are the literal string ``"NR"``, the empty string, and
    ``None`` (which ``csv.DictReader`` supplies for fields absent from a short
    row). Any other value is returned unchanged.

    Args:
        value: Raw cell value as read from the CSV file.

    Returns:
        ``-1`` for a missing-value marker, otherwise ``value`` itself.
    """
    # None is checked explicitly: left as-is it would only fail later, in the
    # float() conversion, with a far less helpful error.
    if value is None or value in ("NR", ""):
        return -1
    return value


# Location of the training sample (';'-delimited CSV).
train_data_file = "./data/ech_apprentissage.csv"

# Names assigned positionally to the fields of every row: the descriptive
# columns, then the anonymous var1..var22 columns, then the target.
columns = (
    [
        'id',
        'annee_naissance',
        'annee_permis',
        'marque',
        'puis_fiscale',
        'anc_veh',
        'codepostal',
        'energie_veh',
        'kmage_annuel',
        'crm',
        'profession',
    ]
    + ['var{}'.format(index) for index in range(1, 23)]
    + ['prime_tot_ttc']
)

# Load every row as a dict, with missing-value markers replaced by cleanup_data.
dataset = load_dataset(train_data_file, columns, value_filter=cleanup_data)

Loading dataset ...


In [4]:
print("Filtering features ...")
# Columns converted to float and fed to the model. Deliberately left out:
# 'id', 'marque', 'codepostal', 'energie_veh', 'profession', 'var1', 'var2',
# 'var6', 'var8', 'var10', 'var14', 'var16' and the target 'prime_tot_ttc'.
feature_to_filter = [
    'annee_naissance', 'annee_permis', 'puis_fiscale', 'anc_veh',
    'kmage_annuel', 'crm',
    'var3', 'var4', 'var5', 'var7', 'var9',
    'var11', 'var12', 'var13', 'var15', 'var17',
    'var18', 'var19', 'var20', 'var21', 'var22',
]
print("feature_to_filter = {}".format(feature_to_filter))
dataset_filtered = filter_dataset(dataset, feature_to_filter, transform=float)

# Regression target, one value per row, in the same order as dataset_filtered.
targets = [float(record["prime_tot_ttc"]) for record in dataset]
print("Splitting dataset")

# 70 % of the rows go to training, the remainder to the hold-out set.
dataset_size = len(dataset_filtered)
train_dataset_size = int(dataset_size * 0.7)
test_dataset_size = dataset_size - train_dataset_size

# NOTE(review): the split is positional (no shuffling) -- this presumes the
# rows of the file are not sorted in a meaningful way; confirm.
train_dataset, test_dataset = (
    dataset_filtered[:train_dataset_size],
    dataset_filtered[train_dataset_size:],
)
train_target, test_target = (
    targets[:train_dataset_size],
    targets[train_dataset_size:],
)

split_summary = "train_dataset_size = {}\ntest_dataset_size = {}"
print(split_summary.format(len(train_dataset), len(test_dataset)))

Filtering features ...
feature_to_filter = ['annee_naissance', 'annee_permis', 'puis_fiscale', 'anc_veh', 'kmage_annuel', 'crm', 'var3', 'var4', 'var5', 'var7', 'var9', 'var11', 'var12', 'var13', 'var15', 'var17', 'var18', 'var19', 'var20', 'var21', 'var22']
Splitting dataset
train_dataset_size = 210000
test_dataset_size = 90000


n_estimators=100, learning_rate=0.1, loss='lad' 0.82
n_estimators=100, learning_rate=0.2, loss='lad' 0.84
n_estimators=100, learning_rate=0.2, loss='huber' 0.84


In [5]:
# Hyper-parameters of the boosted-tree regressor, gathered in one place.
xgb_params = {"max_depth": 8, "n_estimators": 400, "silent": False}
model = xgb.XGBRegressor(**xgb_params)
print("Fitting model")
# Kept as the last expression so the notebook displays the fitted estimator.
model.fit(train_dataset, train_target)

Fitting model


XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=400, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1)

In [None]:
print("Validating")
# 5-fold cross-validation on the training part. No ``scoring`` argument is
# given, so cross_val_score falls back to the estimator's own ``score``
# method, which for a regressor is the R^2 coefficient -- not an accuracy.
scores = cross_validation.cross_val_score(model, train_dataset, train_target, cv=5)
print("R^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Evaluate the quality of the prediction on the hold-out rows.
test_predictions = model.predict(test_dataset)
# mean_absolute_error is documented as (y_true, y_pred); the value is the
# same either way, but the call now matches the documented signature.
quality = sklearn.metrics.mean_absolute_error(test_target, test_predictions)
print("Mean absolute error: %0.2f" % quality)

In [37]:
print("Predicting ...")
# Stored under its own name so the hold-out ``test_dataset`` built by the
# split cell is not overwritten with raw row dicts, which would break any
# later re-run of the evaluation cell.
submission_rows = load_dataset(file="data/ech_test.csv",
                               columns=columns,
                               value_filter=cleanup_data)

test_dataset_filtered = filter_dataset(dataset=submission_rows,
                                       features=feature_to_filter,
                                       transform=float)

# Predict before opening the output file, so a failure here does not leave a
# truncated result.csv behind.
result = model.predict(test_dataset_filtered)

# newline="" is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate "\n" emit "\r\r\n" row endings.
with open("result.csv", mode="w", newline="") as outfile:
    csv_writer = csv.writer(outfile, delimiter=";")
    csv_writer.writerow(("id", "prime_tot_ttc"))
    # One output line per input row: its id followed by the predicted value.
    for res, row in zip(result, submission_rows):
        csv_writer.writerow((row['id'], res))

Predicting ...
Loading dataset ...
