# Modélisation

In [1]:
# Bibliothèques nécessaires
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
import math
import operator

def RMSLE(estimate, real):
    """
        Compute the Root Mean Squared Logarithmic Error of two sets of values.

        RMSLE = sqrt(mean((log(1 + estimate) - log(1 + real)) ** 2))

        :param estimate: The predicted values (NumPy array or any array-like of numbers)
        :param real: The real values, with the same number of elements as ``estimate``
        :return: The RMSLE of the two input elements, as a Python float
        :raises AssertionError: if the two inputs do not have the same size
        :raises ValueError: if the inputs are empty, or if any value is <= -1
                            (log(1 + x) is undefined there)
    """

    # Accept lists / Series as well as arrays, and flatten so that a column
    # vector and a flat vector are compared element by element instead of
    # being broadcast against each other.
    estimate = np.asarray(estimate, dtype=float).ravel()
    real = np.asarray(real, dtype=float).ravel()

    # we check the size of the input elements are the same
    assert estimate.size == real.size, "Input elements must have the same size"
    if estimate.size == 0:
        raise ValueError("Input elements must not be empty")

    # np.log1p would silently return nan/-inf here; fail loudly instead,
    # exactly as math.log does on a non-positive argument.
    if (estimate <= -1).any() or (real <= -1).any():
        raise ValueError("Input elements must be strictly greater than -1")

    # log1p(x) == log(1 + x), but stays accurate when x is close to 0
    log_diff = np.log1p(estimate) - np.log1p(real)
    return float(np.sqrt(np.mean(log_diff * log_diff)))

In [2]:
# Location of the cleaned CSV file holding the data to model.
path = "data/cleaned/mower_market_snapshot_cleaned.csv"

# Field separator used in that CSV file
delimiter = ","

# Build the dataframe from the CSV file
mower_df = pd.read_csv(path, sep=delimiter)

In [3]:
# Reproducible 80/20 train/test split (80/20 because the dataset is not very big).
SEED = 42             # fixed seed so that a fresh "Run All" redraws the same split
TRAIN_FRACTION = 0.8  # share of the rows used for training
train_size = int(len(mower_df) * TRAIN_FRACTION)

# Training set: a random draw of train_size rows
train_mower = mower_df.sample(n=train_size, random_state=SEED)

# Test set: every row whose id was not drawn for training
test_mower = mower_df[~mower_df.id.isin(train_mower.id)]

# Targets
train_y = train_mower["attractiveness"]
test_y = test_mower["attractiveness"]

# Features: remove the identifier and the target.
# drop() returns a new frame, so train_mower / test_mower are left untouched,
# the cell can be re-run safely, and no SettingWithCopy workaround is needed.
train_x = train_mower.drop(labels=["id", "attractiveness"], axis=1)
test_x = test_mower.drop(labels=["id", "attractiveness"], axis=1)

## Support Vector Regression with linear kernel

In [4]:
# Train a linear-kernel support vector regressor on the training features
# (fit() returns the fitted estimator itself).
clf_svr_lin = SVR(kernel='linear', tol=0.001).fit(train_x.values, train_y.values)

# Predict on the held-out test features
result_svr_lin = clf_svr_lin.predict(test_x.values)

# Test-set error; being the last expression, it is displayed as the cell output
RMSLE(result_svr_lin, test_y.values)

0.0721828092783727

## Ridge regression

In [5]:
# Train a ridge regression (regularisation strength alpha = 0.5);
# fit() returns the fitted estimator itself.
reg = linear_model.Ridge(alpha=0.5).fit(train_x.values, train_y.values)

# Predict on the held-out test features
result_reg = reg.predict(test_x.values)

# Test-set error; being the last expression, it is displayed as the cell output
RMSLE(result_reg, test_y.values)

0.07167980462491595

## Support Vector Regression with rbf kernel

In [6]:
# Train an RBF-kernel support vector regressor, predict on the test features,
# then display the test-set RMSLE.
clf_rbf = SVR(kernel='rbf').fit(train_x.values, train_y.values)
result_rbf = clf_rbf.predict(test_x.values)
RMSLE(result_rbf, test_y.values)

0.05540540717710952

## Random forest

In [7]:
# Random forest of 100 trees.
# random_state is fixed because a forest is built from random bootstrap samples
# and random feature subsets: without a seed, the score below and the
# predictions written out later change on every run.
clf_rfg = RandomForestRegressor(n_estimators=100, random_state=42)
clf_rfg = clf_rfg.fit(train_x.values, train_y.values)

# Predict on the held-out test features and display the test-set RMSLE
result_rfg = clf_rfg.predict(test_x.values)
RMSLE(result_rfg, test_y.values)

0.04425356268895705

La forêt aléatoire semble être le meilleur candidat pour réaliser notre prédiction.

## Prédiction

In [15]:
# Location of the CSV file holding the rows to predict.
path = "data/original/submission_set.csv"

# Field separator used in that CSV file
delimiter_submission = ";"

# Load the file and discard, in a single step, the variables that are not
# kept for the prediction.
submission = pd.read_csv(path, delimiter=delimiter_submission).drop(
    labels=["margin", "prod_cost", "warranty"], axis=1
)

# One-hot encode the categorical variables
submission_df = pd.get_dummies(submission)

In [9]:
# Summary statistics for every column of the encoded submission frame
submission_df.describe(include='all')

Unnamed: 0,capacity,failure_rate,id,price,product_type_auto-portee,product_type_electrique,product_type_essence,quality_Hight,quality_Low,quality_Medium
count,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0
mean,50.101143,0.164916,19957.5,244.531361,0.1,0.308333,0.591667,0.123333,0.678333,0.198333
std,16.364164,0.073387,173.349358,241.642303,0.30025,0.46219,0.491936,0.329094,0.467506,0.399077
min,7.43233,0.029201,19658.0,63.969204,0.0,0.0,0.0,0.0,0.0,0.0
25%,38.754969,0.095825,19807.75,73.367481,0.0,0.0,0.0,0.0,0.0,0.0
50%,50.761661,0.202148,19957.5,78.618777,0.0,0.0,1.0,0.0,1.0,0.0
75%,61.342713,0.225939,20107.25,380.587182,0.0,1.0,1.0,0.0,1.0,0.0
max,92.666666,0.280646,20257.0,919.588021,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Set the ids aside so they can be saved next to the predictions.
# pop() returns the "id" column and removes it from the feature frame in place.
ids = submission_df.pop("id")

In [11]:
# Prediction using the model produced by the random forest.
# NOTE(review): predict() receives a bare array, so the columns of submission_df
# must be in the same order as the training features — confirm against train_x.
submission_result = clf_rfg.predict(submission_df.values)

# Build the output frame column by column from the two arrays. This avoids
# handing a one-shot zip iterator to the DataFrame constructor (support for
# which depends on the pandas version) and keeps each column's dtype.
col = ['id', 'attractiveness']
submission_to_save = pd.DataFrame(
    {'id': ids.values, 'attractiveness': submission_result},
    columns=col,
)

In [12]:
# Save the predictions as a semicolon-separated CSV file, without the row index
submission_to_save.to_csv(
    "data/result/besson_mathieu_attractiveness.csv",
    sep=';',
    index=False,
)

In [13]:
# Summary statistics of the saved ids and predicted attractiveness values
submission_to_save.describe()

Unnamed: 0,id,attractiveness
count,600.0,600.0
mean,19957.5,0.640592
std,173.349358,0.109899
min,19658.0,0.374462
25%,19807.75,0.559213
50%,19957.5,0.640819
75%,20107.25,0.721446
max,20257.0,0.876933


In [14]:
# Preview only the first rows instead of dumping the whole prediction frame
submission_to_save.head(10)

Unnamed: 0,id,attractiveness
0,20049,0.772682
1,19699,0.662101
2,19704,0.696098
3,20072,0.652712
4,20183,0.665864
5,19967,0.723292
6,20046,0.876484
7,19897,0.853276
8,20160,0.733350
9,20058,0.802830
