# Modélisation

In [1]:
# Bibliothèques nécessaires
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
import math
import operator

def RMSLE(estimate, real):
    """
        Computes the Root Mean Square Logarithmic Error of two elements.
        
        :param estimate: The predicted values
        :param real: The real values
        :return: The RMSLE of the two input elements
    """
    
    # we check the size on the input elements are the same
    assert(estimate.size == real.size), "Input elements must have the same size"
    size = estimate.size
    diff = map(operator.sub, 
               map(lambda x:math.log(x+1), estimate),
               map(lambda x:math.log(x+1), real))
    diff_squared = map(lambda x:x*x, diff)
    return math.sqrt(sum(diff_squared)/size)

In [2]:
# Chemin vers le fichier csv contenant les données à explorer.
path= "data/cleaned/mower_market_snapshot_cleaned.csv"

# Délimiteur utilisé dans le fichier csv
delimiter=","

# Création de la dataframe à partir du fichier csv
mower_df = pd.read_csv(path, delimiter=delimiter)

In [3]:
# Training set
train_size = 1120 # we do 80/20 since the dataset is not highly big
train_mower = mower_df.sample(n=train_size)

train_x = train_mower.iloc[0:,0:7]
train_x.drop(labels="id", axis=1, inplace=True)

# test set
test_mower = mower_df[~mower_df.id.isin(train_mower.id)]

test_x = test_mower.iloc[0:,0:7]
test_x.drop(labels="id", axis=1, inplace=True)

## Support Vector Classification with linear kernel

In [4]:
clf_svr_lin = SVR(kernel='linear', tol=0.001)
# training
clf_svr_lin = clf_svr_lin.fit(train_x.values, train_mower.attractiveness.values)
# testing
result_svr_lin = clf_svr_lin.predict(test_x.values)
# error
RMSLE(result_svr_lin, test_mower.attractiveness.values)

0.07270711149308698

## Support Vector Classification

In [5]:
reg = linear_model.Ridge(alpha = .5)
# training
reg.fit(train_x.values, train_mower.attractiveness.values)
# testing
result_reg = reg.predict(test_x.values)
# error
RMSLE(result_reg, test_mower.attractiveness.values)

0.07135023368762025

## Support Vector Classification with rbf kernel

In [6]:
clf_rbf = SVR(kernel='rbf')
clf_rbf = clf_rbf.fit(train_x.values, train_mower.attractiveness.values)
result_rbf = clf_rbf.predict(test_x.values)
RMSLE(result_rbf, test_mower.attractiveness.values)

0.05880304421489724

## Random forest

In [7]:
clf_rfg = RandomForestRegressor(50)
clf_rfg = clf_rfg.fit(train_x.values, train_mower.attractiveness.values)
result_rfg = clf_rfg.predict(test_x.values)
RMSLE(result_rfg, test_mower.attractiveness.values)

0.04295184123308616

La fôret aléatoire semble le meilleur candidat pour réaliser notre prédiction

## Prédiction

In [8]:
# Chemin vers le fichier csv contenant les données à explorer.
path= "data/original/submission_set.csv"

# Délimiteur utilisé dans le fichier csv
delimiter_submission=";"

# Création de la dataframe à partir du fichier csv
submission_df = pd.read_csv(path, delimiter=delimiter_submission)

In [9]:
# On supprime les variables que nous ne voulons pas garder pour la prédiction
submission_df.drop(labels="margin", axis=1, inplace=True)
submission_df.drop(labels="prod_cost", axis=1, inplace=True)

# Puis on applique les dictionnaires appliquées dans l'étape précédente

# Dictionnaire à appliquer sur la variable "indice de qualité".
dict_quality = {"Low": -1, "Medium": 0, "Hight": 1}

for line_index in range(submission_df.quality.size):
        quality = submission_df["quality"][line_index]
        submission_df.set_value(line_index, "quality", dict_quality[quality])
        
# dictionnaire à appliquer sur la variable "product type"
product_type_dict = {"auto-portee":1, "electrique":2, "essence":3}

for line_index in range(submission_df.product_type.size):
        product_type = submission_df["product_type"][line_index]
        submission_df.set_value(line_index, "product_type", product_type_dict[product_type])
        
for line in range(submission_df.warranty.size):
    warranty = submission_df["warranty"][line]
    submission_df.set_value(line, "warranty", np.int(warranty[0]))

In [10]:
submission_df.describe(include='all')

Unnamed: 0,capacity,failure_rate,id,price,product_type,quality,warranty
count,600.0,600.0,600.0,600.0,600.0,600.0,600.0
unique,,,,,3.0,3.0,3.0
top,,,,,3.0,-1.0,1.0
freq,,,,,355.0,407.0,355.0
mean,50.101143,0.164916,19957.5,244.531361,,,
std,16.364164,0.073387,173.349358,241.642303,,,
min,7.43233,0.029201,19658.0,63.969204,,,
25%,38.754969,0.095825,19807.75,73.367481,,,
50%,50.761661,0.202148,19957.5,78.618777,,,
75%,61.342713,0.225939,20107.25,380.587182,,,


In [11]:
ids = submission_df["id"]
submission_df.drop(labels="id", axis=1, inplace=True)

In [12]:
submission_result = clf_rfg.predict(submission_df.values)

col = ['id', 'attractiveness']
submission_to_save = pd.DataFrame(zip(ids,submission_result), columns=col)

In [13]:
# On enregistre au format csv
submission_to_save.to_csv(path_or_buf="data/result/besson_mathieu_attractiveness.csv", sep=';', index=False)

In [14]:
submission_to_save.describe()

Unnamed: 0,id,attractiveness
count,600.0,600.0
mean,19957.5,0.643014
std,173.349358,0.111017
min,19658.0,0.357996
25%,19807.75,0.557676
50%,19957.5,0.641417
75%,20107.25,0.726423
max,20257.0,0.876951


In [18]:
mower_df = pd.read_csv(path, delimiter=";")

In [19]:
mowerByQuality = mower_df.groupby("quality")

In [21]:
mowerByQuality.count()

Unnamed: 0_level_0,capacity,failure_rate,id,margin,price,prod_cost,product_type,warranty
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Hight,74,74,74,74,74,74,74,74
Low,407,407,407,407,407,407,407,407
Medium,119,119,119,119,119,119,119,119
