In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split

In [2]:
df_origin = pd.read_csv('processed/processed_products_averaged.csv')
# print df_origin
# print df_origin.product_type.unique().shape[0]
df_origin.iloc[368]

ProductId                                             9853
product_name         Pasteles yo Galletas 2a 500g MLA 9853
product_shortname             Pasteles yo Galletas 2a 500g
brand                                                  MLA
weight                                                 500
pieces                                                 NaN
volume                                                 NaN
product_type                 [u'pastel', u'yo', u'gallet']
weight_per_piece                                       NaN
has_choco                                            False
has_vanilla                                          False
has_multigrain                                       False
has_promotion                                        False
cluster                                                 30
product_type_len                                         3
Name: 368, dtype: object

In [3]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,\
                     train_scores_mean + train_scores_std, alpha=0.1,\
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,\
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",\
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",\
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

In [4]:
idx_predict = df_origin[df_origin.AdjDemandMean.isnull()].index
idx_train = df_origin[df_origin.AdjDemandMean.notnull()].index
df = df_origin.fillna(0)

# Feature processing
[df['productTypeFct'],typeFct] = pd.factorize(df.product_type)
[df['brandFct'],brandFct] = pd.factorize(df.brand)
df['has_choco'] = df.apply(lambda x: 1 if x.has_choco else 0,axis=1)
df['has_multigrain'] = df.apply(lambda x: 1 if x.has_multigrain else 0,axis=1)
df['has_vanilla'] = df.apply(lambda x: 1 if x.has_vanilla else 0,axis=1)
df['has_promotion'] = df.apply(lambda x: 1 if x.has_promotion else 0,axis=1)

# Feature scaling
scaler = MinMaxScaler()
df['weight_s'] = scaler.fit_transform(df['weight'].reshape(-1,1))
df['volume_s'] = scaler.fit_transform(df['volume'].reshape(-1,1))
df['wpp_s'] = scaler.fit_transform(df['weight_per_piece'].reshape(-1,1))
df['pieces_s'] = scaler.fit_transform(df['pieces'].reshape(-1,1))

AttributeError: 'DataFrame' object has no attribute 'AdjDemandMean'

In [None]:
# empty values to predict
df_to_predict = df.ix[idx_predict].copy(deep=True)
df_train = df.ix[idx_train].copy(deep=True)

#split into training and test datasets
df_train = df_train[['product_name','productTypeFct','brandFct','weight_s','volume_s','pieces_s',\
          'wpp_s','has_choco','has_vanilla','has_multigrain','has_promotion','cluster',\
          'AdjDemandMean','AdjDemandMeanScaled']]

# X - all columns except name and y-vectors
X = df_train.loc[:,'productTypeFct':'cluster']

# choose y - either Median or Mean
y = df_train.loc[:,'AdjDemandMean']

# split into validation and testing
X_train, X_test, y_train, y_test = train_test_split(
        X,y,test_size=0.2,random_state=42)

# Fit the model
n_est = 5
model = RandomForestRegressor(n_estimators = n_est)
title = 'Random forest: ' + str(n_est)
model.fit(X_train,y_train)

# plot the learning curve
plot_learning_curve(model, title, X_train, y_train, ylim=(0.0, 1.01), cv=5, n_jobs=4)

# Plot feature importance
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. %s (%f)" % (f + 1, X.columns[indices[f]], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

# Evaluate quality of prediction on test dataset
y_pred_s = model.predict(X_test)
global_median = y.median()
global_mean = y.mean()
mse_pred = np.sqrt(np.sum((y_test-y_pred_s)**2)/2/y_test.shape[0])
mse_median = np.sqrt(np.sum((y_test-global_median)**2)/2/y_test.shape[0])
mse_mean = np.sqrt(np.sum((y_test-global_mean)**2)/2/y_test.shape[0])
print "Pred: {:10.4f}, Median: {:10.4f}, Mean: {:10.4f}".format(mse_pred,mse_median,mse_mean)

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(y_test,y_pred_s,'ro')
ax.plot([0,100],[global_median,global_median],'b--')
xt = ax.set_xlim([0,8])
yt = ax.set_ylim([0,8])

In [None]:
#Predict new values
X_predict = df_to_predict[X_train.columns]
y_predict = np.exp(model.predict(X_predict))-1.0
# y_predict

In [None]:
y

In [None]:
df_new = df_origin
df_new.loc[idx_predict,'AdjDemandMean'] = y_predict

In [None]:
# df_new.loc[:,'ProductId':].set_index('ProductId').to_csv('processed/predicted_products.csv')
df_to_predict.shape[0]