In [1]:
import pandas as pd
import numpy as np
import os
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, HTML


In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
from  sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Directory holding the input CSVs, relative to the notebook's working directory.
PATH = "../data"
# List the directory so the available input files show up in the cell output.
os.listdir(PATH)

['kaggle_sample_submission.csv', 'player_price.csv', 'player_stats.csv']

#### Read sample submission and dataset with price dynamics

In [4]:
# Load the sample submission (one row per "<player_id>_<date>" id) and the
# per-player price history. os.path.join keeps path handling portable
# instead of hand-concatenating separators.
sub = pd.read_csv(os.path.join(PATH, 'kaggle_sample_submission.csv'))
df_pp = pd.read_csv(os.path.join(PATH, 'player_price.csv'))

#### Split the submission `id` into `player_id` and `Date` columns

In [5]:
# The submission id has the form "<player_id>_<date>". Split it once with the
# vectorised string accessor instead of running a Python lambda per row twice.
id_parts = sub["id"].str.split("_")
sub["player_id"] = id_parts.str[0].astype(int)
sub["Date"] = pd.to_datetime(id_parts.str[1])

#### Keep only submission players in the price table, convert timestamps to dates and drop unused columns

In [6]:
# Keep only players that appear in the submission, derive a datetime `Date`
# column from the millisecond `timestamp`, and drop the columns that are not
# used afterwards. Built as one chain that returns a new frame, so no column is
# assigned on a filtered slice (SettingWithCopy) and nothing is dropped in place.
df_pp = (
    df_pp[df_pp["player_id"].isin(sub["player_id"].unique())]
    .assign(Date=lambda frame: pd.to_datetime(frame["timestamp"], unit="ms"))
    .drop(["timestamp", "ps_price", "player_name"], axis=1)
)

In [7]:
# Preview the first two rows of the filtered price table.
df_pp.head(2)

Unnamed: 0,xbox_price,player_id,Date
0,0,1,2016-09-23
1,10000000,1,2016-09-24


#### Prepare data for model training in wide format (one row per player, one column per date)

In [8]:
def _pivot_prices(prices):
    """Pivot long price rows into a player_id x Date table of summed xbox_price."""
    wide = pd.pivot_table(
        # assign() works on a copy, so the caller's `Date` column keeps its dtype.
        prices.assign(Date=prices["Date"].astype(str)),
        values="xbox_price",
        index="player_id",
        columns="Date",
        aggfunc="sum",
    )
    return wide.sort_index()


def transform_data(X_train, Y_train):
    """Convert two long-format price frames into wide (player x date) tables.

    Parameters
    ----------
    X_train, Y_train : pandas.DataFrame
        Frames with at least the columns ``player_id``, ``Date`` and
        ``xbox_price``. Neither frame is modified.

    Returns
    -------
    (dataForModel, dataForPrediction) : tuple of pandas.DataFrame
        One pivot per input, indexed by ``player_id`` (sorted) with one column
        per ``Date`` rendered as a string; cells hold the sum of ``xbox_price``
        and are NaN where a player has no row for that date.
    """
    return _pivot_prices(X_train), _pivot_prices(Y_train)

### Data Preparation

In [9]:
def prepareXY(df, n=7):
    """Slice `df` into a leading part and a part shifted forward by `n` rows.

    Returns ``(df[:-(n + 1)], df[n:])``.

    NOTE(review): the first slice drops ``n + 1`` trailing rows while the
    second drops ``n`` leading rows, so the two results differ in length by
    one - confirm this is intended.
    """
    features = df[:-(n + 1)]
    targets = df[n:]
    return features, targets

In [10]:
def prepate_date(df, start=0, period=7, pr_period=7):
    """Cut four column windows out of a wide frame, replacing NaN with 0.

    With ``end = start + period`` the windows are, by column position:

    * x_train: ``[start, end)``
    * y_train: ``[end, end + pr_period)``
    * x_test:  ``[end, end + period)``
    * y_test:  ``[end + period, end + period + pr_period)``

    Returns the tuple ``(x_train, y_train, x_test, y_test)``.
    """
    train_stop = start + period
    test_stop = train_stop + period
    column_windows = (
        (start, train_stop),
        (train_stop, train_stop + pr_period),
        (train_stop, test_stop),
        (test_stop, test_stop + pr_period),
    )
    return tuple(df.iloc[:, lo:hi].fillna(0) for lo, hi in column_windows)

In [11]:
def weighted_average(series, weights):
    """Return the weighted sum of the last ``len(weights)`` items of `series`.

    ``weights[-1]`` is applied to ``series[-1]``, ``weights[-2]`` to
    ``series[-2]`` and so on. The weights are not normalised here, so the
    result is a true average only when they sum to 1.

    `weights` is read through ``reversed()`` and never modified, so calling
    the function repeatedly with the same list gives the same answer, and
    tuples or arrays are accepted as well.
    """
    result = 0.0
    for offset, weight in enumerate(reversed(weights)):
        result += series[-offset - 1] * weight
    return result

### Function for Model

In [12]:
# Currently this function doesn't work
def model(x_train, y_train, x_test, y_test):
    """Fit one LinearRegression per target day and write predictions into `y_test`.

    For every unique value of ``target_df["Date"]`` a separate model is fitted
    on `x_train` against the matching ``"xbox_price"`` rows of `y_train`; then,
    for the i-th unique ``sub["Date"]``, the i-th model's predictions on
    `x_test` are stored in the ``"price"`` column of `y_test`.

    NOTE(review): `target_df` is not defined anywhere in the visible notebook,
    so the first loop raises NameError. `sub` is read from the notebook's
    global namespace. `_y_test` is an alias of `y_test`, not a copy, so the
    caller's frame is modified. Nothing is returned.
    """
    models = []
    _y_test = y_test
    
    for day in target_df["Date"].unique():
        lr = LinearRegression(n_jobs=-1)
        lr.fit(x_train, y_train.loc[target_df["Date"] == day, "xbox_price"])
        models.append(lr)
    for i, day in enumerate(sub["Date"].unique()):
        _y_test.loc[sub["Date"] == day, "price"] = models[i].predict(x_test)

### Prepare function for validation

In [13]:
# Keep a one-row sample of the price table in `df`, then show the first five rows.
df = df_pp.head(1)
df_pp.head(5)

Unnamed: 0,xbox_price,player_id,Date
0,0,1,2016-09-23
1,10000000,1,2016-09-24
2,0,1,2016-09-25
3,0,1,2016-09-26
4,10000000,1,2016-09-27


In [14]:
def split(X_train, Y_train, n_folds=35151):
    """Walk forward through the rows in `n_folds` chunks and collect MAPE values.

    The fold size is ``k = floor(len(X_train) / n_folds)``. For every
    ``i = 2 .. n_folds`` the first ``k * i`` rows are taken and cut at
    ``floor(k * i * (i - 1) / i)``: rows from the previous cut up to this one
    are the training part, rows after it are the test part. Both parts are
    pivoted with ``transform_data`` before fitting.

    Parameters
    ----------
    X_train, Y_train : pandas.DataFrame
        Long-format frames accepted by ``transform_data``.
    n_folds : int
        Number of chunks. Fewer than 2 leaves nothing to evaluate and yields
        an empty list (previously ``n_folds=0`` raised ZeroDivisionError).
        NOTE(review): the default 35151 looks dataset-specific - confirm.

    Returns
    -------
    list of float
        One mean absolute percentage error per (fold, test-day column).

    NOTE(review): a single LinearRegression is refitted for every training
    target day and only the last fit is used for prediction, so every test-day
    column receives the same predicted values. The percentage error divides by
    the actual prices and becomes inf/NaN wherever those are 0.
    """
    if n_folds < 2:
        return []

    fold_size = int(np.floor(float(X_train.shape[0]) / n_folds))

    last_index = 0
    errors = []
    for i in range(2, n_folds + 1):
        train_fraction = float(i - 1) / i

        X = X_train[:(fold_size * i)]
        y = Y_train[:(fold_size * i)]

        split_index = int(np.floor(X.shape[0] * train_fraction))

        # Train on the rows added since the previous cut, test on the remainder.
        X_trainFolds = X[last_index:split_index]
        y_trainFolds = y[last_index:split_index]
        X_testFold = X[split_index:]
        y_testFold = y[split_index:]
        last_index = split_index

        X_trainFolds, Y_trainFolds = transform_data(X_trainFolds, y_trainFolds)
        X_testFold, Y_testFold = transform_data(X_testFold, y_testFold)

        _y_test = Y_testFold.copy()
        lr = LinearRegression(n_jobs=-1)
        for offset, day in enumerate(Y_trainFolds.columns.unique()):
            # Fit on the feature columns from position `offset` onward, then
            # store the day's actual values as a feature column for later fits.
            # `.iloc` replaces `.ix`, which was removed in pandas 1.0.
            lr.fit(X_trainFolds.iloc[:, offset:], Y_trainFolds[day])
            X_trainFolds[day] = Y_trainFolds[day]

        for day in Y_testFold.columns.unique():
            _y_test[day] = lr.predict(X_testFold)
            # MAPE over the whole test frame after filling in this day's column.
            error = np.mean(np.abs((Y_testFold.values - _y_test.values) / Y_testFold.values)) * 100
            errors.append(error)
    return errors

In [15]:
#n_folds = int(np.floor(float(train.shape[0]) / 7))


In [16]:
# NOTE(review): this cell redefines weighted_average, which an earlier cell
# already defines with the same body; keep only one definition.
def weighted_average(series, weights):
    """Return the weighted sum of the last ``len(weights)`` items of `series`.

    ``weights[-1]`` is applied to ``series[-1]``, ``weights[-2]`` to
    ``series[-2]`` and so on. The weights are not normalised here, so the
    result is a true average only when they sum to 1.

    `weights` is read through ``reversed()`` and never modified, so calling
    the function repeatedly with the same list gives the same answer, and
    tuples or arrays are accepted as well.
    """
    result = 0.0
    for offset, weight in enumerate(reversed(weights)):
        result += series[-offset - 1] * weight
    return result

In [17]:
def exponential_smoothing(series, alpha):
    """Return the simple exponential smoothing of `series` as a list.

    The first output equals the first input; each later output is
    ``alpha * value + (1 - alpha) * previous_output``.

    Values are consumed by iteration, not by ``series[n]`` lookups, so a
    pandas Series is handled positionally whatever its index labels are.
    An empty input gives an empty list.
    """
    values = iter(series)
    try:
        smoothed = [next(values)]  # first value is same as series
    except StopIteration:
        return []
    for value in values:
        smoothed.append(alpha * value + (1 - alpha) * smoothed[-1])
    return smoothed

In [None]:
# Price history of the player whose id is 1.
train = df_pp.loc[df_pp["player_id"] == 1]

In [None]:
# Run the walk-forward validation separately for every player and collect the
# per-fold errors (one list per player) in `er`.
WINDOW_DAYS = 7         # used as the prepareXY shift and as the fold length
SMOOTHING_ALPHA = 0.25  # weight of the newest price in the exponential smoothing

er = []
# groupby(sort=False) yields players in order of first appearance, like
# unique(), without re-scanning the whole table once per player.
for player_id, player_prices in df_pp.groupby("player_id", sort=False):
    train = player_prices.reset_index()
    res = exponential_smoothing(train['xbox_price'], SMOOTHING_ALPHA)
    train['xbox_price'] = pd.Series(res)
    X_train, Y_train = prepareXY(train, WINDOW_DAYS)
    n_folds = X_train.shape[0] // WINDOW_DAYS
    # split() divides by n_folds and only evaluates folds 2..n_folds, so a
    # player with too little history contributes an empty error list instead
    # of raising ZeroDivisionError.
    errors = split(X_train, Y_train, n_folds=n_folds) if n_folds >= 2 else []
    er.append(errors)