In [109]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp
import sweetviz as sv
from autoviz.AutoViz_Class import AutoViz_Class
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,MinMaxScaler,RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,ParameterGrid
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score
import lightgbm as lgb
import optuna
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
%matplotlib inline
plt.style.use('ggplot')

In [32]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
oil_data = pd.read_csv('oil.csv')
store_data = pd.read_csv('stores.csv')
holiday_events = pd.read_csv('holidays_events.csv')
transaction_data = pd.read_csv('transactions.csv')


In [33]:
#autoviz = AutoViz_Class().AutoViz('train.csv')

In [34]:
#report = sv.DataframeReport(train_df)
#report.show_notebook()

In [110]:
#pipeline構築
num_features = ['onpromotion']
cat_features = ['family','store_nbr','day','month','dayofweek','year']

numeric_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer()),
    ('scaler',StandardScaler())
])

cat_transformer = ce.OneHotEncoder(handle_unknown='ignore')

preprocesser = ColumnTransformer(transformers=[
    ('num_transform',numeric_transformer,num_features),
    ('cat_transform',cat_transformer,cat_features)
])

pipeline = Pipeline(steps=[
    ('preprocesser',preprocesser),
    ('regresser',lgb.LGBMRegressor())
])


In [111]:
set_config(display='diagram')
pipeline

In [60]:
#時系列データの変換
train_df['date'] = pd.to_datetime(train_df['date'], format="%Y-%m-%d %H:%M:%S")
train_df['day'] = train_df['date'].dt.day
train_df['month'] = train_df['date'].dt.month
train_df['dayofweek'] = train_df['date'].dt.dayofweek
train_df['year'] = train_df['date'].dt.year
train_df.head() 

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,day,month,dayofweek,year
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,1,1,1,2013
1,1,2013-01-01,1,BABY CARE,0.0,0,1,1,1,2013
2,2,2013-01-01,1,BEAUTY,0.0,0,1,1,1,2013
3,3,2013-01-01,1,BEVERAGES,0.0,0,1,1,1,2013
4,4,2013-01-01,1,BOOKS,0.0,0,1,1,1,2013


In [112]:
from sklearn.model_selection import train_test_split
X = train_df.drop(columns=['date','id','sales'])
y = train_df['sales']
y = y.astype('int')
X_train,X_test,y_train,y_test = train_test_split(X,y,shuffle=True,test_size=1)

In [62]:
pipeline.fit(X_train,y_train)

In [63]:
test_df['date'] = pd.to_datetime(test_df['date'], format="%Y-%m-%d %H:%M:%S")
test_df['day'] = test_df['date'].dt.day
test_df['month'] = test_df['date'].dt.month
test_df['dayofweek'] = test_df['date'].dt.dayofweek
test_df['year'] = test_df['date'].dt.year


In [65]:
X_for_submit = test_df[['store_nbr','family','onpromotion','day','month','dayofweek','year']]
submit_data = pd.DataFrame(X_for_submit)
submit = test_df[['id']]
submit['sales'] = pipeline.predict(submit_data)
submit.to_csv('submit1.csv',index=False)



In [115]:
param_grid = [
    {
        "preprocesser__num_transform__imputer__strategy": ["mean", "median"],
        "preprocesser__num_transform__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()], 
        "preprocesser__cat_transform__encoder": [ce.OneHotEncoder(handle_unknown='ignore')],      
        "regresser": [lgb.LGBMRegressor()]
    }
]
grid_search = GridSearchCV(pipeline, param_grid, cv=10, verbose=3,n_jobs=-1)


In [116]:
grid_search.fit(X_train,y_train)#ここが上手くいかない

Fitting 10 folds for each of 6 candidates, totalling 60 fits


ValueError: Invalid parameter encoder for estimator OneHotEncoder(handle_unknown='ignore'). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
params = {
    'silent': 1,
    'max_depth': 6,
    'min_child_weight': 1,
    'eta': 0.1,
    'tree_method': 'exact',
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'predictor': 'cpu_predictor'
}
dtrain = xgb.DMatrix(X_train,label=y_train)
dtest = xgb.DMatrix(X_test,label=y_test)
model = xgb.train(params=params,
                  dtrain=dtrain,
                  num_boost_round=1000,
                  early_stopping_rounds=5,
                  evals=[(dtest, 'test')])

In [None]:
pred = model.predict(xgb.DMatrix(X_test),
                     ntree_limit=model.best_ntree_limit)

In [None]:
plt.figure(figsize=(12, 12))
plt.scatter(y_test[:1000], pred[:1000])
plt.show()

In [None]:
X_for_submit = df_test[['store_nbr','family','onpromotion','day','month','dayofweek','year']]
submit = df_test[['id']]
submit['sales'] = model.predict(xgb.DMatrix(X_for_submit),
                     ntree_limit=model.best_ntree_limit)
submit

In [None]:
submit.to_csv('/content/drive/MyDrive/sales/submit.csv',index=False)