https://www.kaggle.com/sarvajna/random-forest-with-variable-importance

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Raise the pandas display limit to 500 columns for DataFrame output.
pd.set_option('display.max_columns', 500)
import nltk
import datetime
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Folder holding the competition CSV files.
path = '../input/competitive-data-science-predict-future-sales/'

# Read the four raw tables used below; joining onto `path` yields the same
# file names as plain concatenation because `path` ends with a slash.
train = pd.read_csv(os.path.join(path, "sales_train.csv"))
items = pd.read_csv(os.path.join(path, "items.csv"))
item_categories = pd.read_csv(os.path.join(path, "item_categories.csv"))
shops = pd.read_csv(os.path.join(path, "shops.csv"))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Make text features.
def tfidf_features_25(df, col, feature_count=25):
    """Append TF-IDF columns computed from the text column ``col`` to ``df``.

    A ``TfidfVectorizer`` limited to ``feature_count`` terms is fitted on
    ``df[col]`` and every resulting term weight is stored in a new column
    named ``<col>_name_tfidf_<k>``, where ``k`` is the column position in the
    vectorizer output.

    Args:
        df: DataFrame that owns the text column; it is modified in place.
        col: Name of the text column to vectorize.
        feature_count: Upper bound on the number of TF-IDF columns to add.
            Defaults to 25, the value this helper has always used.

    Returns:
        The same ``df`` object, with the TF-IDF columns added.
    """
    tfidf = TfidfVectorizer(max_features=feature_count)

    # Carry df's own index so the column assignments below line up row for
    # row even when df does not have a default 0..n-1 index.
    weights = pd.DataFrame(tfidf.fit_transform(df[col]).toarray(), index=df.index)

    # Iterate over the columns that were actually produced: a small vocabulary
    # gives fewer than feature_count columns, and indexing a fixed range would
    # raise IndexError in that case.
    for idx in weights.columns:
        df[col + '_name_tfidf_' + str(idx)] = weights[idx]

    return df

# text features for items df

In [None]:
# Preview the first five rows of the items table.
items.head(n=5)

In [None]:
# Number of distinct category ids referenced by the items table, followed by
# the most frequent category ids.
print('item category id 개수 : ', items['item_category_id'].unique().size)
print(items['item_category_id'].value_counts().head())

In [None]:
# Table dimensions and the number of distinct item ids.
print('items shape : ', items.shape)
print('item id 개수 : ', items['item_id'].unique().size)

In [None]:
# Show the first five item rows again before adding features.
items.head(n=5)

In [None]:
# Character length of each item name (spaces and punctuation included).
items['item_name_length'] = items['item_name'].apply(len)
# Whitespace-delimited word count. split() with no separator ignores leading
# and trailing whitespace and collapses runs of spaces, so empty tokens are
# not counted as words the way split(' ') counts them.
items['item_name_word_count'] = items['item_name'].apply(lambda name: len(name.split()))

## tf-idf

In [None]:
# Add the TF-IDF columns derived from the item names.
items = tfidf_features_25(df=items, col='item_name')

In [None]:
# Dimensions after feature creation, then a five-row preview.
print(items.shape)
items.head(n=5)

# text features of item category df

In [None]:
# Preview the first five rows of the item category table.
item_categories.head(n=5)

In [None]:
# Table dimensions and the number of distinct category ids.
print('item_category data shape : ', item_categories.shape)
print('item category id 개수 : ', item_categories['item_category_id'].unique().size)

In [None]:
# Character length of each category name (spaces and punctuation included).
item_categories['item_category_name_length'] = item_categories['item_category_name'].apply(len)
# Whitespace-delimited word count. split() with no separator ignores leading
# and trailing whitespace and collapses runs of spaces, so empty tokens are
# not counted as words the way split(' ') counts them.
item_categories['item_category_name_word_count'] = item_categories['item_category_name'].apply(
    lambda name: len(name.split())
)

In [None]:
# Add the TF-IDF columns derived from the category names, then inspect the
# resulting dimensions and the first five rows.
item_categories = tfidf_features_25(df=item_categories, col='item_category_name')
print(item_categories.shape)
item_categories.head(n=5)

# text features for shops df

In [None]:
# Preview the first five rows of the shops table.
shops.head(n=5)

In [None]:
# Character length of each shop name (spaces and punctuation included).
shops['shop_name_length'] = shops['shop_name'].map(len)
# Whitespace-delimited word count. split() with no separator ignores leading
# and trailing whitespace and collapses runs of spaces, so empty tokens are
# not counted as words the way split(' ') counts them.
shops['shop_name_word_count'] = shops['shop_name'].map(lambda name: len(name.split()))

In [None]:
# Add the TF-IDF columns derived from the shop names, then inspect the
# resulting dimensions and the first five rows.
shops = tfidf_features_25(df=shops, col='shop_name')
print(shops.shape)
shops.head(n=5)

# prepare dataset

In [None]:
# Folder holding the previously engineered feature frame.
my_path = '../input/20190729-fe/'

# Load it and look at the first three rows.
df = pd.read_csv(os.path.join(my_path, '20190729_df.csv'))
df.head(n=3)

In [None]:
# List the column labels of the loaded feature frame.
df.columns

In [None]:
# Left-join the item-level text features onto every row by item_id and show
# the resulting dimensions.
df = df.merge(items, on='item_id', how='left')
df.shape

In [None]:
# Left-join the category-level text features by item_category_id and show
# the resulting dimensions.
df = df.merge(item_categories, on='item_category_id', how='left')
df.shape

In [None]:
# Left-join the shop-level text features by shop_id and show the resulting
# dimensions.
df = df.merge(shops, on='shop_id', how='left')
df.shape

In [None]:
# Remove the raw text columns, leaving only the features derived from them.
df = df.drop(columns=['shop_name', 'item_category_name', 'item_name'])

In [None]:
# Persist the merged feature frame to the working directory, without the
# row index.
df.to_csv('20190731_df.csv', index = False)

# Modeling

In [None]:
# Report whether any column label occurs more than once after the merges.
# Both outcomes are printed; the Korean message reports that duplicated
# columns exist.
if df.columns.is_unique:
    print('there are no duplicated columns!')
else:
    print('중복 칼럼이 있음')

In [None]:
# Separate the target from the features: pop() removes the column from df
# and returns it in one step.
y = df.pop('item_cnt_month')

# Time-based split on date_block_num. Each mask is built once and applied to
# both the features and the target, so X and y always select the same rows.
train_mask = df.date_block_num < 30
valid_mask = (df.date_block_num >= 30) & (df.date_block_num < 34)
test_mask = df.date_block_num == 34

X_train, y_train = df[train_mask], y[train_mask]
X_valid, y_valid = df[valid_mask], y[valid_mask]

X_test = df[test_mask] #test data

print('train shape X : {} y : {}'.format(X_train.shape, y_train.shape))
print('valid shape X : {} y : {}'.format(X_valid.shape, y_valid.shape))
print('test shape X : {}'.format(X_test.shape))

In [None]:
# Unbind the raw tables; their features now live in the merged frame.
del train
del items
del item_categories
del shops

In [None]:
import gc
# Unbind the full feature frame (the train/valid/test splits were already
# taken from it) and ask the garbage collector to run a collection pass.
del df
gc.collect()

## lightGBM

In [None]:
from lightgbm import LGBMRegressor

# Up to 1000 boosting rounds, using all available cores.
lgb = LGBMRegressor(n_estimators=1000, n_jobs=-1)

# Track RMSE on both the training and the validation split while fitting.
# NOTE(review): `verbose` and `early_stopping_rounds` as fit() keywords depend
# on the installed LightGBM version - confirm before upgrading the library.
lgb.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric='rmse',
    early_stopping_rounds=30,
    verbose=20,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pair every feature name with its LightGBM importance value.
feature_imp = pd.DataFrame(
    sorted(zip(lgb.feature_importances_, X_train.columns)),
    columns=['Value', 'Feature'],
)

# Horizontal bar chart, most important feature first.
plt.figure(figsize=(20, 10))
sns.barplot(
    data=feature_imp.sort_values(by="Value", ascending=False),
    x="Value",
    y="Feature",
).set_title('LightGBM Features')
plt.tight_layout()
plt.show()

## XGBoost

In [None]:
from xgboost import XGBRegressor

# Up to 1000 boosting rounds, using all available cores.
xgb = XGBRegressor(n_estimators=1000, n_jobs=-1)

In [None]:
# Track RMSE on both the training and the validation split while fitting.
# NOTE(review): `eval_metric` and `early_stopping_rounds` as fit() keywords
# depend on the installed XGBoost version - confirm before upgrading.
xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric='rmse',
    early_stopping_rounds=30,
    verbose=20,
)

In [None]:
# Pair every feature name with its XGBoost importance value.
feature_imp = pd.DataFrame(
    sorted(zip(xgb.feature_importances_, X_train.columns)),
    columns=['Value', 'Feature'],
)

# Horizontal bar chart, most important feature first.
plt.figure(figsize=(20, 10))
sns.barplot(
    data=feature_imp.sort_values(by="Value", ascending=False),
    x="Value",
    y="Feature",
).set_title('XGBoost Features')
plt.tight_layout()
plt.show()

# Submission

In [None]:
# Predict the test block with each model and bound the values to [0, 20].
y_pred_lgb = np.clip(lgb.predict(X_test), 0, 20)
y_pred_xgb = np.clip(xgb.predict(X_test), 0, 20)

In [None]:
# Keep five decimal places in the predictions.
y_pred_lgb = y_pred_lgb.round(5)
y_pred_xgb = y_pred_xgb.round(5)

In [None]:
# Load the competition test file and preview its first five rows.
test = pd.read_csv(
    '../input/competitive-data-science-predict-future-sales/test.csv'
)
test.head(n=5)

In [None]:
# Build one submission frame per model. The identifiers come from the ID
# column of test.csv rather than the frame's positional index, so the output
# stays correct even if the file's IDs are not simply 0..n-1.
# NOTE(review): this assumes the rows of X_test are in the same order as the
# rows of test.csv - confirm against the notebook that built the features.
sub1 = pd.DataFrame({
    'ID': test['ID'].values,
    'item_cnt_month': y_pred_lgb,
})

sub2 = pd.DataFrame({
    'ID': test['ID'].values,
    'item_cnt_month': y_pred_xgb,
})

# Write both submissions to the working directory without the row index.
sub1.to_csv('20190731_lgb_FE_pred.csv', index=False)
sub2.to_csv('20190731_xgb_Fe_pred.csv', index=False)