## **Problem Statement**
The task is to forecast the total amount of products sold in every shop for the test set. Note that the list of shops and products slightly changes every month. Creating a robust model that can handle such situations is part of the challenge.


In [None]:
import pandas as pd

In [None]:
import warnings
# Silence all warnings for the rest of the notebook.
# The former 'always' filter was removed: filterwarnings() puts each new rule at
# the front of the filter list, so the 'ignore' rule added right after it always
# matched first and the 'always' rule could never take effect.
warnings.filterwarnings('ignore')

In [None]:
# Load the sales records (sales_train.csv) that the rest of the notebook explores and aggregates.
train_df=pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")

In [None]:
# Load the test set (test.csv); later cells merge the monthly sales history onto it.
test_df=pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")

In [None]:
# Load the sample submission file.
# NOTE(review): `sub` is not referenced again in the cells shown here.
sub=pd.read_csv("../input/competitive-data-science-predict-future-sales/sample_submission.csv")

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# (rows, columns) of the training data.
train_df.shape

In [None]:
# (rows, columns) of the test data.
test_df.shape

In [None]:
# Load the shop lookup table (shops.csv); its 'shop_name' column feeds a word cloud below.
shop_df=pd.read_csv("../input/competitive-data-science-predict-future-sales/shops.csv")

In [None]:
# Preview the first rows of the shop table.
shop_df.head()

In [None]:
# Load the item lookup table (items.csv); its 'item_name' column feeds a word cloud below.
items_df=pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")

In [None]:
# Preview the first rows of the item table.
items_df.head()

In [None]:
# Load the item-category lookup table (item_categories.csv).
item_cat=pd.read_csv("../input/competitive-data-science-predict-future-sales/item_categories.csv")

In [None]:
# Preview the first rows of the item-category table.
item_cat.head()

In [None]:
# Summary statistics of the training data's numeric columns.
train_df.describe()

In [None]:
# Same summary statistics, transposed so that each training column is a row.
train_df.describe().T

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training data.

train_df.info()

In [None]:
# Number of missing values in each training column.
train_df.isnull().sum()

In [None]:
# Report the total number of missing cells in each loaded table.
# isnull().sum() counts per column; the second sum() adds those up per table.
frames_by_label = {
    "train": train_df,
    "test": test_df,
    "item": items_df,
    "shops": shop_df,
    "item_categories": item_cat,
}
for label, frame in frames_by_label.items():
    print(f"No. of Null values in the {label} set :", frame.isnull().sum().sum())

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar chart of how many items belong to each item category.
plt.rcParams['figure.figsize'] = (19, 9)

# Count the items per category from the item table. The previous version plotted
# item_cat['item_category_id'] against itself, which draws the id values rather
# than the counts that the title and axis labels promise.
# NOTE(review): assumes items_df has an 'item_category_id' column - confirm.
items_per_category = items_df['item_category_id'].value_counts().sort_index()

# x/y are passed by keyword; recent seaborn releases no longer accept them positionally.
sns.barplot(x = items_per_category.index, y = items_per_category.values, palette = 'colorblind')
plt.title('Count for Different Items Categories', fontsize = 30)
plt.xlabel('Item Categories', fontsize = 15)
plt.ylabel('Items in each Categories', fontsize = 15)
plt.show()

In [None]:
# Number of sales rows recorded in each date block.
plt.rcParams['figure.figsize'] = (19, 9)

# Pass the column as x= explicitly: in recent seaborn releases the first
# positional argument is `data`, so a bare Series is no longer read as x.
sns.countplot(x = train_df['date_block_num'])
plt.title('Date blocks according to months', fontsize = 30)
plt.xlabel('Different blocks of months', fontsize = 15)
plt.ylabel('No. of Purchases', fontsize = 15)
plt.show()

In [None]:
# Distribution of item prices across all sales rows.
plt.rcParams['figure.figsize'] = (13, 7)

# sns.distplot is deprecated and missing from newer seaborn releases; histplot
# with a density scale and a KDE overlay draws the equivalent figure.
# bins is fixed at 50 so the long price tail cannot trigger an enormous
# automatic bin count.
sns.histplot(x = train_df['item_price'], bins = 50, stat = 'density', kde = True, color = 'red')
plt.title('Distribution of the price of Items', fontsize = 30)
plt.xlabel('Range of price of items', fontsize = 15)
plt.ylabel('Distrbution of prices over items', fontsize = 15)
plt.show()

In [None]:
# Number of distinct values in each training column.
train_df.nunique()

In [None]:
# How many sales rows each item_id has, most frequent first.
train_df['item_id'].value_counts()

In [None]:

# Count the distinct item ids that occur in the training data.
# NOTE: `x` is a scratch name that the following cells overwrite.

x = train_df['item_id'].nunique()
print("The No. of Unique Items Present in the stores available: ", x)

In [None]:
# Count the distinct item categories listed in the category table
# (this counts categories, not items).

x = item_cat['item_category_id'].nunique()
print("The No. of Unique categories for Items Present in the stores available: ", x)

In [None]:
# Count the distinct shop ids that occur in the training data.
# The value is kept in `x`, which is where the next cell's print reads it from.

x = train_df['shop_id'].nunique()


In [None]:
# Prints whatever `x` currently holds; this is the shop count only if the cell
# that assigns x from train_df['shop_id'] ran immediately before this one.
print("No. of Unique Shops are :", x)

In [None]:
# Word cloud built from the item category names.

from wordcloud import WordCloud
from wordcloud import STOPWORDS

plt.rcParams['figure.figsize'] = (15, 12)
stopwords = set(STOPWORDS)

# Join the real names into one text. str(Series) would hand WordCloud pandas'
# printed representation instead: index numbers, a "..." truncation for long
# Series and the "Name: ..., dtype: object" footer all leak into the cloud.
category_text = ' '.join(item_cat['item_category_name'].dropna().astype(str))

wordcloud = WordCloud(background_color = 'lightblue',
                      max_words = 200,
                      stopwords = stopwords,
                      width = 1200,
                      height = 800,
                      random_state = 42).generate(category_text)


plt.title('Wordcloud for Item Category Names', fontsize = 30)
plt.axis('off')
plt.imshow(wordcloud, interpolation = 'bilinear')

In [None]:
# Word cloud built from the item names.

from wordcloud import WordCloud
from wordcloud import STOPWORDS

plt.rcParams['figure.figsize'] = (15, 12)
stopwords = set(STOPWORDS)

# Join the real names into one text. str(Series) would hand WordCloud pandas'
# printed representation instead, which shows only the first and last few rows
# of a long Series plus index numbers and the "Name: ..., dtype: object" footer.
item_name_text = ' '.join(items_df['item_name'].dropna().astype(str))

wordcloud = WordCloud(background_color = 'pink',
                      max_words = 200,
                      stopwords = stopwords,
                      width = 1200,
                      height = 800,
                      random_state = 42).generate(item_name_text)


plt.title('Wordcloud for Item Names', fontsize = 30)
plt.axis('off')
plt.imshow(wordcloud, interpolation = 'bilinear')

In [None]:
# Word cloud built from the shop names.

from wordcloud import WordCloud
from wordcloud import STOPWORDS

plt.rcParams['figure.figsize'] = (15, 12)
stopwords = set(STOPWORDS)

# Join the real names into one text. str(Series) would hand WordCloud pandas'
# printed representation instead, so index numbers and the
# "Name: ..., dtype: object" footer would be treated as words.
shop_name_text = ' '.join(shop_df['shop_name'].dropna().astype(str))

wordcloud = WordCloud(background_color = 'gray',
                      max_words = 200,
                      stopwords = stopwords,
                      width = 1200,
                      height = 800,
                      random_state = 42).generate(shop_name_text)


plt.title('Wordcloud for Shop Names', fontsize = 30)
plt.axis('off')
plt.imshow(wordcloud, interpolation = 'bilinear')

In [None]:
# Parse the raw 'date' strings into datetimes, overwriting the column in place
# (no new column is created here).
# The format is spelled out because the dates are written day-first (dd.mm.yyyy).
# Without it pandas guesses the field order, so e.g. 02.01.2013 is read as
# 1 February, and other values end up coerced to NaT.
# NOTE(review): day-first format taken from the competition data - confirm against the CSV.
train_df['date'] = pd.to_datetime(train_df['date'], format='%d.%m.%Y', errors='coerce')

In [None]:
# Derive calendar components from the 'date' column.
# Previously all three columns were assigned the full datetime, so 'month',
# 'year' and 'week' were just copies of 'date'.
parsed_dates = pd.to_datetime(train_df['date'], errors='coerce')

# month of the year (1-12)
train_df['month'] = parsed_dates.dt.month

# calendar year
train_df['year'] = parsed_dates.dt.year

# ISO week number of the year
train_df['week'] = parsed_dates.dt.isocalendar().week

# checking the new columns
train_df.columns

In [None]:
# Preview the training data again now that the date-derived columns exist.
train_df.head()

In [None]:
# Which days of the month are busiest for the shops.

plt.rcParams['figure.figsize'] = (15, 7)

# Count sales rows per day of the month (1-31). Counting on the full 'date'
# column would draw one bar per calendar date, which is unreadable and does not
# answer the question in the title.
# x= is passed by keyword because recent seaborn releases treat the first
# positional argument as `data`.
sns.countplot(x = train_df['date'].dt.day)
plt.title('The most busiest days for the shops', fontsize = 30)
plt.xlabel('Days', fontsize = 15)
plt.ylabel('Frequency', fontsize = 15)

plt.show()

In [None]:
# Which months of the year are busiest for the shops.

plt.rcParams['figure.figsize'] = (15, 7)

# Count sales rows per calendar month (1-12), taken straight from the parsed
# 'date' column so the plot is right whatever the 'month' column holds.
# x= is passed by keyword because recent seaborn releases treat the first
# positional argument as `data`.
sns.countplot(x = train_df['date'].dt.month, palette = 'dark')
plt.title('The most busiest months for the shops', fontsize = 30)
plt.xlabel('Months', fontsize = 15)
plt.ylabel('Frequency', fontsize = 15)

plt.show()

In [None]:
# List the training columns, including the date-derived ones added above.

train_df.columns

In [None]:
# dtype of each training column (shows whether 'date' was parsed to datetime).
train_df.dtypes

In [None]:
# Convert the daily sales rows into a wide table: one row per (item_id, shop_id)
# and one column per month holding the units sold in that month.

# 'YYYY-MM' label for every row, computed with the vectorised .dt accessor.
# Rows whose date is NaT get a missing label and are left out by groupby.
month_label = train_df['date'].dt.strftime('%Y-%m').rename('date')

# Sum only item_cnt_day. Summing every column also tries to add up the datetime
# columns, which pandas 2.x refuses to do and older versions silently dropped.
data = (
    train_df.groupby([month_label, 'item_id', 'shop_id'])['item_cnt_day']
    .sum()
    .reset_index()
)

# Spread the months into columns; (item, shop) pairs with no sales in a month get 0.
# Each cell has exactly one value after the groupby, so 'sum' just passes it through.
data = data.pivot_table(index=['item_id','shop_id'], columns = 'date', values = 'item_cnt_day',
                        aggfunc = 'sum', fill_value = 0).reset_index()

# looking at the newly prepared dataset
data.shape

In [None]:
# Attach each test row's monthly sales history.
# The left join keeps every test row; (item, shop) pairs with no history come
# back with missing month values, which are replaced by 0.
# NOTE: test_df is rebound here, so running this cell twice merges twice.
join_keys = ['item_id', 'shop_id']
test_df = test_df.merge(data, how = 'left', on = join_keys).fillna(0)

# Preview the merged frame.
test_df.head()

In [None]:
# Build the model inputs from the wide monthly-sales table.
# Training uses every month before the target month as features and the target
# month as the label. The test frame drops the oldest month instead, so it has
# the same number of month columns, shifted forward by one month.

target_month = '2015-10'   # month used as the training label
oldest_month = '2013-01'   # left out of x_test to keep the feature count equal

# 'ID' is only the test-row identifier, so it is excluded together with the
# other id columns; it was previously left in and fed to the model as a feature.
id_cols = ['ID', 'item_id', 'shop_id']

x_train = test_df.drop(id_cols + [target_month], axis = 1)
y_train = test_df[target_month]

# deleting the first month so that the model predicts the month after the target
x_test = test_df.drop(id_cols + [oldest_month], axis = 1)

# Give both frames the same position-based names (lag_1 = the month right before
# the one being predicted). The month labels differ between the two frames, and
# estimators that validate feature names would reject x_test otherwise.
lag_names = [f'lag_{k}' for k in range(x_train.shape[1], 0, -1)]
x_train.columns = lag_names
x_test.columns = lag_names

# checking the shapes of the datasets
print("Shape of x_train :", x_train.shape)
print("Shape of x_test :", x_test.shape)
print("Shape of y_train :", y_train.shape)

In [None]:
# Preview the training features (one row per test (item, shop) pair).

x_train.head()

In [None]:
# Preview the prediction features; same rows as x_train, with the oldest month dropped.

x_test.head()

In [None]:
# Hold out 20% of the rows as a validation set (fixed seed for repeatability).
# NOTE: x_train / y_train are rebound to the smaller training part, so running
# this cell again splits the already-reduced data.

from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(
    x_train,
    y_train,
    test_size = 0.2,
    random_state = 0,
)

# Report the shape of each part.
split_parts = (
    ("x_train", x_train),
    ("x_valid", x_valid),
    ("y_train", y_train),
    ("y_valid", y_valid),
)
for part_name, part in split_parts:
    print(f"Shape of {part_name} :", part.shape)

In [None]:
# MODELING
# Fit a LightGBM regressor on the training part, score it on the held-out
# validation part, then predict the test rows.

from lightgbm import LGBMRegressor

# NOTE(review): LightGBM applies row subsampling only when subsample_freq > 0,
# so `subsample` is presumably inactive with these settings - confirm against
# the LightGBM parameter docs before tuning it.
model_lgb = LGBMRegressor( n_estimators=200,
                           learning_rate=0.03,
                           num_leaves=32,
                           colsample_bytree=0.9497036,
                           subsample=0.8715623,
                           max_depth=8,
                           reg_alpha=0.04,
                           reg_lambda=0.073,
                           min_split_gain=0.0222415,
                           min_child_weight=40)
model_lgb.fit(x_train, y_train)

# The validation split was created but never evaluated; report its RMSE so the
# model quality is visible before submitting. Predictions are clipped to 0-20
# exactly like the submission; the targets are clipped to the same range
# (presumably how the competition scores - confirm).
valid_pred = model_lgb.predict(x_valid).clip(0., 20.)
valid_true = y_valid.clip(0., 20.).to_numpy()
valid_rmse = float(((valid_pred - valid_true) ** 2).mean() ** 0.5)
print("Validation RMSE :", valid_rmse)

# Raw (unclipped) predictions for the test rows.
y_pred_lgb = model_lgb.predict(x_test)

In [None]:
# Get the test set predictions and clip values to the specified range
y_pred_lgb = model_lgb.predict(x_test).clip(0., 20.)

# Create the submission file.
# The rows of x_test are the rows of test_df in the same order, so the
# predictions are indexed by the test set's own 'ID' values rather than by an
# implicit 0..n-1 counter that merely happens to match them.
submission_ids = pd.Index(test_df['ID'], name='ID')
preds = pd.DataFrame(y_pred_lgb, columns=['item_cnt_month'], index=submission_ids)
preds.to_csv('submission.csv',index_label='ID')