In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for input_path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Reading the data files into DataFrames

In [None]:
# Load items.csv, report its shape and column dtypes, then preview the first rows.
items = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/items.csv'
)
print(items.shape, items.dtypes)
items.head()

In [None]:
# Load sales_train.csv, report its shape and column dtypes, then preview the first rows.
sales_train = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
)
print(sales_train.shape, sales_train.dtypes)
sales_train.head()

In [None]:
# Convert the day.month.year strings in `date` into datetime values.
sales_train['date'] = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')
sales_train.head()

In [None]:
# Load item_categories.csv, report its shape and column dtypes, then preview it.
item_categories = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv'
)
print(item_categories.shape, item_categories.dtypes)
item_categories.head()

In [None]:
# Load test.csv, report its shape and column dtypes, then preview it.
test = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/test.csv'
)
print(test.shape, test.dtypes)
test.head()

In [None]:
# Load shops.csv, report its shape and column dtypes, then preview it.
shops = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/shops.csv'
)
print(shops.shape, shops.dtypes)
shops.head()

In [None]:
# Load the sample submission file and preview its first rows.
sample = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv'
)
sample.head()

* These previews give a basic view of each column's datatype, which lets us spot and resolve any mismatch between a column's dtype and its contents (for example, parsing `date` into a datetime).
* Inspecting the loaded data also shows that there are no missing values.

# Exploratory Data Analysis

In [None]:
# Count how many items belong to each category.
# The count column is named explicitly instead of renaming the output of
# value_counts().to_frame(), whose column name differs between pandas versions.
# groupby sorts by item_category_id, so no separate sort is needed.
category = (
    items.groupby('item_category_id')
    .size()
    .rename('value_counts')
    .to_frame()
)

plt.figure(figsize=(16, 14))
sns.barplot(x='item_category_id', y='value_counts', data=category.reset_index())
plt.xlabel('Item Category Id')
plt.ylabel('Number of items')
plt.ylim(0, 2500)
plt.title('number of items in each category')
plt.show()

In [None]:
# Total units sold in each month block.
# item_cnt_day is selected before summing so that only the needed column is
# aggregated; summing every column would also try to sum the datetime `date`
# column, which pandas cannot do.
month = (
    sales_train.groupby('date_block_num')['item_cnt_day']
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'items_sold'})
)

plt.figure(figsize=(16, 14))
sns.barplot(x='date_block_num', y='items_sold', data=month)
plt.title('Sum of sales per month')
plt.xlabel('Date block Number')
plt.ylabel('Number of items sold')
plt.show()

In [None]:
# NOTE(review): this cell repeats the monthly-sales chart of the preceding
# cell; consider removing one of the two.
# Total units sold in each month block.  item_cnt_day is selected before
# summing so the datetime `date` column is never aggregated.
month = (
    sales_train.groupby('date_block_num')['item_cnt_day']
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'items_sold'})
)

plt.figure(figsize=(16, 14))
sns.barplot(x='date_block_num', y='items_sold', data=month)
plt.title('Sum of sales per month')
plt.xlabel('Date block Number')
plt.ylabel('Number of items sold')
plt.show()

In [None]:
# Total units sold by each shop.  Only item_cnt_day is aggregated; summing
# the whole frame would also try to sum the datetime `date` column.
sales_shop = (
    sales_train.groupby('shop_id')['item_cnt_day']
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'shop_sales'})
)

plt.figure(figsize=(16, 14))
sns.barplot(x='shop_id', y='shop_sales', data=sales_shop)
plt.title('Sales per shop')
plt.xlabel('Shop id')
plt.ylabel('Sales Sum')
plt.show()

In [None]:
# Total units sold of each item.  Only item_cnt_day is aggregated; summing
# the whole frame would also try to sum the datetime `date` column.
sales_item = (
    sales_train.groupby('item_id')['item_cnt_day']
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_sales'})
)

# NOTE(review): this draws one bar per distinct item_id; with many items the
# chart is slow to render and hard to read — consider plotting only the top sellers.
plt.figure(figsize=(16, 14))
sns.barplot(x='item_id', y='item_sales', data=sales_item)
plt.title('Sales per item')
plt.xlabel('item id')
plt.ylabel('Sales Sum')
plt.show()

In [None]:
# Look up each sale's category through the items table, then total the
# units sold per category.
item_cat = (
    sales_train.merge(items, on='item_id')
    .groupby('item_category_id')['item_cnt_day']
    .sum()
)

plt.figure(figsize=(16, 14))
sns.barplot(data=item_cat.reset_index(), x='item_category_id', y='item_cnt_day')
plt.title('Sales per item category')
plt.xlabel('Item Category id')
plt.ylabel('Item Count')
plt.show()

In [None]:
# Break the sale date into calendar components.
sales_train['day'] = sales_train['date'].dt.day
sales_train['month'] = sales_train['date'].dt.month
sales_train['year'] = sales_train['date'].dt.year
# Series.dt.week is deprecated and no longer available in current pandas;
# isocalendar().week is its replacement and yields the same ISO week number.
sales_train['week'] = sales_train['date'].dt.isocalendar().week
sales_train.columns

In [None]:
# Number of sales records for each day of the month.
plt.figure(figsize=(16, 14))
# Pass the column by keyword: seaborn's first positional parameter is `data`,
# so a positional Series is not reliably interpreted as the x variable.
sns.countplot(x='day', data=sales_train)
plt.title('Busiest days for the shops')
plt.xlabel('Days')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Number of sales records for each calendar month.
plt.figure(figsize=(16, 14))
# Pass the column by keyword: seaborn's first positional parameter is `data`,
# so a positional Series is not reliably interpreted as the x variable.
sns.countplot(x='month', data=sales_train)
plt.title('Busiest month for the shops')
plt.xlabel('Month')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Revenue of each sales record = unit price * units sold.
sales_train['revenue'] = sales_train['item_price'] * sales_train['item_cnt_day']

plt.figure(figsize=(16, 14))
# sns.distplot is deprecated; histplot with a density scale and a KDE overlay
# draws the same kind of chart.  The bin count is fixed so that extreme
# revenue values cannot inflate an automatically chosen number of bins.
sns.histplot(sales_train['revenue'], bins=50, stat='density', kde=True, color='blue')
plt.title('Distribution of Revenue')
# The x axis carries revenue values and the y axis their density.
plt.xlabel('Revenue')
plt.ylabel('Density')
plt.show()

### Finding outliers

In [None]:
# Box plot of the daily unit counts, used to spot extreme values.
plt.figure()
sns.boxplot(x=sales_train['item_cnt_day'])

In [None]:
# Box plot of the item prices, used to spot extreme values.
sns.boxplot(x=sales_train['item_price'])

In [None]:
# Keep only the rows whose price is below 100000 and whose daily count is
# below 1001; everything else is treated as an outlier.
sales_train = sales_train[
    (sales_train['item_price'] < 100000) & (sales_train['item_cnt_day'] < 1001)
]

## Generating Required Features Set

In [None]:
# Show the column labels and the (rows, columns) shape after filtering.
(sales_train.columns, sales_train.shape)

In [None]:
# making a dataset with only monthly sales data
# The 'YYYY-MM' month key is built with the vectorised .dt.strftime (much
# faster than a row-wise apply) and replaces the datetime `date` column before
# grouping, so the sum only ever sees numeric columns and the key does not
# clash with an existing `date` column when the index is reset.
dataset = (
    sales_train.assign(date=sales_train['date'].dt.strftime('%Y-%m'))
    .groupby(['date', 'item_id', 'shop_id'])
    .sum()
    .reset_index()
)
dataset.head(), dataset.shape

In [None]:
# Keep only the month key, the item/shop ids and the unit count, then spread
# the months into columns: one row per (item_id, shop_id) pair, one column per
# month, with 0 where a pair has no entry for that month.
dataset = (
    dataset[['date', 'item_id', 'shop_id', 'item_cnt_day']]
    .pivot_table(
        values='item_cnt_day',
        index=['item_id', 'shop_id'],
        columns='date',
        fill_value=0,
    )
    .reset_index()
)
dataset.head(), dataset.shape

In [None]:
# Attach the monthly sales history to every test row.  The left join keeps
# all test rows; pairs with no history come back as NaN and are set to 0.
testset = test.merge(dataset, how='left', on=['item_id', 'shop_id']).fillna(0)

# checking the dataset
testset.head(), testset.shape

In [None]:
# now let's create the actual training data
# train_test_split was used without ever being imported in this notebook.
from sklearn.model_selection import train_test_split

# Features: every monthly column except '2015-10'; target: the '2015-10' column.
x = dataset.drop(['2015-10', 'item_id', 'shop_id'], axis = 1)
y = dataset['2015-10']

# A fixed random_state makes the split (and every score below) reproducible.
x_train, x_val, y_train , y_val = train_test_split(x, y, test_size = 0.2, random_state = 42)
# deleting the first column so that it can predict the future sales data
x_test = testset.drop(['2013-01', 'item_id', 'shop_id', 'ID'], axis = 1)
# x_test carries the monthly columns shifted one month later, so its labels
# differ from x_train's.  Estimators fitted on a DataFrame check column names
# at predict time, so give x_test the training labels (matched by position).
# Assumes both frames hold the same number of monthly columns; the assignment
# raises if they do not.
x_test.columns = x.columns

# checking the shapes of the datasets
print("Shape of x_train :", x_train.shape)
print("Shape of x_test :", x_test.shape)
print("Shape of y_train :", y_train.shape)

## 1. Multiple Linear Regression

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the model features only: the scaler is fitted on the training
# features and the same transformation is applied to the test features.
# `dataset` is deliberately left untouched — overwriting it with a scaled
# array would also standardise the id columns and the target, and would stop
# the earlier cells from being re-run.
# NOTE(review): the models in the following cells are still fitted on the
# unscaled x_train / x_test; switch them to the *_scaled arrays if scaling is wanted.
feature_scaler = StandardScaler()
x_train_scaled = feature_scaler.fit_transform(x_train.to_numpy())
x_test_scaled = feature_scaler.transform(x_test.to_numpy())

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score


In [None]:
from sklearn.metrics import mean_squared_error

lm = LinearRegression()
# 5-fold cross-validation.  The scorer reports negated MSE, so the sign is
# flipped back before taking the root of the mean.
all_accuracies = -cross_val_score(lm, x_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(np.sqrt(np.mean(all_accuracies)))
# Refit on the full training split and report the in-sample RMSE.
yx = lm.fit(x_train, y_train).predict(x_train)
np.sqrt(mean_squared_error(y_train, yx))

In [None]:
# 5-fold cross-validation with the estimator's default scorer; print the mean over folds.
all_accuracies = cross_val_score(lm, x_train, y_train, cv=5)
print(np.mean(all_accuracies))

In [None]:
# Fit on the training split and predict the test rows, limiting the
# predictions to the range [0, 20].
yhat = np.clip(lm.fit(x_train, y_train).predict(x_test), 0, 20)
yhat

In [None]:
# One-column frame of predictions whose index is labelled 'ID'.
preds = pd.DataFrame({'item_cnt_month': yhat}).rename_axis('ID')
preds

## 2. Polynomial Regression

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

# Expand the training features with degree-2 polynomial terms.
poly = PolynomialFeatures(degree=2)
train_x_poly = poly.fit_transform(x_train)

clf = linear_model.LinearRegression()
# 5-fold cross-validated RMSE (the scorer reports negated MSE).
all_accuracies = -cross_val_score(clf, train_x_poly, y_train, cv=5, scoring='neg_mean_squared_error')
print(np.sqrt(np.mean(all_accuracies)))
# fit() hands back the estimator, so train_y_ is the fitted model itself.
train_y_ = clf.fit(train_x_poly, y_train)
yx = train_y_.predict(train_x_poly)
# In-sample RMSE.
np.sqrt(mean_squared_error(y_train, yx))

In [None]:
# Expand the test features and predict, limiting predictions to [0, 20].
# NOTE(review): the expansion is re-fitted on x_test here; transform() alone
# would be the usual call.  Presumably the result is the same because the
# degree-2 expansion depends only on the number of input columns — confirm.
x_test_poly = poly.fit_transform(x_test)
yhat = np.clip(train_y_.predict(x_test_poly), 0, 20)
yhat

In [None]:
# One-column frame of predictions whose index is labelled 'ID'.
preds = pd.Series(yhat, name='item_cnt_month').to_frame()
preds.index = preds.index.rename('ID')

## 3. Ridge Regression (L2 regularization)

In [None]:
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

# Ridge (L2-penalised) regression with penalty strength alpha = 0.1.
RidgeModel = Ridge(alpha=0.1)

# Degree-2 polynomial expansion of the training features, then of the test features.
poly = PolynomialFeatures(degree=2)
train_x_poly, x_test_poly = (poly.fit_transform(frame) for frame in (x_train, x_test))

In [None]:
# Fit ridge regression on the polynomial features.
RidgeModel.fit(train_x_poly, y_train)
# Clip the test predictions to [0, 20] like the linear, polynomial and
# random-forest cells of this notebook do; these values are the ones written
# to submission.csv in the next cell.
yhat = RidgeModel.predict(x_test_poly).clip(0, 20)
# In-sample RMSE (computed on unclipped training predictions).
y_tr = RidgeModel.predict(train_x_poly)
mse = mean_squared_error(y_train, y_tr)
np.sqrt(mse)

In [None]:
# Write the current predictions as a CSV with an 'ID' index column and an
# 'item_cnt_month' value column.
preds = pd.DataFrame({'item_cnt_month': yhat}).rename_axis('ID')
preds.to_csv('/kaggle/working/submission.csv')

In [None]:
# Refit the ridge model on the raw (non-polynomial) features.
RidgeModel.fit(x_train, y_train)
# Clip the test predictions to [0, 20] for consistency with the linear,
# polynomial and random-forest cells of this notebook.
yhat = RidgeModel.predict(x_test).clip(0, 20)
# In-sample RMSE (computed on unclipped training predictions).
y_tr = RidgeModel.predict(x_train)
mse = mean_squared_error(y_train, y_tr)
np.sqrt(mse)

## 4. Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Lasso (L1-penalised) regression with penalty strength alpha = 0.1.
ls = Lasso(alpha = 0.1)
ls.fit(x_train, y_train)
# Clip the test predictions to [0, 20] for consistency with the linear,
# polynomial and random-forest cells of this notebook.
yhat = ls.predict(x_test).clip(0, 20)
# In-sample RMSE (computed on unclipped training predictions).
y_tr = ls.predict(x_train)
mse = mean_squared_error(y_train, y_tr)
np.sqrt(mse)

In [None]:
# One-column frame of predictions whose index is labelled 'ID'.
preds = pd.Series(yhat, name='item_cnt_month').to_frame().rename_axis('ID')
preds

## 5. Elastic Net Regression

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic net regression with the library's default hyper-parameters.
en = ElasticNet()
en.fit(x_train, y_train)
# Clip the test predictions to [0, 20] for consistency with the linear,
# polynomial and random-forest cells of this notebook.
yhat = en.predict(x_test).clip(0, 20)
# In-sample RMSE (computed on unclipped training predictions).
y_tr = en.predict(x_train)
mse = mean_squared_error(y_train, y_tr)
np.sqrt(mse)

In [None]:
# One-column frame of predictions whose index is labelled 'ID'.
preds = pd.DataFrame({'item_cnt_month': yhat})
preds = preds.rename_axis(index='ID')
preds

## 6. Principal Component Regression

In [None]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Principal component regression: project the features onto the leading
# principal component, then fit an ordinary linear regression on that
# projection.  (Fitting PCA alone and calling score() only reports the
# data log-likelihood under the PCA model — it performs no regression.)
pca = PCA(n_components = 1)
train_x_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

pcr = LinearRegression()
pcr.fit(train_x_pca, y_train)
# Clip the test predictions to [0, 20] like the other models in this notebook.
yhat = pcr.predict(x_test_pca).clip(0, 20)
# In-sample RMSE (computed on unclipped training predictions).
y_tr = pcr.predict(train_x_pca)
mse = mean_squared_error(y_train, y_tr)
np.sqrt(mse)

## 7. Support Vector Machine

In [None]:
from sklearn import svm

# The target is a sales count to be regressed, so use the support vector
# regressor.  SVC is a classifier and would treat every distinct count as a
# separate class label.
# NOTE(review): kernel SVM training scales poorly with the number of rows;
# consider fitting on a subsample if this cell takes too long.
sv = svm.SVR(kernel='rbf')
sv.fit(x_train, y_train)
# Clip the test predictions to [0, 20] like the other models in this notebook.
yhat = sv.predict(x_test).clip(0, 20)
# In-sample RMSE (computed on unclipped training predictions).
y_tr = sv.predict(x_train)
mse = mean_squared_error(y_train, y_tr)
np.sqrt(mse)

## 8. Using Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest regressor with the library's default hyper-parameters.
rf = RandomForestRegressor().fit(x_train, y_train)
# Test predictions limited to [0, 20].
yhat = np.clip(rf.predict(x_test), 0, 20)
# In-sample RMSE.
y_tr = rf.predict(x_train)
mse = mean_squared_error(y_train, y_tr)
np.sqrt(mse)

In [None]:
# Predict the test rows (limited to [0, 20]) and write them as a CSV with an
# 'ID' index column and an 'item_cnt_month' value column.
yhat = np.clip(rf.predict(x_test), 0, 20)
preds = pd.DataFrame({'item_cnt_month': yhat}).rename_axis('ID')
preds.to_csv('/kaggle/working/randomforest.csv')
preds