In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every FutureWarning for the rest of the session. This also hides
# library deprecation notices, so lift the filter when upgrading dependencies.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the training table, using its "id" column as the row index.
train_data = pd.read_csv(
    'C:\\Users\\Owner\\Desktop\\Proga\\REPOS\\mentorship_EPAM\\data\\train.csv',
    index_col="id",
)
train_data.head()

In [None]:
# Display the last rows of the training table.
train_data.tail()

In [None]:
# Print the column dtypes, non-null counts and memory footprint.
train_data.info()

In [None]:
# Lower-case the product family names so they are easier to read.
train_data["family"] = train_data["family"].str.lower()

# The "count" row is dropped to get rid of the exponential number formatting,
# and the 0.875 percentile is added to estimate roughly how many rows have
# no items on promotion.
train_data.describe(percentiles=[0.25, 0.375, 0.5, 0.75, 0.875]).drop(index="count")

In [None]:
# Summary of the categorical "family" column (count, unique, top, freq).
train_data["family"].describe()

<i>The only categorical feature in this table, "family", has just 33 unique values (not that many). <br>
That means we can easily use one-hot encoding during model training.</i>

In [None]:
# Distinct product families; the plotting cells below iterate over this array.
family_values = train_data["family"].unique()
family_values

In [None]:
# Scatter plot of the mean daily sales (averaged over all stores) for every
# product family, one subplot per family on a three-column grid.
plt.figure(figsize=(20, 40))
n_rows = -(-len(family_values) // 3)  # ceiling division: a partial last row still fits
for i, value in enumerate(family_values):
    plt.subplot(n_rows, 3, i + 1)
    # Select "sales" before aggregating: averaging the whole frame fails on
    # the string "family" column in pandas >= 2.0.
    daily_sales = train_data.loc[train_data["family"] == value].groupby("date")["sales"].mean()
    plt.scatter(x=np.arange(len(daily_sales)), y=daily_sales)
    # The limit setters must be CALLED. Assigning to plt.xlim / plt.ylim
    # overwrites the pyplot functions and leaves the axes untouched. If an
    # older version of this cell already ran in the current kernel, restart
    # the kernel to restore the functions.
    plt.xlim(0, 1750)
    plt.ylim(bottom=0)
    plt.title(value)
plt.show()

<i>Actually, these plots don't help us much: looking at them, we can only catch some common "trends" that describe global changes in the sales of some products.<br>For example, we can be fairly sure that on the 1st of January every year the number of sales is equal to 0 (this still needs to be checked, but I think it is obvious). <br> We can also say that the number of sales in general has a positive dynamic (for example, 'automotive', 'bread/bakery', 'grocery i', 'personal care', ...).<br>Some product types have a negative dynamic (such as 'lingerie').<br>Some goods have a very interesting sales distribution ('books', 'produce', 'frozen foods', 'ladieswear', ...), and we can't say right now what the reason for that is.<br>Also, some families show a seasonal increase in sales ('school and office supplies', 'liquor, wine, beer', 'grocery ii', 'frozen foods'). </i>

In [None]:
# For every product family, plot the mean daily sales (faint line) together
# with a centred 365-day moving average (bold line) showing the long-term trend.
plt.figure(figsize=(20, 110))
n_rows = -(-len(family_values) // 2)  # ceiling division instead of a hard-coded 17
for i, family_value in enumerate(family_values):
    ax = plt.subplot(n_rows, 2, i + 1)
    # Select "sales" before aggregating: averaging the whole frame fails on
    # the string "family" column in pandas >= 2.0.
    average_sales = (
        train_data.loc[train_data['family'] == family_value]
        .groupby('date')['sales']
        .mean()
    )
    # min_periods=183 lets the window produce a value once roughly half a year
    # of data is available, so the trend also covers the edges of the series.
    trend = average_sales.rolling(window=365, center=True, min_periods=183).mean()
    average_sales.plot(ax=ax, alpha=0.5)
    trend.plot(ax=ax, linewidth=3)
    ax.set_title(family_value)
plt.show()

In [None]:
# Distribution (histogram + density estimate) of the mean daily sales for
# every product family, one subplot per family on a three-column grid.
plt.figure(figsize=(20, 70))
n_rows = -(-len(family_values) // 3)  # ceiling division: a partial last row still fits
for i, value in enumerate(family_values):
    plt.subplot(n_rows, 3, i + 1)
    # Select "sales" before aggregating: averaging the whole frame fails on
    # the string "family" column in pandas >= 2.0.
    daily_sales = train_data.loc[train_data['family'] == value].groupby('date')['sales'].mean()
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True) once the seaborn version is confirmed.
    sns.distplot(daily_sales, color='g', bins=100, hist_kws={'alpha': 0.4})
    plt.title(value)
plt.show()

<i>These plots show that, roughly speaking, the sales distributions of all families fall into two groups: <br>1) Normal or close to normal (such as 'automotive', 'bread/bakery', 'cleaning', 'eggs', 'grocery', 'lingerie', ...). Interestingly, most of the distributions in this group are right-skewed (the skewness coefficient is positive). The proof is below.<br>2) Distributions where most of the density is concentrated at or near zero. The rest of the data is distributed differently (some families, such as 'home and kitchen i' and 'ladieswear', have something like a normal distribution). This suggests that such categories of goods aren't essential for people, which is why the number of sales per day is mostly equal to 0.</i>

In [None]:
# Print the skewness of the mean daily sales for every product family
# (a positive value means a longer right tail).
for value in family_values:
    # Select "sales" before aggregating: averaging the whole frame fails on
    # the string "family" column in pandas >= 2.0.
    daily_sales = train_data.loc[train_data['family'] == value].groupby('date')['sales'].mean()
    print(value, ': ', round(daily_sales.skew(), 2))

In [None]:
# Correlation matrix of the columns that remain once the date, store and
# family identifiers are removed.
sales_and_onpromotion_data = train_data.drop(columns=['date', 'store_nbr', 'family'])
sales_and_onpromotion_data.corr()

In [None]:
# Mean daily sales against the mean daily number of promoted items,
# one subplot per product family.
plt.figure(figsize=(20, 40))
n_rows = -(-len(family_values) // 3)  # ceiling division: a partial last row still fits
for i, value in enumerate(family_values):
    plt.subplot(n_rows, 3, i + 1)
    # Aggregate only the two plotted columns: averaging the whole frame fails
    # on the string "family" column in pandas >= 2.0.
    daily_means = (
        train_data.loc[train_data["family"] == value]
        .groupby("date")[["onpromotion", "sales"]]
        .mean()
    )
    plt.scatter(x='onpromotion', y='sales', data=daily_means)
    plt.xlabel('onpromotion')
    plt.ylabel('sales')
    plt.title(value)
plt.show()

<i>It becomes clear that promotions have a fairly strong influence on INCREASING sales (in general, across all the data).<br>As we can see in the plots above, most of the 'family' values (but not all of them!) confirm this. Interestingly, 'books' didn't have any promotions during the whole observation period.<br>Moreover, correlation is sensitive to outliers, so this coefficient may not be accurate.<br>Nevertheless, the 'onpromotion' feature is useful for the predictions.</i>

In [None]:
# Load the store metadata table.
stores_data = pd.read_csv(
    'C:\\Users\\Owner\\Desktop\\Proga\\REPOS\\mentorship_EPAM\\data\\stores.csv'
)
stores_data.head()

In [None]:
# Print the column dtypes and non-null counts of the store metadata.
stores_data.info()

<i>The store id by itself doesn't help with sales prediction, so it is necessary to replace it with the corresponding information about the store.</i>

In [None]:
# Attach the store attributes to every sales row, then discard the store
# identifier that was used as the join key.
train_data = (
    pd.merge(train_data, stores_data, on="store_nbr", how="left")
    .drop(columns='store_nbr')
)
train_data.head()

In [None]:
# Report how many distinct values each store-level categorical column has.
print("Number of 'city', 'state', 'type' unique values: \n")
for feature in ('city', 'state', 'type'):
    # dropna=False keeps a missing value counted as its own category.
    print(feature, train_data[feature].nunique(dropna=False))

In [None]:
# Number of training rows per city.
plt.figure(figsize=(25, 9))
sns.countplot(data=train_data, x='city', alpha=0.7)

In [None]:
# Number of training rows per state.
plt.figure(figsize=(25, 9))
sns.countplot(data=train_data, x='state', alpha=0.7)

<i>It seems that keeping both the 'state' and 'city' features isn't necessary, so we should delete one of them.<br>I'll drop the 'state' feature, because the 'city' feature gives us more information (there can be several cities in a state).<br>The 'city' feature has only 7 more values than the 'state' feature, so in terms of model complexity there should not be much difference.</i>

In [None]:
# Remove the 'state' column from the training frame.
train_data = train_data.drop(columns='state')
train_data.head()

In [None]:
# Snapshot of the training frame at this point; the per-city, per-type and
# cluster cells further down read from this copy.
train_data_copy = train_data.copy()
train_data_copy.head()

In [None]:
# For every product family, draw the centred 365-day moving average of the
# mean daily sales, one line per city.
plt.figure(figsize=(20, 100))
n_rows = -(-len(family_values) // 2)  # ceiling division instead of a hard-coded 17
for i, family_value in enumerate(family_values):
    ax = plt.subplot(n_rows, 2, i + 1)
    # Filter by family once per subplot rather than once per (family, city) pair.
    family_data = train_data_copy.loc[train_data_copy['family'] == family_value]
    # sort=False keeps the cities in order of first appearance.
    for city, city_data in family_data.groupby('city', sort=False):
        # Select "sales" before aggregating: averaging the whole frame fails
        # on string columns in pandas >= 2.0.
        average_sales = city_data.groupby('date')['sales'].mean()
        trend = average_sales.rolling(window=365, center=True, min_periods=183).mean()
        # Only the trend is drawn. The former invisible (alpha=0) raw-series
        # line consumed a colour-cycle slot and stretched the y-axis.
        trend.plot(ax=ax, linewidth=3, label=city)
    ax.set_title(family_value)
    # Label the lines so the cities can be told apart.
    ax.legend(title='city', fontsize='small', ncol=2)
plt.show()

In [None]:
# Number of training rows per store type.
plt.figure(figsize=(7, 5))
sns.countplot(data=train_data, x='type', alpha=0.7)

In [None]:
# For every product family, draw the centred 365-day moving average of the
# mean daily sales, one line per store type.
plt.figure(figsize=(20, 100))
n_rows = -(-len(family_values) // 2)  # ceiling division instead of a hard-coded 17
for i, family_value in enumerate(family_values):
    ax = plt.subplot(n_rows, 2, i + 1)
    # Filter by family once per subplot rather than once per (family, type) pair.
    family_data = train_data_copy.loc[train_data_copy['family'] == family_value]
    # sort=False keeps the store types in order of first appearance.
    for store_type, type_data in family_data.groupby('type', sort=False):
        # Select "sales" before aggregating: averaging the whole frame fails
        # on string columns in pandas >= 2.0.
        average_sales = type_data.groupby('date')['sales'].mean()
        trend = average_sales.rolling(window=365, center=True, min_periods=183).mean()
        # Label each trend line explicitly. Passing a bare list to
        # plt.legend() assigns labels to artists in creation order, which
        # mislabelled the lines while invisible raw-series lines were
        # interleaved with the trends.
        trend.plot(ax=ax, linewidth=3, label=store_type)
    ax.set_title(family_value)
    ax.legend(title='type')
plt.show()

<i>As we can see, the store type has a great impact on the target (type D is the leader, which is logical, because the plot above shows that type D stores are the most numerous), which is why the 'type' feature is very important.</i>

In [None]:
# Number of training rows per store cluster.
plt.figure(figsize=(7, 5))
sns.countplot(data=train_data, x='cluster', alpha=0.7)

In [None]:
# Correlation matrix of the columns that remain after removing the date,
# family, promotion, city and type columns.
sales_and_cluster_data = train_data_copy.drop(
    columns=['date', 'family', 'onpromotion', 'city', 'type']
)
sales_and_cluster_data.corr()

In [None]:
# Sales of every row against the cluster of the store it belongs to.
# The figure size has to be passed to plt.figure(); assigning to
# `plt.figsize` only creates an unused attribute on the pyplot module.
plt.figure(figsize=(10, 7))
plt.scatter(x='cluster', y='sales', data=sales_and_cluster_data)
plt.xlabel('cluster')
plt.ylabel('sales')
plt.show()

<i>The 'cluster' feature doesn't correlate with the target, and I can't see any dependencies between these two features. But I think this feature can still be useful, because it groups similar stores together.</i>

In [None]:
# Load the oil price table and add a running row counter ("time") that
# serves as a numeric x-axis.
oil_data = pd.read_csv(
    'C:\\Users\\Owner\\Desktop\\Proga\\REPOS\\mentorship_EPAM\\data\\oil.csv'
).assign(time=lambda frame: np.arange(len(frame)))
oil_data.head()

In [None]:
# Print the column dtypes and non-null counts of the oil price table.
oil_data.info()

In [None]:
# Oil price against the running row counter, drawn from a copy of the table.
oil_data_copy = oil_data.copy()
oil_data_copy.plot.scatter(x="time", y="dcoilwtico")

In [None]:
# Attach the oil table columns to every sales row by date; rows with no
# matching date in the oil table get NaN in those columns.
train_data = pd.merge(train_data, oil_data, on='date', how="left")
train_data.head()

In [None]:
# List the columns present after the merge.
train_data.columns

In [None]:
# Preview the frame without the 'time' column. The result is not assigned,
# so train_data itself is left unchanged by this cell.
train_data.drop(columns=['time'])

In [None]:
# Remove the 'time' column from the training frame.
train_data = train_data.drop(columns='time')

In [None]:
# Fill missing oil prices with the column mean.
# The fill value is keyed by column: a bare scalar would be written into the
# NaNs of every column of the frame, not only 'dcoilwtico'.
oil_price_mean = train_data['dcoilwtico'].mean()
train_data_oil = train_data.fillna({'dcoilwtico': oil_price_mean})
train_data_oil.head()

In [None]:
# Correlation matrix of the numeric columns, with missing oil prices left as
# NaN. Non-numeric columns are excluded explicitly because DataFrame.corr()
# raises on string columns in pandas >= 2.0.
train_data.select_dtypes(include='number').corr()

In [None]:
# Correlation matrix of the numeric columns after the mean fill. Non-numeric
# columns are excluded explicitly because DataFrame.corr() raises on string
# columns in pandas >= 2.0.
train_data_oil.select_dtypes(include='number').corr()

In [None]:
# Load the holidays and events table.
holidays_events_data = pd.read_csv(
    'C:\\Users\\Owner\\Desktop\\Proga\\REPOS\\mentorship_EPAM\\data\\holidays_events.csv'
)
holidays_events_data.head()

In [None]:
# Load the transactions table.
transactions_data = pd.read_csv(
    'C:\\Users\\Owner\\Desktop\\Proga\\REPOS\\mentorship_EPAM\\data\\transactions.csv'
)
transactions_data.head()