In [None]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import warnings
from matplotlib import pylab as plt
from statsmodels.graphics.gofplots import qqplot
from IPython.core.interactiveshell import InteractiveShell

#### Merge the 12 months of sales data into a single CSV file

In [None]:
# Collect the monthly CSV files in the data folder. The extension filter skips
# stray entries (e.g. checkpoint folders), and sorting makes the row order of the
# merged file reproducible because os.listdir returns names in arbitrary order.
files = sorted(file for file in os.listdir('../data') if file.lower().endswith('.csv'))
# Read every file once and concatenate in a single call; calling pd.concat inside
# the loop re-copies all previously accumulated rows on each iteration.
# NOTE: pd.concat raises ValueError if the folder contains no CSV files.
all_months_data = pd.concat(
    [pd.read_csv(os.path.join('../data', file)) for file in files]
)
# export all data to csv (the per-file row index is meaningless, so drop it)
all_months_data.to_csv("all_data.csv", index=False)

### Read in updated DataFrame
###### Let's see the data and how it looks.

In [None]:
# Reload the merged file written by the previous cell so the analysis starts
# from the on-disk artefact rather than from the in-memory concat result.
sales_data = pd.read_csv('all_data.csv') # read data
sales_data # show data (pandas truncates the display of long frames)

In [None]:
# Bare string used as a label; it is a no-op expression statement.
# NOTE(review): it is only echoed if the notebook is configured to display
# every expression in a cell - confirm.
"Head"
sales_data.head() # Checking the first 5 rows of data

In [None]:
# Bare string used as a label; it is a no-op expression statement.
"Tail"
sales_data.tail() # Checking the last 5 rows of data

### Data Preprocessing
Data preprocessing can refer to manipulation or dropping of data before it is used in order to ensure or
enhance performance, and is an important step in the data mining process. The phrase "garbage in, garbage out" 
is particularly applicable to data mining and machine learning projects.

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes
sales_data.info()

### Uniqueness Categorical Variables
Let's have a look at the categorical variables: how many unique values does each of them have?

In [None]:
# Names of the columns stored as category / object (string) dtype
categorical = sales_data.select_dtypes(['category', 'object']).columns
# Report how many distinct values each of those columns holds
for col in categorical:
    n_unique = sales_data[col].nunique()
    print(f'{col} : {n_unique} unique value(s)')

### How many missing data points do we have?
Ok, now we know that we do have some missing values. Let's see how many we have in each column.

In [None]:
# Count the missing (NaN) entries in every column
missing_values_count = sales_data.isna().sum()
# Show the counts for at most the first ten columns
missing_values_count.iloc[:10]

It might be helpful to see what percentage of the values in our dataset were missing to give us 
a better sense of the scale of this problem:

In [None]:
# Total number of cells in the frame (rows x columns).
# np.prod replaces np.product, which is deprecated and removed in NumPy 2.0.
total_cells = np.prod(sales_data.shape)
# Total number of missing cells, summed over the per-column counts computed above
total_missing = missing_values_count.sum()
# percent of data that is missing
percent_missing = (total_missing / total_cells) * 100
print(f"{percent_missing:.2f}%")

### Clean up the Data!
The first step in this is figuring out what we need to clean. I have found in practice, 
that you find things you need to clean as you perform operations and get errors. 
Based on the error, you decide how you should go about cleaning the data.

In [None]:
# Drop the rows in which every column is NaN.
sales_data = sales_data.dropna(how='all')
# Sanity check: list any rows that still contain at least one NaN.
"NaN Value:"
sales_data[sales_data.isna().any(axis=1)]
# Remove rows whose 'Order Date' starts with 'Or'. These are presumably header
# lines repeated inside the merged file (verify against the raw CSVs); they are
# what makes numeric conversion fail with
# "ValueError: invalid literal for int() with base 10: 'Or'".
"Clean Future Warnings:"
# .copy() turns the filtered result into an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
sales_data = sales_data[sales_data['Order Date'].str[0:2] != 'Or'].copy()
sales_data

### Convert Quantity Ordered column and Price Each column
Let's convert the Quantity Ordered column and the Price Each column to numeric types, 
because we will add some new features later, and for that we 
need to multiply these two columns.

In [None]:
# Cast the quantity to 64-bit integers ...
sales_data['Quantity Ordered'] = sales_data['Quantity Ordered'].astype('int64')
# ... and the unit price to floating point, so the two can be multiplied later
sales_data['Price Each'] = sales_data['Price Each'].astype('float')
# Confirm the new dtypes
sales_data.info()

### Convert Order Date column
And let's convert Order Date column too, so we can take the Year, Month, and the other date easily.

In [None]:
# Parse the 'Order Date' values into datetimes with pd.to_datetime so the
# .dt accessor (year, month, hour, ...) can be used on the column later.
sales_data['Order Date'] = pd.to_datetime(sales_data['Order Date'])
# let's see it
sales_data

### Recap Data
Here are the key points we have so far.

We have a total of  186850  records and  6  columns of categorical type

The share of missing values is  0.29167 %

Order ID :  178438  unique value(s)

Product :  20  unique value(s)

Quantity Ordered :  10  unique value(s)

Price Each :  24  unique value(s)

Order Date :  142396  unique value(s)

Purchase Address :  140788  unique value(s)


#### Next, we will try to do some exploration and visualization. But we need to do some Data Preparation first.

### Data Preparation
Data preparation is the act of manipulating raw data into a form that can 
readily and accurately be analysed, e.g. for business purposes. Data Preparation 
is a pre-processing step in which data from one or more sources is cleaned and transformed 
to improve its quality prior to its use in business analytics.


### Add Year, Month, Hour, Minute, Sales, and Cities Columns

In [None]:
def augment_data(data):
    """
    Add date-part, revenue and location features to the sales data.

    The new columns are written onto ``data`` itself and the same object is
    returned, so callers may use either the return value or the frame they
    passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Order Date' (datetime), 'Quantity Ordered' and
        'Price Each' (numeric) and 'Purchase Address'. The address is parsed
        as comma-separated "<street>, <city>, <state> <zip>".

    Returns
    -------
    pandas.DataFrame
        ``data`` with the extra columns 'Year', 'Month', 'Hour', 'Minute',
        'Sales' and 'Cities'.
    """

    # Date parts taken from the order timestamp
    order_date = data['Order Date']
    data['Year'] = order_date.dt.year
    data['Month'] = order_date.dt.month
    data['Hour'] = order_date.dt.hour
    data['Minute'] = order_date.dt.minute

    # Revenue of each order line: quantity times unit price
    data['Sales'] = data['Quantity Ordered'] * data['Price Each']

    # City and state come from the purchase address. Vectorised string methods
    # propagate NaN for a missing address instead of raising like a per-row
    # address.split() would.
    address_parts = data['Purchase Address'].str.split(',')
    # strip() removes the blank that follows the comma, which previously leaked
    # into the label as " Dallas (TX)".
    city = address_parts.str[1].str.strip()
    # The third field is "<state> <zip>"; keep only the state code.
    state = address_parts.str[2].str.split().str[0]
    # The state is appended because the same city name can occur in two states.
    data['Cities'] = city + ' (' + state + ')'

    return data # returning data

# Add the engineered columns and preview the first rows.
# NOTE(review): augment_data writes the new columns onto the frame it is given,
# so `sales_data` is modified by this call as well as rebound to the result.
sales_data = augment_data(sales_data)
sales_data.head()

### Data Analysis
Data Analysis is the process of systematically applying statistical and/or 
logical techniques to describe and illustrate, condense and recap, and evaluate data. 
Indeed, researchers generally analyze for patterns in observations through the entire
data collection phase (Savenye, Robinson,  2004 ).
In practice, this means analyzing and investigating data sets and summarizing their main characteristics, often employing
data visualization methods.

Put more simply: in Data Analysis we (Data Scientists or Data Analysts, whatever you 
want to call us) look for the correlations and relationships 
between the variables (features and labels), applying statistical 
and visualization methods to find patterns.

In [None]:
sns.set_style("whitegrid") # set the seaborn style
# Correlation matrix of `sales_data`. Only the numeric columns are kept:
# DataFrame.corr() raises on string/datetime columns in pandas >= 2.0
# (older versions dropped them silently), so the selection is made explicit.
plt.figure(figsize=(24, 18)) # figure the size
sns.heatmap(
    sales_data.select_dtypes(include=[np.number]).corr(), # pairwise Pearson correlation
    annot=True # print the coefficient in every cell
)
plt.title("Sales Data Correlation", weight="bold", fontsize=35, pad=30) # title
plt.xticks(weight="bold", fontsize=15) # x-ticks
plt.yticks(weight="bold", fontsize=15); # y-ticks

In [None]:
# Correlation of every numeric column of `sales_data` with 'Sales'.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on them in pandas >= 2.0.
(sales_data.select_dtypes(include=[np.number]) # numeric columns only
           .corr()['Sales'] # correlation of each column with Sales
           .sort_values(ascending=False) # strongest positive correlation first
           .to_frame() # change it into data frame
           .T) # transpose it into a single wide row

In [None]:
# Summary statistics restricted to the numeric columns
# (include=[np.number] leaves out the object / string columns)
sales_data_numeric = sales_data.describe(include=[np.number]) 
# Bare string used as a label; it is a no-op expression statement.
"Statistical Measure of Sales Data in Numeric Data"
sales_data_numeric

In [None]:
# Summary statistics of the non-numeric columns (strings and datetimes).
# `datetime_is_numeric` exists only in pandas 1.1-1.5; pandas 2.0 removed the
# flag (datetimes are always summarised numerically there) and raises TypeError
# if it is passed, so fall back to the plain call in that case.
try:
    sales_data_object = sales_data.describe(exclude=[np.number], datetime_is_numeric=True)
except TypeError:
    sales_data_object = sales_data.describe(exclude=[np.number])
"Statistical Measure of Sales Data in Object / Str Data"
sales_data_object

### Univariate Analysis
Univariate analysis is perhaps the simplest form of statistical analysis. Like other forms of statistics, 
it can be inferential or descriptive. The key fact is that only one variable is involved. 
Univariate analysis can yield misleading results in cases in which multivariate analysis is more appropriate.

In [None]:
# checking and visualizing the type of distribution of a feature column
def univariate_analysis(data, color, title1, title2):

    """
    Draw a histogram with a KDE overlay next to a Q-Q plot of one variable.

    Parameters
    ----------
    data : Series or 1-D array-like
        The numeric observations of the single variable to inspect.
    color : matplotlib color
        Color used for the histogram and its KDE curve.
    title1 : str
        Title of the left panel (the distribution plot).
    title2 : str
        Title of the right panel (the quantile plot from statsmodels).

    Returns
    -------
    fig : matplotlib figure
        Returns the Figure object with both plots drawn onto it.
    """

    # One row, two panels: distribution on the left, Q-Q plot on the right
    fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(20, 6))

    # sns.distplot is deprecated. histplot with a density-normalised histogram
    # and a KDE overlay is the replacement; cut=3 lets the KDE curve extend past
    # the extreme data points as distplot did.
    sns.histplot(
        data,
        ax=ax1,
        kde=True,
        stat="density",
        kde_kws={"cut": 3},
        color=color
    )
    ax1.set_title(title1, weight="bold", fontsize=25, pad=30)

    # Q-Q plot of the sample quantiles; line='s' adds a standardized
    # reference line.
    qqplot(data, ax=ax2, line='s')
    ax2.set_title(title2, weight="bold", fontsize=25, pad=30)

    return fig # returning the figure

Let's try to find the proportion of the data that lies within two standard deviations ($\sigma$) of the mean ($\mu$) using Chebyshev's Theorem, and let's try to interpret it...

Chebychev's Theorem

$$ \begin{aligned} 1 - \frac{1}{k^2}: \quad k = 2 &\Rightarrow 1 - \frac{1}{2^2} = \frac{3}{4} = 75\% \\ k = 3 &\Rightarrow 1 - \frac{1}{3^2} = \frac{8}{9} \approx 88.9\% \end{aligned} $$
How to find Standard Deviation ($\sigma$)?

Here's the Formula:

$$ \begin{aligned} \sigma &= \sqrt{\sigma^2} = \sqrt{\frac{\sum{(x - \mu)^2}}{N}} \\ s &= \sqrt{s^2} = \sqrt{\frac{\sum{(x - \bar{x})^2}}{n - 1}} \end{aligned} $$
How to find Mean ($\mu$)?

Here's the Formula:

$$ \begin{aligned} \mu = \frac{\sum{x}}{N} \\ \bar{x} = \frac{\sum{x}}{n} \end{aligned} $$

In [None]:
# Distribution and quantile plot of the 'Quantity Ordered' column
quantity_plot_options = dict(
    color='red',
    title1='Quantity Ordered Data Distribution',
    title2='Quantile Plot',
)
univariate_analysis(data=sales_data['Quantity Ordered'], **quantity_plot_options);

Here we can see that customers most often buy $1$ item/product per order; there are also a few customers who buy $2$ or $4$ items/products at once, and more than that is very rare.

Quantity Ordered
Find the proportion that lies within two standard deviations ($\sigma$) of the mean ($\mu$), and let's try to interpret it. In the Quantity Ordered Data, $\mu = 1.12$ and $\sigma = 0.44$, so without further ado let's calculate it.

Calculation:
$1.12 - 2(0.44) = 0.24$
$1.12 + 2(0.44) = 2.00$
Interpretation:
At least $75\%$ of the Sales Data Quantity Ordered population in the USA has a Quantity Ordered range from $0 - 2$ item/product.

In [None]:
# Distribution and quantile plot of the 'Price Each' column
price_plot_options = dict(
    color='blue',
    title1='Price Each Data Distribution',
    title2='Quantile Plot',
)
univariate_analysis(data=sales_data['Price Each'], **price_plot_options);

### Price Each 

Find the proportion that lies within two standard deviations ($\sigma$) of the mean ($\mu$), and let's try to interpret it. In the Price Each Data, $\mu = 184.3$ and $\sigma = 332.7$, so without further ado let's calculate it.

Calculation:
$184.3 - 2(332.7) = -481$
$184.3 + 2(332.7) = 849.7$
Interpretation:
At least $75\%$ of the population Sales Price data for each item/product in the USA has a price range for each item/product from $0 - 849.7$ (USD).

In [None]:
# Distribution and quantile plot of the 'Sales' column
sales_plot_options = dict(
    color='black',
    title1='Sales Data Distribution',
    title2='Quantile Plot',
)
univariate_analysis(data=sales_data['Sales'], **sales_plot_options);

### Sales
Find the proportion that lies within two standard deviations ($\sigma$) of the mean ($\mu$), and let's try to interpret it. In the Sales Data, $\mu = 185.4$ and $\sigma = 332.9$, so without further ado let's calculate it.

Calculation:
$185.4 - 2(332.9) = -480$
$185.4 + 2(332.9) = 851.19$
Interpretation:
At least $75\%$ of the population of Sales Data customers in the USA have Sales in the range $0 - 851.19$ (USD); the negative lower bound is replaced by $0$ because sales cannot be negative.

In [None]:
# Skewness of every numeric column, most right-skewed first.
# Rule of thumb: a value between -0.5 and 0.5 means the distribution is roughly
# symmetric (which by itself does not prove normality); outside that it is skewed.
# numeric_only=True is required because DataFrame.skew() raises on string and
# datetime columns in pandas >= 2.0.
skew_value = sales_data.skew(numeric_only=True).sort_values(ascending=False).to_frame()
skew_value

It can be seen that most of the columns we have are roughly symmetric (skewness between $-0.5$ and $0.5$, close to what a normal distribution would give), and two columns are skewed.

### Task:

In [None]:
# Total revenue per calendar year. The chart asks how much was earned and its
# y-axis is labelled in USD, so 'Sales' is summed per year; a count plot would
# show the number of order rows instead.
sales_by_year = sales_data.groupby('Year')['Sales'].sum()
# let's plot it
plt.figure(figsize=(24, 10)) # figuring the size
# makes bar plot
sns.barplot(
    x=sales_by_year.index, # x-axis: year
    y=sales_by_year.values # y-axis: summed sales
)
plt.title( # title
    "What was the best Year for sales? How much was earned that Year?", 
    weight="bold", # weight
    fontsize=35, # font-size
    pad=30 # padding
)
plt.xlabel( # x-label
    "Years", 
    weight="bold", # weight
    color="purple", # color
    fontsize=25, # font-size
    loc="center" # location
)
plt.xticks( # x-ticks
    weight="bold", # weight
    fontsize=15 # font-size
)
plt.ylabel( # y-label
    "Sales in USD ($)", 
    weight="bold", # weight
    color="green", # color
    fontsize=20 # font-size
)
plt.yticks( # y-ticks
    weight="bold", # weight 
    fontsize=15 # font-size
);

### Answer:
When viewed from the data above,  2019  was the best year that had the highest number of sales,
which was  $34,483,365 , compared to  2020  which only had  $8,670  in sales, this is due to the lack of 
data in  2020  which caused a data imbalance.

In [None]:
# Sum every numeric column per month. numeric_only=True keeps the string and
# datetime columns out of the aggregation; on pandas >= 2.0 they are no longer
# dropped automatically and would make sum() / astype('int') fail.
sum_of_month_and_earned = sales_data.groupby('Month').sum(numeric_only=True).astype('int')
# let's plot it
plt.figure(figsize=(24, 14)) # figuring the size
# makes bar plot 
sns.barplot( # barplot
    x=sum_of_month_and_earned.index, # x-axis: month number
    y=sum_of_month_and_earned["Sales"], # y-axis: total sales of the month
    palette="deep" # palette
)
plt.title( # title
    "What was the best month for sales? How much was earned that month?", 
    weight="bold", # weight
    fontsize=35, # font-size
    pad=30 # padding
)
plt.xlabel( # x-label
    "Months", 
    weight="bold", # weight
    color="purple", # color
    fontsize=25, # font-size
    loc="center" # location
)
plt.xticks( # x-ticks
    weight="bold", # weight
    fontsize=15 # font-size
)
plt.ylabel( # y-label
    "Sales in USD ($)", 
    weight="bold", # weight
    color="green", # color
    fontsize=20 # font-size
)
plt.yticks( # y-ticks
    weight="bold", # weight 
    fontsize=15 # font-size
);

### Answer:
The best month for sales shown in the visualization above is December, which has record sales reaching  $4,613,443 . This may be because Christmas is in December, when many people buy groceries to make cakes, or toys as gifts for loved ones.

In [None]:
# Sum every numeric column per city. numeric_only=True keeps the string and
# datetime columns out of the aggregation; on pandas >= 2.0 they are no longer
# dropped automatically and would make sum() / astype('int') fail.
highest_number_of_sales = sales_data.groupby('Cities').sum(numeric_only=True).astype('int')
# let's plot it
plt.figure(figsize=(24, 14)) # figuring the size
# makes bar plot 
sns.barplot( # barplot
    x=highest_number_of_sales.index, # x-axis: city label
    y=highest_number_of_sales["Sales"], # y-axis: total sales of the city
    palette="deep" # palette
)
plt.title( # title
    "What City had the highest number of Sales?", 
    weight="bold", # weight
    fontsize=35, # font-size
    pad=30 # padding
)
plt.xlabel( # x-label
    "Cities", 
    weight="bold", # weight
    color="purple", # color
    fontsize=25, # font-size
    loc="center" # location
)
plt.xticks( # x-ticks
    weight="bold", # weight
    fontsize=15, # font-size
    rotation=10 # tilt the long city labels so they do not overlap
)
plt.ylabel( # y-label
    "Sales in USD ($)", 
    weight="bold", # weight
    color="green", # color
    fontsize=20 # font-size
)
plt.yticks( # y-ticks
    weight="bold", # weight 
    fontsize=15 # font-size
);

### Answer:
The city that has the most sales data in the above visualization is San Francisco,
with total sales reaching  $8,262,203 .

In [None]:
# Number of order rows placed in each hour of the day. size() gives one value
# per hour; groupby(...).count() returns one count per column, which made
# plt.plot draw a separate, overlapping line for every column.
orders_per_hour = sales_data.groupby('Hour').size()
# let's prepare the value for the x-axis (the distinct hours, ascending)
hours = list(orders_per_hour.index)
# let's plot it
plt.figure(figsize=(24, 10)) # figuring the size
# makes line plot 
plt.plot( # plot
    hours, # x-axis
    orders_per_hour.values # y-axis: orders in that hour
)
# let's add grid
plt.grid(True)
plt.title( # title
    "What time should we display adverstisement to maximize likelihood of customer's buying product?", 
    weight="bold", # weight
    fontsize=35, # font-size
    pad=30
)
plt.xlabel( # x-label
    "Hours", 
    weight="bold", # weight
    color="purple", # color
    fontsize=25, # font-size
    loc="center" # location
)
plt.xticks( # x-ticks
    ticks=hours, # one tick per hour
    weight="bold", # weight
    fontsize=15 # font-size
)
plt.ylabel( # y-label
    "Number of Orders", 
    weight="bold", # weight
    color="black", # color
    fontsize=20 # font-size
)
plt.yticks( # y-ticks
    weight="bold", # weight 
    fontsize=15 # font-size
);