<a href="https://colab.research.google.com/github/namozhdehi/KSR/blob/main/Store_sales_EDA_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Exploratory Data Analysis**

# Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import glob
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# Load The Data

In [2]:
df=pd.read_csv(r"train.csv")

# Explore The Data

In [3]:
df.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [4]:
df.tail()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
1017204,1111,2,2013-01-01,0,0,0,0,a,1
1017205,1112,2,2013-01-01,0,0,0,0,a,1
1017206,1113,2,2013-01-01,0,0,0,0,a,1
1017207,1114,2,2013-01-01,0,0,0,0,a,1
1017208,1115,2,2013-01-01,0,0,0,0,a,1


In [5]:
df.shape

(1017209, 9)

In [6]:
df.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday'],
      dtype='object')

# **Variable Identification and Typecasting**

In [7]:
df.dtypes

Unnamed: 0,0
Store,int64
DayOfWeek,int64
Date,object
Sales,int64
Customers,int64
Open,int64
Promo,int64
StateHoliday,object
SchoolHoliday,int64


**dtype:** object

There are some variables visible at each types, so let's narrow this down by looking at one datatype at once. We will start with int

# **Integer Data Type**

In [8]:
df.dtypes[df.dtypes == 'int64']

Unnamed: 0,0
Store,int64
DayOfWeek,int64
Sales,int64
Customers,int64
Open,int64
Promo,int64
SchoolHoliday,int64


**dtype:** int

**Summary:**

*   **Store:** Correct data type (integer).
*   **DayOfWeek:** Correct data type (integer).
*   **Sales:** Correct data type (integer).
*   **Customers:** Correct data type (integer).
*   **Open:** Correct data type (integer).
*   **Promo:** Correct data type (integer).
*   **SchoolHoliday:** Correct data type (integer).

In [9]:
df.dtypes[df.dtypes == 'object']

Unnamed: 0,0
Date,object
StateHoliday,object


dtype: object
Summary:

*   **Date** column is expected to be in the format of a string so, it need to be converted to datetime format.

*   **StateHoliday** column is currently represented as an object (likely because it contains categorical values like '0', 'a', 'b', 'c'). This column can be converted to a categorical data type to reduce memory usage and optimize processing time during analysis.

# **Date related variables**

In [10]:
# Convert date field to datetime
df['Date'] = pd.to_datetime(df['Date'])
df.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


# **Object related variables**

In [11]:
df['StateHoliday'].value_counts()

Unnamed: 0_level_0,count
StateHoliday,Unnamed: 1_level_1
0,855087
0,131072
a,20260
b,6690
c,4100


In [12]:
# Perform dummy encoding for the 'StateHoliday' column
df = pd.get_dummies(df, columns=['StateHoliday'], prefix='StateHoliday', drop_first=True)

# Change the data type of all columns to int, except the 'Date' column
for col in df.columns:
    if col != 'Date':
        df[col] = df[col].astype(int)

# Verify the result by checking the first few rows
df.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,SchoolHoliday,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
0,1,5,2015-07-31,5263,555,1,1,1,1,0,0,0
1,2,5,2015-07-31,6064,625,1,1,1,1,0,0,0
2,3,5,2015-07-31,8314,821,1,1,1,1,0,0,0
3,4,5,2015-07-31,13995,1498,1,1,1,1,0,0,0
4,5,5,2015-07-31,4822,559,1,1,1,1,0,0,0


# **Univariate Analysis: Numerical Variables**

In [13]:
# Numerical datatypes
df.select_dtypes(include=['int32', 'int64', 'float64','Int64']).dtypes

Unnamed: 0,0
Store,int64
DayOfWeek,int64
Sales,int64
Customers,int64
Open,int64
Promo,int64
SchoolHoliday,int64
StateHoliday_0,int64
StateHoliday_a,int64
StateHoliday_b,int64


In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def UVA_numeric(data, var_group):
    """
    This function takes a dataframe and a variable group as arguments
    and converts the variable group to numeric.
    It then calculates the descriptives of the variable.
    """
    size = len(var_group)
    cols = 3  # Number of columns per row
    rows = (size // cols) + (size % cols > 0)  # Calculate number of rows needed

    plt.figure(figsize=(7*cols, 5*rows), dpi=100)

    for j, i in enumerate(var_group):
        # Check if the column exists in the DataFrame
        if i not in data.columns:
            print(f"Column '{i}' not found in the DataFrame.")
            continue

        try:
            # Attempt to convert the column to numeric
            data[i] = pd.to_numeric(data[i], errors='coerce')
        except TypeError as e:
            # Handle TypeError, print the original exception and problematic value
            print(f"Error converting column '{i}': {e}")
            problematic_value = data[i].iloc[0]  # Get the first problematic value
            print(f"Problematic value: {problematic_value}, Type: {type(problematic_value)}")
            continue

        # Calculating descriptives of variable
        mini = data[i].min()
        maxi = data[i].max()
        ran = data[i].max() - data[i].min()
        mean = data[i].mean()
        median = data[i].median()
        st_dev = data[i].std()
        skew = data[i].skew()
        kurt = data[i].kurtosis()

        # Calculating points of standard deviation
        points = mean - st_dev, mean + st_dev

        # Plotting the variable with every information
        plt.subplot(rows, cols, j + 1)
        sns.kdeplot(data[i], shade=True)

        # Plotting standard deviation, min/max, mean, and median using lines
        plt.axvline(x=points[0], color='black', linestyle='--', label="std_dev")
        plt.axvline(x=points[1], color='black', linestyle='--')
        plt.axvline(x=mini, color='orange', linestyle='-', label="min/max")
        plt.axvline(x=maxi, color='orange', linestyle='-')
        plt.axvline(x=mean, color='red', linestyle='-', label="mean")
        plt.axvline(x=median, color='blue', linestyle='-', label="median")

        plt.xlabel('{}'.format(i), fontsize=20)
        plt.ylabel('density')
        plt.title('std_dev = {}; kurtosis = {};\nskew = {}; range = {}\nmean = {}; median = {}'.format(
            (round(points[0], 2), round(points[1], 2)),
            round(kurt, 2),
            round(skew, 2),
            (round(mini, 2), round(maxi, 2), round(ran, 2)),
            round(mean, 2),
            round(median, 2)))

        plt.legend(loc='upper right')  # Place the legend in the upper right corner

    plt.tight_layout()
    plt.show()

In [None]:
store_col = df.columns
UVA_numeric(df,store_col)