In [1]:

# Import necessary libraries
import pandas as pd # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.linear_model import LinearRegression # type: ignore

In [2]:
# Read the raw dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='dataset.csv')


In [3]:
# Quick EDA: report the dataset dimensions as a (rows, columns) tuple
print("Here are ", data.shape, "instances and columns in the dataset", sep=" ")

Here are  (3000, 12) instances and columns in the dataset


In [4]:
# Preview the first five rows to get a feel for the columns and their values
data.head()

Unnamed: 0,Product_Category,Price,Promotion,Advertising_Spend,Holiday,Region,Store_ID,Customer_ID,Temperature,Weather_Condition,Inventory_Level,Sales
0,Clothing,44,No,460,,South,S056,C02060,5,Sunny,29,216
1,Electronics,56,Yes,4088,,North,S019,C03715,9,Rainy,87,731
2,Health & Beauty,96,Yes,3505,,East,S024,C02458,21,Sunny,898,664
3,Clothing,23,Yes,1258,,South,S020,C00860,-13,Rainy,305,851
4,Home & Garden,34,Yes,204,New Year's Day,North,S067,C04934,26,Sunny,744,179




<h3>The dataset contains categorical values, so we will apply one-hot encoding.
First we need to identify which columns hold categorical values.</h3>



In [5]:
# Keep the column labels in `col_names` and display them as the cell output
col_names = data.columns

col_names

Index(['Product_Category', 'Price', 'Promotion', 'Advertising_Spend',
       'Holiday', 'Region', 'Store_ID', 'Customer_ID', 'Temperature',
       'Weather_Condition', 'Inventory_Level', 'Sales'],
      dtype='object')

In [6]:
# Summarise the dataset: per-column non-null count and dtype, plus memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Product_Category   3000 non-null   object
 1   Price              3000 non-null   int64 
 2   Promotion          3000 non-null   object
 3   Advertising_Spend  3000 non-null   int64 
 4   Holiday            325 non-null    object
 5   Region             3000 non-null   object
 6   Store_ID           3000 non-null   object
 7   Customer_ID        3000 non-null   object
 8   Temperature        3000 non-null   int64 
 9   Weather_Condition  3000 non-null   object
 10  Inventory_Level    3000 non-null   int64 
 11  Sales              3000 non-null   int64 
dtypes: int64(5), object(7)
memory usage: 281.4+ KB


In [7]:
# Count the missing values in each column and display the totals
nul_val = data.isna().sum()

nul_val

Product_Category        0
Price                   0
Promotion               0
Advertising_Spend       0
Holiday              2675
Region                  0
Store_ID                0
Customer_ID             0
Temperature             0
Weather_Condition       0
Inventory_Level         0
Sales                   0
dtype: int64

<h3>Only the Holiday column has missing values (2675 of 3000 rows), most likely rows that do not fall on a holiday; every other column is complete.</h3>

<h3>Now we will separate the columns according to their variable types.</h3>

In [8]:

# Collect the names of the categorical columns, i.e. those with the object dtype
categorical = data.select_dtypes(include=['object']).columns.tolist()

print('There are {} categorical variables\n'.format(len(categorical)))

print('The categorical variables are :', categorical)

There are 7 categorical variables

The categorical variables are : ['Product_Category', 'Promotion', 'Holiday', 'Region', 'Store_ID', 'Customer_ID', 'Weather_Condition']


In [9]:
# Preview the first rows of the categorical columns only
data.loc[:, categorical].head()

Unnamed: 0,Product_Category,Promotion,Holiday,Region,Store_ID,Customer_ID,Weather_Condition
0,Clothing,No,,South,S056,C02060,Sunny
1,Electronics,Yes,,North,S019,C03715,Rainy
2,Health & Beauty,Yes,,East,S024,C02458,Sunny
3,Clothing,Yes,,South,S020,C00860,Rainy
4,Home & Garden,Yes,New Year's Day,North,S067,C04934,Sunny


In [10]:
# Show how often each label occurs in every categorical column
for var in categorical:
    label_counts = data[var].value_counts()
    print(label_counts)

Product_Category
Clothing            608
Food & Beverages    606
Home & Garden       602
Health & Beauty     599
Electronics         585
Name: count, dtype: int64
Promotion
Yes    1517
No     1483
Name: count, dtype: int64
Holiday
Thanksgiving Day    62
Independence Day    60
Labor Day           55
Memorial Day        53
New Year's Day      49
Christmas Day       46
Name: count, dtype: int64
Region
North    800
East     745
South    740
West     715
Name: count, dtype: int64
Store_ID
S056    46
S082    41
S078    41
S026    41
S071    40
        ..
S092    21
S072    19
S058    18
S033    17
S076    16
Name: count, Length: 100, dtype: int64
Customer_ID
C07025    4
C06741    4
C00085    3
C01244    3
C02943    3
         ..
C06134    1
C09699    1
C02100    1
C04047    1
C09832    1
Name: count, Length: 2617, dtype: int64
Weather_Condition
Rainy     756
Cloudy    755
Snowy     752
Sunny     737
Name: count, dtype: int64


<h3>We will check the number of labels in each column</h3>

In [11]:
# Report the cardinality of each categorical column.
# dropna=False keeps NaN in the count, so a column with missing values
# reports one more label than its value_counts() table lists.
for var in categorical:
    print(var, ' contains ', data[var].nunique(dropna=False), ' labels')

Product_Category  contains  5  labels
Promotion  contains  2  labels
Holiday  contains  7  labels
Region  contains  4  labels
Store_ID  contains  100  labels
Customer_ID  contains  2617  labels
Weather_Condition  contains  4  labels


<h3>Now we will perform feature engineering for the categorical variables</h3>

In [12]:
# Look at the first rows again before encoding the categorical columns
data.head()

Unnamed: 0,Product_Category,Price,Promotion,Advertising_Spend,Holiday,Region,Store_ID,Customer_ID,Temperature,Weather_Condition,Inventory_Level,Sales
0,Clothing,44,No,460,,South,S056,C02060,5,Sunny,29,216
1,Electronics,56,Yes,4088,,North,S019,C03715,9,Rainy,87,731
2,Health & Beauty,96,Yes,3505,,East,S024,C02458,21,Sunny,898,664
3,Clothing,23,Yes,1258,,South,S020,C00860,-13,Rainy,305,851
4,Home & Garden,34,Yes,204,New Year's Day,North,S067,C04934,26,Sunny,744,179


In [13]:
# Rebuild the list of categorical (object-dtype) columns used by the encoding step
categorical = data.select_dtypes(include=['object']).columns.tolist()

print('There are {} categorical variables\n'.format(len(categorical)))

print('The categorical variables are :', categorical)


There are 7 categorical variables

The categorical variables are : ['Product_Category', 'Promotion', 'Holiday', 'Region', 'Store_ID', 'Customer_ID', 'Weather_Condition']


<h3>Product_Category</h3>

In [14]:
# Cardinality and label frequencies of Product_Category
print('Product_Category contains', data['Product_Category'].nunique(dropna=False), 'labels\n')

data['Product_Category'].value_counts()

Product_Category contains 5 labels



Product_Category
Clothing            608
Food & Beverages    606
Home & Garden       602
Health & Beauty     599
Electronics         585
Name: count, dtype: int64

In [15]:
# Preview the one-hot encoding of Product_Category; the result is only displayed, `data` is not modified
pd.get_dummies(data.Product_Category, drop_first=False).head()

Unnamed: 0,Clothing,Electronics,Food & Beverages,Health & Beauty,Home & Garden
0,True,False,False,False,False
1,False,True,False,False,False
2,False,False,False,True,False
3,True,False,False,False,False
4,False,False,False,False,True



<h3>Promotion</h3>

In [16]:
# Cardinality and label frequencies of Promotion
print('Promotion contains', data['Promotion'].nunique(dropna=False), 'labels\n')

data['Promotion'].value_counts()

Promotion contains 2 labels



Promotion
Yes    1517
No     1483
Name: count, dtype: int64

In [17]:
# Preview the one-hot encoding of Promotion; the result is only displayed, `data` is not modified
pd.get_dummies(data.Promotion, drop_first=False).head()

Unnamed: 0,No,Yes
0,True,False
1,False,True
2,False,True
3,False,True
4,False,True


<h3>Holiday</h3>

In [18]:
# Cardinality and label frequencies of Holiday.
# The label count includes NaN (dropna=False), whereas value_counts() below
# leaves missing values out of its table.
print('Holiday contains', data['Holiday'].nunique(dropna=False), 'labels\n')

data['Holiday'].value_counts()

Holiday contains 7 labels



Holiday
Thanksgiving Day    62
Independence Day    60
Labor Day           55
Memorial Day        53
New Year's Day      49
Christmas Day       46
Name: count, dtype: int64

In [19]:
# Preview the one-hot encoding of Holiday; the result is only displayed, `data` is not modified.
# Rows whose Holiday is missing get False in every indicator column.
pd.get_dummies(data.Holiday, drop_first=False).head()

Unnamed: 0,Christmas Day,Independence Day,Labor Day,Memorial Day,New Year's Day,Thanksgiving Day
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,True,False


<h3>Region</h3>

In [20]:
# Cardinality and label frequencies of Region
print('Region contains', data['Region'].nunique(dropna=False), 'labels\n')

data['Region'].value_counts()

Region contains 4 labels



Region
North    800
East     745
South    740
West     715
Name: count, dtype: int64

In [21]:
# Preview the one-hot encoding of Region; the result is only displayed, `data` is not modified
pd.get_dummies(data.Region, drop_first=False).head()

Unnamed: 0,East,North,South,West
0,False,False,True,False
1,False,True,False,False
2,True,False,False,False
3,False,False,True,False
4,False,True,False,False


<h3>Store_ID</h3>

In [22]:
# Cardinality and label frequencies of Store_ID
print('Store_ID contains', data['Store_ID'].nunique(dropna=False), 'labels\n')

data['Store_ID'].value_counts()

Store_ID contains 100 labels



Store_ID
S056    46
S082    41
S078    41
S026    41
S071    40
        ..
S092    21
S072    19
S058    18
S033    17
S076    16
Name: count, Length: 100, dtype: int64

In [23]:
# Preview the one-hot encoding of Store_ID (one indicator column per store);
# the result is only displayed, `data` is not modified
pd.get_dummies(data.Store_ID, drop_first=False).head()

Unnamed: 0,S001,S002,S003,S004,S005,S006,S007,S008,S009,S010,...,S091,S092,S093,S094,S095,S096,S097,S098,S099,S100
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


<h3>Customer_ID</h3>

In [24]:
# Cardinality and label frequencies of Customer_ID
print('Customer_ID contains', data['Customer_ID'].nunique(dropna=False), 'labels\n')

data['Customer_ID'].value_counts()

Customer_ID contains 2617 labels



Customer_ID
C07025    4
C06741    4
C00085    3
C01244    3
C02943    3
         ..
C06134    1
C09699    1
C02100    1
C04047    1
C09832    1
Name: count, Length: 2617, dtype: int64

In [25]:
# Preview the one-hot encoding of Customer_ID (one indicator column per customer,
# so the preview is very wide); the result is only displayed, `data` is not modified
pd.get_dummies(data.Customer_ID, drop_first=False).head()

Unnamed: 0,C00006,C00009,C00012,C00024,C00028,C00030,C00037,C00046,C00047,C00058,...,C09941,C09942,C09944,C09945,C09949,C09950,C09960,C09972,C09976,C09983
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


<h3>Weather_Condition</h3>

In [26]:
# Cardinality and label frequencies of Weather_Condition
print('Weather_Condition contains', data['Weather_Condition'].nunique(dropna=False), 'labels\n')

data['Weather_Condition'].value_counts()

Weather_Condition contains 4 labels



Weather_Condition
Rainy     756
Cloudy    755
Snowy     752
Sunny     737
Name: count, dtype: int64

In [27]:
# Preview the one-hot encoding of Weather_Condition; the result is only displayed, `data` is not modified
pd.get_dummies(data.Weather_Condition, drop_first=False).head()

Unnamed: 0,Cloudy,Rainy,Snowy,Sunny
0,False,False,False,True
1,False,True,False,False
2,False,False,False,True
3,False,True,False,False
4,False,False,False,True


In [28]:
# One-hot encode every categorical column in a single pass.
# pd.get_dummies(..., columns=...) removes the source columns and appends the
# indicator columns, prefixed with the source column name, after the remaining
# columns. This yields the same columns in the same order as encoding and
# concatenating one column at a time, without copying the growing frame on
# every iteration or mutating it in place.
# Columns that are no longer present (already encoded) are skipped, so
# re-running this cell leaves `data` unchanged instead of raising a KeyError.
# NOTE(review): with drop_first=True a missing Holiday and the first Holiday
# level both become all-False rows; consider filling the NaNs with an explicit
# "no holiday" label before encoding.
# NOTE(review): Customer_ID (2617 labels) and Store_ID (100 labels) account for
# almost all of the generated columns; confirm these identifiers are wanted as
# features.
columns_to_encode = [name for name in categorical if name in data.columns]
if columns_to_encode:
    data = pd.get_dummies(data, columns=columns_to_encode, drop_first=True)

# Verify data types
print(data.dtypes)

Price                      int64
Advertising_Spend          int64
Temperature                int64
Inventory_Level            int64
Sales                      int64
                           ...  
Customer_ID_C09976          bool
Customer_ID_C09983          bool
Weather_Condition_Rainy     bool
Weather_Condition_Snowy     bool
Weather_Condition_Sunny     bool
Length: 2736, dtype: object


<h2>Now Defining Variable <b>X</b> and <b>Y</b></h2>

In [29]:
# List the column labels of `data` after the one-hot encoding step
print(data.columns)

Index(['Price', 'Advertising_Spend', 'Temperature', 'Inventory_Level', 'Sales',
       'Product_Category_Electronics', 'Product_Category_Food & Beverages',
       'Product_Category_Health & Beauty', 'Product_Category_Home & Garden',
       'Promotion_Yes',
       ...
       'Customer_ID_C09945', 'Customer_ID_C09949', 'Customer_ID_C09950',
       'Customer_ID_C09960', 'Customer_ID_C09972', 'Customer_ID_C09976',
       'Customer_ID_C09983', 'Weather_Condition_Rainy',
       'Weather_Condition_Snowy', 'Weather_Condition_Sunny'],
      dtype='object', length=2736)


In [31]:
# Feature matrix: only these three numeric columns are used as predictors
# (the one-hot encoded columns are not part of this selection)
selected_columns = ['Price', 'Advertising_Spend', 'Temperature']
X = data.loc[:, selected_columns]

# Target vector
Y = data.loc[:, 'Sales']

In [32]:
# Check the (rows, columns) dimensions of the feature matrix
X.shape

(3000, 3)

In [33]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)


In [34]:
# Inspect the training features produced by the split
print(X_train)


      Price  Advertising_Spend  Temperature
2370     96               9297          -17
1774      6               9867           29
731      27               7257           38
271      35               5240           -6
1077     98               5429           30
...     ...                ...          ...
763      44                 37           25
835      94               1122            4
1653     72               1694           34
2607     75               6005           35
2732     99               9052           25

[2400 rows x 3 columns]


In [35]:
# train a linear regression model on the training set
# instantiate the model
model = LinearRegression()


# fit the model
model.fit(X_train, Y_train)

In [36]:
# Predict the target for the held-out test features
Y_pred = model.predict(X_test)

# The bare name displays the whole prediction array as the cell output
Y_pred

array([481.4359128 , 498.87235105, 497.22180208, 498.68667318,
       474.87802162, 524.7341623 , 499.43515774, 513.49866532,
       498.75134622, 494.99267943, 490.9579101 , 489.87631847,
       499.076183  , 497.21887737, 485.37796811, 496.60705495,
       488.45113937, 492.4854455 , 481.34696909, 482.56844688,
       503.41624668, 487.76799472, 502.39454259, 482.3722971 ,
       487.23714386, 511.21173658, 520.43490346, 496.82619794,
       513.56161796, 504.58088791, 515.05525592, 508.32185619,
       510.11489757, 491.35961302, 505.34708881, 484.97609005,
       499.99579892, 480.53300978, 509.158265  , 490.58842553,
       501.54323146, 493.11438422, 493.28530466, 492.63925202,
       504.6649172 , 509.7465095 , 493.15555822, 480.77162329,
       502.45554829, 499.05472296, 497.77641039, 480.91201191,
       505.70251281, 499.27413041, 508.90855608, 490.03827249,
       503.33500914, 501.43628055, 493.49367722, 498.64743674,
       495.59918394, 509.61066701, 490.9814386 , 484.15