In [8]:
## 1. Imputation of Missing Data

import pandas as pd
from sklearn.impute import KNNImputer

# Load your dataset
df = pd.read_csv(r'C:\Users\muska\Downloads\train.csv')

# Mean Imputation for 'Item_Weight'
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].mean())

# KNN Imputation for 'Item_Weight' and 'Item_Outlet_Sales'
imputer = KNNImputer(n_neighbors=2)
df[['Item_Weight', 'Item_Outlet_Sales']] = imputer.fit_transform(df[['Item_Weight', 'Item_Outlet_Sales']])

print(df.head())

  Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility  \
0           FDA15         9.30          Low Fat         0.016047   
1           DRC01         5.92          Regular         0.019278   
2           FDN15        17.50          Low Fat         0.016760   
3           FDX07        19.20          Regular         0.000000   
4           NCD19         8.93          Low Fat         0.000000   

               Item_Type  Item_MRP Outlet_Identifier  \
0                  Dairy  249.8092            OUT049   
1            Soft Drinks   48.2692            OUT018   
2                   Meat  141.6180            OUT049   
3  Fruits and Vegetables  182.0950            OUT010   
4              Household   53.8614            OUT013   

   Outlet_Establishment_Year Outlet_Size Outlet_Location_Type  \
0                       1999      Medium               Tier 1   
1                       2009      Medium               Tier 3   
2                       1999      Medium               Tier

In [1]:
import pandas as pd

# Provide path to the CSV file
df = pd.read_csv(r'C:\Users\muska\Downloads\train.csv')

df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [9]:
## 2. Categorical Variable Encoding

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder instance
encoder = OneHotEncoder(sparse_output=False)  # Use sparse_output instead of sparse

# Encode the categorical variable 'Item_Fat_Content'
encoded = encoder.fit_transform(df[['Item_Fat_Content']])

# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['Item_Fat_Content']))

# Concatenate the original DataFrame (without the encoded column) with the new encoded DataFrame
df_encoded = pd.concat([df.drop('Item_Fat_Content', axis=1).reset_index(drop=True), encoded_df], axis=1)

print(df_encoded.head())

  Item_Identifier  Item_Weight  Item_Visibility              Item_Type  \
0           FDA15         9.30         0.016047                  Dairy   
1           DRC01         5.92         0.019278            Soft Drinks   
2           FDN15        17.50         0.016760                   Meat   
3           FDX07        19.20         0.000000  Fruits and Vegetables   
4           NCD19         8.93         0.000000              Household   

   Item_MRP Outlet_Identifier  Outlet_Establishment_Year Outlet_Size  \
0  249.8092            OUT049                       1999      Medium   
1   48.2692            OUT018                       2009      Medium   
2  141.6180            OUT049                       1999      Medium   
3  182.0950            OUT010                       1998         NaN   
4   53.8614            OUT013                       1987        High   

  Outlet_Location_Type        Outlet_Type  Item_Outlet_Sales  \
0               Tier 1  Supermarket Type1          3735.13

In [10]:
## 3. Feature Derivation

import pandas as pd

# Deriving a new feature: sales_per_mrp
# Ensure we handle divisions by zero if 'Item_MRP' is zero
df['sales_per_mrp'] = df['Item_Outlet_Sales'] / df['Item_MRP'].replace(0, 1)  # Replace 0 with 1 to avoid division by zero

# Display the original columns and the new feature
print(df[['Item_Outlet_Sales', 'Item_MRP', 'sales_per_mrp']].head())


   Item_Outlet_Sales  Item_MRP  sales_per_mrp
0          3735.1380  249.8092      14.951963
1           443.4228   48.2692       9.186454
2          2097.2700  141.6180      14.809346
3           732.3800  182.0950       4.021967
4           994.7052   53.8614      18.467868


In [11]:
## 4. Outlier Treatment

import pandas as pd

# Calculate Q1 (25th percentile) and Q3 (75th percentile) for Item_Outlet_Sales
Q1 = df['Item_Outlet_Sales'].quantile(0.25)
Q3 = df['Item_Outlet_Sales'].quantile(0.75)
IQR = Q3 - Q1

# Filtering out outliers
df_outliers_removed = df[~((df['Item_Outlet_Sales'] < (Q1 - 1.5 * IQR)) | 
                            (df['Item_Outlet_Sales'] > (Q3 + 1.5 * IQR)))]

# Display the dataframe without outliers
print(df_outliers_removed[['Item_Outlet_Sales']].head())

   Item_Outlet_Sales
0          3735.1380
1           443.4228
2          2097.2700
3           732.3800
4           994.7052


In [12]:
## 5. Feature Scaling

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Apply MinMaxScaler to relevant numerical columns
df[['Item_Weight', 'Item_MRP']] = scaler.fit_transform(df[['Item_Weight', 'Item_MRP']])

# Display the scaled dataframe
print(df[['Item_Weight', 'Item_MRP']].head())

   Item_Weight  Item_MRP
0     0.282525  0.927507
1     0.081274  0.072068
2     0.770765  0.468288
3     0.871986  0.640093
4     0.260494  0.095805


In [13]:
## 6. Target Variable Transformation

import pandas as pd
import numpy as np

df['salary_log'] = np.log1p(df['Item_Outlet_Sales'])  

# Display the original and log-transformed salary values
print(df[['Item_Outlet_Sales', 'salary_log']].head())  # Displaying first few rows

   Item_Outlet_Sales  salary_log
0          3735.1380    8.225808
1           443.4228    6.096776
2          2097.2700    7.648868
3           732.3800    6.597664
4           994.7052    6.903451


In [14]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                8523 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
 12  sales_per_mrp              8523 non-null   float64
 13  salary_log                 8523 non-null   float

In [15]:
df = pd.read_csv(r'C:\Users\muska\Downloads\train.csv')
print(df.columns)

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')


In [16]:
print(df.head())

  Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility  \
0           FDA15         9.30          Low Fat         0.016047   
1           DRC01         5.92          Regular         0.019278   
2           FDN15        17.50          Low Fat         0.016760   
3           FDX07        19.20          Regular         0.000000   
4           NCD19         8.93          Low Fat         0.000000   

               Item_Type  Item_MRP Outlet_Identifier  \
0                  Dairy  249.8092            OUT049   
1            Soft Drinks   48.2692            OUT018   
2                   Meat  141.6180            OUT049   
3  Fruits and Vegetables  182.0950            OUT010   
4              Household   53.8614            OUT013   

   Outlet_Establishment_Year Outlet_Size Outlet_Location_Type  \
0                       1999      Medium               Tier 1   
1                       2009      Medium               Tier 3   
2                       1999      Medium               Tier

In [17]:
# Apply one-hot encoding to all object-type columns
df = pd.get_dummies(df, columns=['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'], drop_first=True)

# Check the transformed DataFrame
print(df.dtypes)
print(df.head())

Item_Identifier                     object
Item_Weight                        float64
Item_Visibility                    float64
Item_MRP                           float64
Outlet_Identifier                   object
Outlet_Establishment_Year            int64
Item_Outlet_Sales                  float64
Item_Fat_Content_Low Fat              bool
Item_Fat_Content_Regular              bool
Item_Fat_Content_low fat              bool
Item_Fat_Content_reg                  bool
Item_Type_Breads                      bool
Item_Type_Breakfast                   bool
Item_Type_Canned                      bool
Item_Type_Dairy                       bool
Item_Type_Frozen Foods                bool
Item_Type_Fruits and Vegetables       bool
Item_Type_Hard Drinks                 bool
Item_Type_Health and Hygiene          bool
Item_Type_Household                   bool
Item_Type_Meat                        bool
Item_Type_Others                      bool
Item_Type_Seafood                     bool
Item_Type_S

In [18]:
df.shape[1]

33

In [21]:
df.Item_Fat_Content_Regular

0       False
1        True
2       False
3        True
4       False
        ...  
8518    False
8519     True
8520    False
8521     True
8522    False
Name: Item_Fat_Content_Regular, Length: 8523, dtype: bool