In [274]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

## Read train data

In [275]:
df = pd.read_csv("train_v9rqX0R.csv")
print(df.shape)
df.head()

(8523, 12)


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


## EDA

In [276]:
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [277]:
# Define imputer
imputer = IterativeImputer(random_state=100, max_iter=10)

In [278]:
df_train = df[['Item_Weight', 'Item_Visibility', 'Item_MRP']]
df_train.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP
0,9.3,0.016047,249.8092
1,5.92,0.019278,48.2692
2,17.5,0.01676,141.618
3,19.2,0.0,182.095
4,8.93,0.0,53.8614


In [279]:
# df['Item_Weight'] = df['Item_Weight'].fillna(0)
# df['Outlet_Size'] = df['Outlet_Size'].fillna('Missing')

# fit on the dataset
imputer.fit(df_train)

In [280]:
df_imputed = imputer.transform(df_train)

In [281]:
df[['Item_Weight', 'Item_Visibility', 'Item_MRP']] = df_imputed
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# df.isna().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [282]:
df['Outlet_Size'] = df['Outlet_Size'].fillna('Missing')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Missing,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [284]:
df.isna().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

## Feature Engineering

### Encode the columns having upto 4 distinct values

In [285]:
le = LabelEncoder()
df_encoded = df[['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Outlet_Establishment_Year']]

df_encoded = df_encoded.apply(LabelEncoder().fit_transform)

In [286]:
df_encoded = pd.concat([df[['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Item_Outlet_Sales']], df_encoded], axis=1)
df_encoded.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales,Item_Fat_Content,Item_Type,Outlet_Size,Outlet_Location_Type,Outlet_Type,Outlet_Establishment_Year
0,9.3,0.016047,249.8092,3735.138,1,4,1,0,1,4
1,5.92,0.019278,48.2692,443.4228,2,14,1,2,2,8
2,17.5,0.01676,141.618,2097.27,1,10,1,0,1,4
3,19.2,0.0,182.095,732.38,2,6,2,2,0,3
4,8.93,0.0,53.8614,994.7052,1,9,0,2,1,1


In [None]:
# df_encoded = pd.get_dummies(df, columns=['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Outlet_Establishment_Year'], drop_first=True, dtype=int)
# df_encoded.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_MRP,Outlet_Identifier,Item_Outlet_Sales,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Fat_Content_low fat,Item_Fat_Content_reg,...,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Outlet_Establishment_Year_1987,Outlet_Establishment_Year_1997,Outlet_Establishment_Year_1998,Outlet_Establishment_Year_1999,Outlet_Establishment_Year_2002,Outlet_Establishment_Year_2004,Outlet_Establishment_Year_2007,Outlet_Establishment_Year_2009
0,FDA15,9.3,0.016047,249.8092,OUT049,3735.138,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,DRC01,5.92,0.019278,48.2692,OUT018,443.4228,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
2,FDN15,17.5,0.01676,141.618,OUT049,2097.27,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,FDX07,19.2,0.0,182.095,OUT010,732.38,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,NCD19,8.93,0.0,53.8614,OUT013,994.7052,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [253]:
# df_encoded.drop(columns=['Item_Identifier', 'Outlet_Identifier'], inplace=True)
# df_encoded.head()

## Extract the numerical columns for training 

In [287]:
value = df_encoded['Item_Outlet_Sales']
data= df_encoded.drop('Item_Outlet_Sales', axis=1)
# data = df[['Item_Weight', 'Item_Visibility', 'Item_MRP']]
# normalized_df=(data-data.mean())/data.std()

X = data.values
y = value.values

print(X.shape, y.shape)

(8523, 9) (8523,)


## Use a GBM Regressor

In [288]:
# model = RandomForestRegressor(
#     n_estimators=100,
#     random_state=0,
#     oob_score=True  
# )
model = GradientBoostingRegressor(n_estimators=300,
                                 learning_rate=0.001,
                                 random_state=100,
                                 max_features= 10
    )

In [289]:
model.fit(X, y)

## Predict of test data

In [290]:
df = pd.read_csv("test_AbJTz2l.csv")
df_copy = df.copy()
print(df.shape)
df.head()

(5681, 11)


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [291]:
df.isna().sum()

Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [292]:
# df['Item_Weight'] = df['Item_Weight'].fillna(0)
# df['Outlet_Size'] = df['Outlet_Size'].fillna('Missing')
# df.isna().sum()

df_test = df[['Item_Weight', 'Item_Visibility', 'Item_MRP']]
df_test.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP
0,20.75,0.007565,107.8622
1,8.3,0.038428,87.3198
2,14.6,0.099575,241.7538
3,7.315,0.015388,155.034
4,,0.118599,234.23


In [293]:
imputer.fit(df_test)

In [294]:
df_imputed = imputer.transform(df_test)

In [295]:
df[['Item_Weight', 'Item_Visibility', 'Item_MRP']] = df_imputed
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,13.021946,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [296]:
# df['Item_Fat_Content'] = oe.fit_transform(df[['Item_Fat_Content']])
# df['Outlet_Size'] = oe.fit_transform(df[['Outlet_Size']])
# df['Outlet_Location_Type'] = oe.fit_transform(df[['Outlet_Location_Type']])
# df['Item_Type'] = oe.fit_transform(df[['Item_Type']])
# df['Outlet_Type'] = oe.fit_transform(df[['Outlet_Type']])
# df['Outlet_Establishment_Year'] = oe.fit_transform(df[['Outlet_Establishment_Year']])
# df.head()
df.isna().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [297]:
df['Outlet_Size'] = df['Outlet_Size'].fillna('Missing')

In [298]:
df_encoded = df[['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Outlet_Establishment_Year']]

df_encoded = df_encoded.apply(LabelEncoder().fit_transform)

In [299]:
df_encoded = pd.concat([df[['Item_Weight', 'Item_Visibility', 'Item_MRP']], df_encoded], axis=1)
df_encoded.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content,Item_Type,Outlet_Size,Outlet_Location_Type,Outlet_Type,Outlet_Establishment_Year
0,20.75,0.007565,107.8622,1,13,1,0,1,4
1,8.3,0.038428,87.3198,4,4,2,1,1,7
2,14.6,0.099575,241.7538,1,11,2,2,0,3
3,7.315,0.015388,155.034,1,13,2,1,1,7
4,13.021946,0.118599,234.23,2,4,1,2,3,0


In [272]:
# df = pd.get_dummies(df, columns=['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Outlet_Establishment_Year'], drop_first=True, dtype=int)
# df.head()

In [230]:
df.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Fat_Content_low fat,Item_Fat_Content_reg,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Outlet_Establishment_Year_1987,Outlet_Establishment_Year_1997,Outlet_Establishment_Year_1998,Outlet_Establishment_Year_1999,Outlet_Establishment_Year_2002,Outlet_Establishment_Year_2004,Outlet_Establishment_Year_2007,Outlet_Establishment_Year_2009
0,20.75,0.007565,107.8622,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,8.3,0.038428,87.3198,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,14.6,0.099575,241.7538,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,7.315,0.015388,155.034,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,13.021946,0.118599,234.23,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [300]:
X_test = df_encoded.values

predictions = model.predict(X_test)

df['Item_Outlet_Sales'] = predictions
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1,2083.115128
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,Missing,Tier 2,Supermarket Type1,1997.447205
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,Missing,Tier 3,Grocery Store,1764.1133
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,Missing,Tier 2,Supermarket Type1,2313.897848
4,FDY38,13.021946,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3,2978.001599


In [301]:
df_copy['Item_Outlet_Sales'] = df['Item_Outlet_Sales']

In [302]:
df_copy.to_csv('submission_gbm4.csv', columns=['Item_Identifier', 'Outlet_Identifier','Item_Outlet_Sales'], index=False)