In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
train = pd.read_csv('train_v9rqX0R.csv')
print(train.shape)
print(train.info())
print("Duplicates", train.duplicated().sum())
train

(8523, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB
None
Duplicates 0


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [5]:
missing_data = train.isnull().sum()
missing_percent = (train.isnull().sum() / len(train)) * 100

missing_df = pd.DataFrame({'Missing Count': missing_data, 'Missing Percentage': missing_percent})
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values(by='Missing Percentage', ascending=False)

print("Missing Data Summary:")
print(missing_df)

Missing Data Summary:
             Missing Count  Missing Percentage
Outlet_Size           2410           28.276428
Item_Weight           1463           17.165317


In [6]:
train['Outlet_Size'].unique()

array(['Medium', nan, 'High', 'Small'], dtype=object)

In [7]:
train['Outlet_Size'].mode()

Unnamed: 0,Outlet_Size
0,Medium


In [8]:
train[['Outlet_Identifier', 'Outlet_Size', 'Outlet_Type', 'Outlet_Location_Type']].drop_duplicates()

Unnamed: 0,Outlet_Identifier,Outlet_Size,Outlet_Type,Outlet_Location_Type
0,OUT049,Medium,Supermarket Type1,Tier 1
1,OUT018,Medium,Supermarket Type2,Tier 3
3,OUT010,,Grocery Store,Tier 3
4,OUT013,High,Supermarket Type1,Tier 3
7,OUT027,Medium,Supermarket Type3,Tier 3
8,OUT045,,Supermarket Type1,Tier 2
9,OUT017,,Supermarket Type1,Tier 2
11,OUT046,Small,Supermarket Type1,Tier 1
19,OUT035,Small,Supermarket Type1,Tier 2
23,OUT019,Small,Grocery Store,Tier 1


In [9]:
train.loc[train['Outlet_Type'] == 'Grocery Store', 'Outlet_Size'] = 'Small'
train[['Outlet_Identifier', 'Outlet_Size', 'Outlet_Type', 'Outlet_Location_Type']].drop_duplicates()

Unnamed: 0,Outlet_Identifier,Outlet_Size,Outlet_Type,Outlet_Location_Type
0,OUT049,Medium,Supermarket Type1,Tier 1
1,OUT018,Medium,Supermarket Type2,Tier 3
3,OUT010,Small,Grocery Store,Tier 3
4,OUT013,High,Supermarket Type1,Tier 3
7,OUT027,Medium,Supermarket Type3,Tier 3
8,OUT045,,Supermarket Type1,Tier 2
9,OUT017,,Supermarket Type1,Tier 2
11,OUT046,Small,Supermarket Type1,Tier 1
19,OUT035,Small,Supermarket Type1,Tier 2
23,OUT019,Small,Grocery Store,Tier 1


In [10]:
store_summary = train[['Outlet_Identifier', 'Outlet_Size', 'Outlet_Type', 'Outlet_Location_Type']].drop_duplicates().reset_index(drop=True)
item_counts = train.groupby('Outlet_Identifier')['Item_Identifier'].nunique().reset_index()
item_counts.columns = ['Outlet_Identifier', 'Item_Count']
store_summary = pd.merge(store_summary, item_counts, on='Outlet_Identifier', how='left')

store_summary

Unnamed: 0,Outlet_Identifier,Outlet_Size,Outlet_Type,Outlet_Location_Type,Item_Count
0,OUT049,Medium,Supermarket Type1,Tier 1,930
1,OUT018,Medium,Supermarket Type2,Tier 3,928
2,OUT010,Small,Grocery Store,Tier 3,555
3,OUT013,High,Supermarket Type1,Tier 3,932
4,OUT027,Medium,Supermarket Type3,Tier 3,935
5,OUT045,,Supermarket Type1,Tier 2,929
6,OUT017,,Supermarket Type1,Tier 2,926
7,OUT046,Small,Supermarket Type1,Tier 1,930
8,OUT035,Small,Supermarket Type1,Tier 2,930
9,OUT019,Small,Grocery Store,Tier 1,528


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

df_impute = train[['Outlet_Size', 'Outlet_Type', 'Outlet_Location_Type']].copy()

le_type = LabelEncoder()
le_loc = LabelEncoder()
le_size = LabelEncoder()

df_impute['Outlet_Type'] = le_type.fit_transform(df_impute['Outlet_Type'])
df_impute['Outlet_Location_Type'] = le_loc.fit_transform(df_impute['Outlet_Location_Type'])

known_size = df_impute[df_impute['Outlet_Size'].notnull()].copy()
unknown_size = df_impute[df_impute['Outlet_Size'].isnull()].copy()

known_size['Outlet_Size_Encoded'] = le_size.fit_transform(known_size['Outlet_Size'])

X_train = known_size[['Outlet_Type', 'Outlet_Location_Type']]
y_train = known_size['Outlet_Size_Encoded']
X_predict = unknown_size[['Outlet_Type', 'Outlet_Location_Type']]

rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

predicted_encoded = rf_model.predict(X_predict)
predicted_labels = le_size.inverse_transform(predicted_encoded)

train.loc[train['Outlet_Size'].isnull(), 'Outlet_Size'] = predicted_labels

print("Imputed values for OUT045 and OUT017:")
train[train['Outlet_Identifier'].isin(['OUT045', 'OUT017'])][['Outlet_Identifier', 'Outlet_Size', 'Outlet_Type', 'Outlet_Location_Type']].drop_duplicates()

Imputed values for OUT045 and OUT017:


Unnamed: 0,Outlet_Identifier,Outlet_Size,Outlet_Type,Outlet_Location_Type
8,OUT045,Small,Supermarket Type1,Tier 2
9,OUT017,Small,Supermarket Type1,Tier 2


In [12]:
# Calculate missing values
missing_data = train.isnull().sum()
missing_percent = (train.isnull().sum() / len(train)) * 100

# Create a DataFrame for the missing data analysis
missing_df = pd.DataFrame({'Missing Count': missing_data, 'Missing Percentage': missing_percent})
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values(by='Missing Percentage', ascending=False)

print("Missing Data Summary:")
print(missing_df)

Missing Data Summary:
             Missing Count  Missing Percentage
Item_Weight           1463           17.165317


In [13]:
train.Item_Identifier.describe()

Unnamed: 0,Item_Identifier
count,8523
unique,1559
top,FDW13
freq,10


In [14]:
train.Item_Weight.describe()

Unnamed: 0,Item_Weight
count,7060.0
mean,12.857645
std,4.643456
min,4.555
25%,8.77375
50%,12.6
75%,16.85
max,21.35


In [15]:
items = train[['Item_Identifier', 'Item_Weight']].drop_duplicates()
items = items.sort_values(by='Item_Identifier')
items_count = pd.DataFrame(items['Item_Identifier'].value_counts())
items = items.join(items_count, on='Item_Identifier', how='left')
items.columns = ['Item_Identifier', 'Item_Weight', 'Item_Count']
items = items.sort_values(by='Item_Count')
items.reset_index(drop=True)
items

Unnamed: 0,Item_Identifier,Item_Weight,Item_Count
2657,DRB24,8.785,1
1431,DRB13,6.115,1
503,FDC40,16.000,1
1418,FDC39,7.405,1
2481,FDC38,15.700,1
...,...,...,...
2084,NCZ53,,2
1267,NCZ53,9.600,2
1896,NCZ54,,2
214,NCZ54,14.650,2


In [16]:
items_with_missing = items[items['Item_Count'] == 2]

print("\nVerifying the pattern (Value vs NaN):")
items_with_missing.sort_values(by='Item_Identifier')


Verifying the pattern (Value vs NaN):


Unnamed: 0,Item_Identifier,Item_Weight,Item_Count
2879,DRA24,,2
1148,DRA24,19.35,2
1876,DRA59,8.27,2
6057,DRA59,,2
3465,DRB01,7.39,2
...,...,...,...
4523,NCZ30,,2
2084,NCZ53,,2
1267,NCZ53,9.60,2
1896,NCZ54,,2


In [17]:
item_weight_map = train[['Item_Identifier', 'Item_Weight']].dropna().drop_duplicates()
item_weight_map = item_weight_map.set_index('Item_Identifier')['Item_Weight']

train['Item_Weight'] = train['Item_Weight'].fillna(train['Item_Identifier'].map(item_weight_map))

missing_data = train.isnull().sum()
missing_percent = (train.isnull().sum() / len(train)) * 100

missing_df = pd.DataFrame({'Missing Count': missing_data, 'Missing Percentage': missing_percent})
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values(by='Missing Percentage', ascending=False)

print("Missing Data Summary:")
print(missing_df)

Missing Data Summary:
             Missing Count  Missing Percentage
Item_Weight              4            0.046932


In [18]:
missing_rows = train[train.isnull().any(axis=1)]

print(f"Total rows with missing data: {len(missing_rows)}")
print("\nSample of missing rows:")
missing_rows

Total rows with missing data: 4

Sample of missing rows:


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
927,FDN52,,Regular,0.130933,Frozen Foods,86.9198,OUT027,1985,Medium,Tier 3,Supermarket Type3,1569.9564
1922,FDK57,,Low Fat,0.079904,Snack Foods,120.044,OUT027,1985,Medium,Tier 3,Supermarket Type3,4434.228
4187,FDE52,,Regular,0.029742,Dairy,88.9514,OUT027,1985,Medium,Tier 3,Supermarket Type3,3453.5046
5022,FDQ60,,Regular,0.191501,Baking Goods,121.2098,OUT019,1985,Small,Tier 1,Grocery Store,120.5098


In [19]:
items = train[['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP']].drop_duplicates()
items_Baking_Goods = items[(items.Item_Type == 'Baking Goods') & (items.Item_Fat_Content == 'Regular')]
correlation = items_Baking_Goods['Item_Weight'].corr(items_Baking_Goods['Item_MRP'])
print(f"Correlation between Item_Weight and Item_MRP: {correlation:.4f}")
items_Baking_Goods[['Item_Weight', 'Item_MRP']]

Correlation between Item_Weight and Item_MRP: 0.2194


Unnamed: 0,Item_Weight,Item_MRP
5,10.395,51.4008
21,8.315,144.5444
48,15.850,60.6220
71,15.850,59.2220
84,12.600,171.3764
...,...,...
8361,17.750,154.0656
8381,6.215,226.1062
8441,16.500,95.2068
8465,16.000,180.5634


In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

train_subset = items_Baking_Goods.dropna(subset=['Item_Weight', 'Item_MRP'])

X_train = train_subset[['Item_MRP']]
y_train = train_subset['Item_Weight']

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_train_pred = rf_model.predict(X_train)

r2 = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)

print("--- Model Training Performance ---")
print(f"R-squared Score: {r2:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

missing_mrp = np.array([[121.2098]])
predicted_weight = rf_model.predict(missing_mrp)[0]

print(f"Predicted Item_Weight for MRP 121.2098: {predicted_weight:.4f}")

train.loc[5022, 'Item_Weight'] = predicted_weight

--- Model Training Performance ---
R-squared Score: 0.9006
Mean Squared Error (MSE): 2.4149
Root Mean Squared Error (RMSE): 1.5540
Predicted Item_Weight for MRP 121.2098: 7.1279




In [21]:
items = train[['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP']].drop_duplicates()
items_Dairy = items[(items.Item_Type == 'Dairy') & (items.Item_Fat_Content == 'Regular')]
correlation = items_Dairy['Item_Weight'].corr(items_Dairy['Item_MRP'])
print(f"Correlation between Item_Weight and Item_MRP: {correlation:.4f}")
items_Dairy

Correlation between Item_Weight and Item_MRP: 0.3250


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Type,Item_MRP
11,18.500,Regular,Dairy,144.1102
28,5.925,Regular,Dairy,45.5086
49,10.195,Regular,Dairy,196.8794
67,13.650,Regular,Dairy,186.0240
183,5.750,Regular,Dairy,112.8176
...,...,...,...,...
8134,5.325,Regular,Dairy,53.2298
8170,11.350,Regular,Dairy,181.9608
8184,20.250,Regular,Dairy,248.1092
8194,18.250,Regular,Dairy,157.8630


In [22]:
train_subset = items_Dairy.dropna(subset=['Item_Weight', 'Item_MRP'])

X_train = train_subset[['Item_MRP']]
y_train = train_subset['Item_Weight']

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_train_pred = rf_model.predict(X_train)

r2 = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)

print("--- Model Training Performance ---")
print(f"R-squared Score: {r2:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

missing_mrp = np.array([[88.9514]])
predicted_weight = rf_model.predict(missing_mrp)[0]

print(f"Predicted Item_Weight for MRP 88.9514: {predicted_weight:.4f}")

train.loc[4187, 'Item_Weight'] = predicted_weight

--- Model Training Performance ---
R-squared Score: 0.9239
Mean Squared Error (MSE): 1.7553
Root Mean Squared Error (RMSE): 1.3249
Predicted Item_Weight for MRP 88.9514: 11.3500




In [23]:
items = train[['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP']].drop_duplicates()
items_Snack_Foods	 = items[(items.Item_Type == 'Snack Foods') & (items.Item_Fat_Content == 'Low Fat')]
correlation = items_Snack_Foods['Item_Weight'].corr(items_Snack_Foods['Item_MRP'])
print(f"Correlation between Item_Weight and Item_MRP: {correlation:.4f}")
items_Snack_Foods

Correlation between Item_Weight and Item_MRP: 0.1056


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Type,Item_MRP
7,19.000,Low Fat,Snack Foods,107.7622
32,18.700,Low Fat,Snack Foods,256.6672
39,15.250,Low Fat,Snack Foods,87.6198
42,13.600,Low Fat,Snack Foods,192.9136
55,7.905,Low Fat,Snack Foods,249.0408
...,...,...,...,...
8479,8.395,Low Fat,Snack Foods,99.1042
8480,7.315,Low Fat,Snack Foods,154.5340
8492,9.300,Low Fat,Snack Foods,104.4964
8495,9.695,Low Fat,Snack Foods,160.4920


In [24]:
train_subset = items_Snack_Foods.dropna(subset=['Item_Weight', 'Item_MRP'])

X_train = train_subset[['Item_MRP']]
y_train = train_subset['Item_Weight']

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_train_pred = rf_model.predict(X_train)

r2 = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)

print("--- Model Training Performance ---")
print(f"R-squared Score: {r2:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

missing_mrp = np.array([[120.0440]])
predicted_weight = rf_model.predict(missing_mrp)[0]

print(f"Predicted Item_Weight for MRP 120.0440: {predicted_weight:.4f}")

train.loc[1922, 'Item_Weight'] = predicted_weight

--- Model Training Performance ---
R-squared Score: 0.8221
Mean Squared Error (MSE): 3.6939
Root Mean Squared Error (RMSE): 1.9220
Predicted Item_Weight for MRP 120.0440: 7.9246




In [25]:
items = train[['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP']].drop_duplicates()
items_Frozen_Foods	 = items[(items.Item_Type == 'Frozen Foods') & (items.Item_Fat_Content == 'Regular')]
correlation = items_Frozen_Foods['Item_Weight'].corr(items_Frozen_Foods['Item_MRP'])
print(f"Correlation between Item_Weight and Item_MRP: {correlation:.4f}")
items_Frozen_Foods

Correlation between Item_Weight and Item_MRP: -0.0531


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Type,Item_MRP
8,16.20,Regular,Frozen Foods,96.9726
9,19.20,Regular,Frozen Foods,187.8214
24,13.85,Regular,Frozen Foods,165.0210
63,15.50,Regular,Frozen Foods,51.0692
75,8.39,Regular,Frozen Foods,114.0176
...,...,...,...,...
8345,7.47,Regular,Frozen Foods,215.6218
8419,19.20,Regular,Frozen Foods,189.8214
8429,19.75,Regular,Frozen Foods,181.5660
8432,14.00,Regular,Frozen Foods,54.3640


In [26]:
train_subset = items_Frozen_Foods.dropna(subset=['Item_Weight', 'Item_MRP'])

X_train = train_subset[['Item_MRP']]
y_train = train_subset['Item_Weight']

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_train_pred = rf_model.predict(X_train)

r2 = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)

print("--- Model Training Performance ---")
print(f"R-squared Score: {r2:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

missing_mrp = np.array([[86.9198]])
predicted_weight = rf_model.predict(missing_mrp)[0]

print(f"Predicted Item_Weight for MRP 86.9198: {predicted_weight:.4f}")

train.loc[927, 'Item_Weight'] = predicted_weight

--- Model Training Performance ---
R-squared Score: 0.8781
Mean Squared Error (MSE): 2.1980
Root Mean Squared Error (RMSE): 1.4826
Predicted Item_Weight for MRP 86.9198: 14.4348




In [27]:
missing_rows = train[train.isnull().any(axis=1)]

print(f"Total rows with missing data: {len(missing_rows)}")
print("\nSample of missing rows:")
missing_rows

Total rows with missing data: 0

Sample of missing rows:


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales


In [28]:
test = pd.read_csv('test_AbJTz2l.csv')
print(test.shape)
print(test.info())
print("Duplicates", test.duplicated().sum())
test

(5681, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 0 to 5680
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            5681 non-null   object 
 1   Item_Weight                4705 non-null   float64
 2   Item_Fat_Content           5681 non-null   object 
 3   Item_Visibility            5681 non-null   float64
 4   Item_Type                  5681 non-null   object 
 5   Item_MRP                   5681 non-null   float64
 6   Outlet_Identifier          5681 non-null   object 
 7   Outlet_Establishment_Year  5681 non-null   int64  
 8   Outlet_Size                4075 non-null   object 
 9   Outlet_Location_Type       5681 non-null   object 
 10  Outlet_Type                5681 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 488.3+ KB
None
Duplicates 0


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.750,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.300,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.600,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.0340,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.2300,OUT027,1985,Medium,Tier 3,Supermarket Type3
...,...,...,...,...,...,...,...,...,...,...,...
5676,FDB58,10.500,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1
5677,FDD47,7.600,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2
5678,NCO17,10.000,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,,Tier 2,Supermarket Type1
5679,FDJ26,15.300,Regular,0.000000,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1


In [29]:
missing_rows = test[test.isnull().any(axis=1)]

print(f"Total rows with missing data: {len(missing_rows)}")
print("\nSample of missing rows:")
missing_rows

Total rows with missing data: 2582

Sample of missing rows:


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
1,FDW14,8.300,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.600,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.0340,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.2300,OUT027,1985,Medium,Tier 3,Supermarket Type3
7,FDC48,,Low Fat,0.015782,Baking Goods,81.0592,OUT027,1985,Medium,Tier 3,Supermarket Type3
...,...,...,...,...,...,...,...,...,...,...,...
5669,FDN31,,Low Fat,0.072529,Fruits and Vegetables,188.0530,OUT027,1985,Medium,Tier 3,Supermarket Type3
5670,FDO03,10.395,Regular,0.037092,Meat,229.4352,OUT017,2007,,Tier 2,Supermarket Type1
5678,NCO17,10.000,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,,Tier 2,Supermarket Type1
5679,FDJ26,15.300,Regular,0.000000,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1


In [30]:
# Calculate missing values
missing_data = test.isnull().sum()
missing_percent = (test.isnull().sum() / len(test)) * 100

# Create a DataFrame for the missing data analysis
missing_df = pd.DataFrame({'Missing Count': missing_data, 'Missing Percentage': missing_percent})
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values(by='Missing Percentage', ascending=False)

print("Missing Data Summary:")
print(missing_df)

Missing Data Summary:
             Missing Count  Missing Percentage
Outlet_Size           1606           28.269671
Item_Weight            976           17.180074


In [32]:
set(list(test.Outlet_Identifier))

{'OUT010',
 'OUT013',
 'OUT017',
 'OUT018',
 'OUT019',
 'OUT027',
 'OUT035',
 'OUT045',
 'OUT046',
 'OUT049'}

In [33]:
set(list(train.Outlet_Identifier))

{'OUT010',
 'OUT013',
 'OUT017',
 'OUT018',
 'OUT019',
 'OUT027',
 'OUT035',
 'OUT045',
 'OUT046',
 'OUT049'}

In [35]:
outlet_size_map = train.groupby('Outlet_Identifier')['Outlet_Size'].first()

print("Outlet Size Map derived from Train:")
print(outlet_size_map)

test['Outlet_Size'] = test['Outlet_Size'].fillna(test['Outlet_Identifier'].map(outlet_size_map))
missing_data = test.isnull().sum()
missing_percent = (test.isnull().sum() / len(test)) * 100

missing_df = pd.DataFrame({'Missing Count': missing_data, 'Missing Percentage': missing_percent})
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values(by='Missing Percentage', ascending=False)

print("Missing Data Summary:")
print(missing_df)

Outlet Size Map derived from Train:
Outlet_Identifier
OUT010     Small
OUT013      High
OUT017     Small
OUT018    Medium
OUT019     Small
OUT027    Medium
OUT035     Small
OUT045     Small
OUT046     Small
OUT049    Medium
Name: Outlet_Size, dtype: object
Missing Data Summary:
             Missing Count  Missing Percentage
Item_Weight            976           17.180074


In [37]:
train_ids = set(train['Item_Identifier'].unique())
test_ids = set(test['Item_Identifier'].unique())

new_items_in_test = test_ids - train_ids

print(f"Total unique items in Train: {len(train_ids)}")
print(f"Total unique items in Test: {len(test_ids)}")
print(f"Number of new items in Test (not in Train): {len(new_items_in_test)}")

missing_weight_new_items = test[
    (test['Item_Identifier'].isin(new_items_in_test)) &
    (test['Item_Weight'].isnull())
]

print(f"New items that ALSO have missing weights: {len(missing_weight_new_items)}")

if len(missing_weight_new_items) > 0:
    print(missing_weight_new_items[['Item_Identifier', 'Item_Type', 'Item_Weight']].head())

Total unique items in Train: 1559
Total unique items in Test: 1543
Number of new items in Test (not in Train): 0
New items that ALSO have missing weights: 0


In [38]:
weight_counts = train.groupby('Item_Identifier')['Item_Weight'].nunique()

inconsistent_items = weight_counts[weight_counts > 1]

print(f"Number of items with conflicting weights: {len(inconsistent_items)}")

if len(inconsistent_items) > 0:
    print("\nExamples of inconsistent items:")
    print(inconsistent_items.head())
else:
    print("Verification Successful: Every Item_Identifier has exactly one unique weight.")

Number of items with conflicting weights: 0
Verification Successful: Every Item_Identifier has exactly one unique weight (ignoring NaNs).


In [36]:
item_weight_map = train.groupby('Item_Identifier')['Item_Weight'].max()

test['Item_Weight'] = test['Item_Weight'].fillna(test['Item_Identifier'].map(item_weight_map))

print("Missing Item_Weight in Test after mapping from Train:", test['Item_Weight'].isnull().sum())

Missing Item_Weight in Test after mapping from Train: 0


In [39]:
missing_data = test.isnull().sum()
missing_percent = (test.isnull().sum() / len(test)) * 100

missing_df = pd.DataFrame({'Missing Count': missing_data, 'Missing Percentage': missing_percent})
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values(by='Missing Percentage', ascending=False)

print("Missing Data Summary:")
print(missing_df)

Missing Data Summary:
Empty DataFrame
Columns: [Missing Count, Missing Percentage]
Index: []


In [40]:
train.to_csv('train_cleaned.csv', index=False)
test.to_csv('test_cleaned.csv', index=False)