In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from geopy.distance import geodesic
from sklearn.preprocessing import StandardScaler, OneHotEncoder

warnings.filterwarnings('ignore')

In [2]:
# Load the raw training set. The path is relative to the notebook's folder:
# '.' is the current directory, '..' is one folder up.
raw_train_csv = '../data/raw/train.csv'
df = pd.read_csv(raw_train_csv)
df.head()

Unnamed: 0,City/District,Waste Type,Waste Generated (Tons/Day),Recycling Rate (%),Population Density (People/km²),Municipal Efficiency Score (1-10),Disposal Method,Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Name,"Landfill Location (Lat, Long)",Landfill Capacity (Tons),Year
0,Mumbai,Plastic,6610,68,11191,9,Composting,3056,14,Mumbai Landfill,"22.4265, 77.4931",45575,2019
1,Mumbai,Organic,1181,56,11191,5,Composting,2778,12,Mumbai Landfill,"22.4265, 77.4931",45575,2019
2,Mumbai,E-Waste,8162,53,11191,8,Incineration,3390,13,Mumbai Landfill,"22.4265, 77.4931",45575,2019
3,Mumbai,Construction,8929,56,11191,5,Landfill,1498,14,Mumbai Landfill,"22.4265, 77.4931",45575,2019
4,Mumbai,Hazardous,5032,44,11191,7,Recycling,2221,16,Mumbai Landfill,"22.4265, 77.4931",45575,2019


In [3]:
# Understand the data and its types: df.info() prints the index range and,
# for every column, its non-null count and dtype.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 13 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   City/District                      850 non-null    object
 1   Waste Type                         850 non-null    object
 2   Waste Generated (Tons/Day)         850 non-null    int64 
 3   Recycling Rate (%)                 850 non-null    int64 
 4   Population Density (People/km²)    850 non-null    int64 
 5   Municipal Efficiency Score (1-10)  850 non-null    int64 
 6   Disposal Method                    850 non-null    object
 7   Cost of Waste Management (₹/Ton)   850 non-null    int64 
 8   Awareness Campaigns Count          850 non-null    int64 
 9   Landfill Name                      850 non-null    object
 10  Landfill Location (Lat, Long)      850 non-null    object
 11  Landfill Capacity (Tons)           850 non-null    int64 
 12  Year    

In [4]:
# Data-quality report: null cells per column and number of duplicated rows.
separator = "<--------------------------------------->"

print(separator)
print("Missing Values:")
print(df.isnull().sum())  # one count per column
print(separator)
print("Duplicates:")
# duplicated() flags rows that repeat an earlier row in every column
print(f"{df.duplicated().sum()} duplicate value(s)")

<--------------------------------------->
Missing Values:
City/District                        0
Waste Type                           0
Waste Generated (Tons/Day)           0
Recycling Rate (%)                   0
Population Density (People/km²)      0
Municipal Efficiency Score (1-10)    0
Disposal Method                      0
Cost of Waste Management (₹/Ton)     0
Awareness Campaigns Count            0
Landfill Name                        0
Landfill Location (Lat, Long)        0
Landfill Capacity (Tons)             0
Year                                 0
dtype: int64
<--------------------------------------->
Duplicates:
0 duplicate value(s)


As there are no duplicates or missing values, we can go ahead and understand the data and its columns in detail.


In [5]:
# Create a copy of the data to work with, so the raw frame `df` stays
# untouched by the transformations applied to `data`.
data = df.copy()
# Display the original column labels.
data.columns

Index(['City/District', 'Waste Type', 'Waste Generated (Tons/Day)',
       'Recycling Rate (%)', 'Population Density (People/km²)',
       'Municipal Efficiency Score (1-10)', 'Disposal Method',
       'Cost of Waste Management (₹/Ton)', 'Awareness Campaigns Count',
       'Landfill Name', 'Landfill Location (Lat, Long)',
       'Landfill Capacity (Tons)', 'Year'],
      dtype='object')

In [6]:
# Rename the columns for readability: the verbose source headers (spaces,
# slashes, units in brackets) become short snake_case identifiers.
column_renames = {
    'City/District': 'City',
    'Waste Type': 'Waste_Type',
    'Waste Generated (Tons/Day)': 'Waste_Generated',
    'Recycling Rate (%)': 'Recycling_Rate',
    'Population Density (People/km²)': 'Population_Density',
    'Municipal Efficiency Score (1-10)': 'Municipal_Efficiency_Score',
    'Disposal Method': 'Disposal_Method',
    'Cost of Waste Management (₹/Ton)': 'Cost_of_Waste_Management',
    'Awareness Campaigns Count': 'Awareness_Campaigns_Count',
    'Landfill Name': 'Landfill_Name',
    'Landfill Location (Lat, Long)': 'Landfill_Location',
    'Landfill Capacity (Tons)': 'Landfill_Capacity',
    'Year': 'Year',  # label is kept as it is
}
data = data.rename(columns=column_renames)

data.columns

Index(['City', 'Waste_Type', 'Waste_Generated', 'Recycling_Rate',
       'Population_Density', 'Municipal_Efficiency_Score', 'Disposal_Method',
       'Cost_of_Waste_Management', 'Awareness_Campaigns_Count',
       'Landfill_Name', 'Landfill_Location', 'Landfill_Capacity', 'Year'],
      dtype='object')

In [7]:
# Split the "lat, long" text column into two numeric columns.

# expand=True yields one column per comma-separated piece
lat_long = data["Landfill_Location"].str.split(",", expand=True)
data[["Landfill_Lat", "Landfill_Long"]] = lat_long.astype(float)
# The combined text column is no longer needed once both parts are extracted
data = data.drop(columns=["Landfill_Location"])
data.head()

Unnamed: 0,City,Waste_Type,Waste_Generated,Recycling_Rate,Population_Density,Municipal_Efficiency_Score,Disposal_Method,Cost_of_Waste_Management,Awareness_Campaigns_Count,Landfill_Name,Landfill_Capacity,Year,Landfill_Lat,Landfill_Long
0,Mumbai,Plastic,6610,68,11191,9,Composting,3056,14,Mumbai Landfill,45575,2019,22.4265,77.4931
1,Mumbai,Organic,1181,56,11191,5,Composting,2778,12,Mumbai Landfill,45575,2019,22.4265,77.4931
2,Mumbai,E-Waste,8162,53,11191,8,Incineration,3390,13,Mumbai Landfill,45575,2019,22.4265,77.4931
3,Mumbai,Construction,8929,56,11191,5,Landfill,1498,14,Mumbai Landfill,45575,2019,22.4265,77.4931
4,Mumbai,Hazardous,5032,44,11191,7,Recycling,2221,16,Mumbai Landfill,45575,2019,22.4265,77.4931


In [8]:
# Show the dtype of every column of the working frame.
data.dtypes

City                           object
Waste_Type                     object
Waste_Generated                 int64
Recycling_Rate                  int64
Population_Density              int64
Municipal_Efficiency_Score      int64
Disposal_Method                object
Cost_of_Waste_Management        int64
Awareness_Campaigns_Count       int64
Landfill_Name                  object
Landfill_Capacity               int64
Year                            int64
Landfill_Lat                  float64
Landfill_Long                 float64
dtype: object

In [9]:
# Count the distinct values in each column.
data.nunique()

City                           34
Waste_Type                      5
Waste_Generated               807
Recycling_Rate                 56
Population_Density             33
Municipal_Efficiency_Score      6
Disposal_Method                 4
Cost_of_Waste_Management      780
Awareness_Campaigns_Count      21
Landfill_Name                  34
Landfill_Capacity              34
Year                            5
Landfill_Lat                   34
Landfill_Long                  34
dtype: int64

One thing we can notice is that City, Landfill Name, Landfill Capacity, Latitude and Longitude all have 34 unique values, which suggests that each city maps to exactly one landfill (with a single name, capacity and location). If this one-to-one correspondence between city and landfill name holds, we can drop Landfill Name.

In [10]:
# Partition the feature columns (everything except the target,
# Recycling_Rate) into categorical (object dtype) and numerical columns.

cols, cat_cols, num_cols = [], [], []
for name in data.columns:
    if name == "Recycling_Rate":
        continue  # target variable is not a feature
    cols.append(name)
    if data[name].dtype == 'O':
        cat_cols.append(name)
    else:
        num_cols.append(name)

print("Total Columns (w/o target variable):", len(cols))
print(f"Categorical Columns: {len(cat_cols)}, {cat_cols}")
print(f"Numerical Columns: {len(num_cols)}", num_cols)

# Cardinality and distinct values of each categorical column
rule = "_" * 100
print(rule)
print("\nSummary of categorical columns")
for name in cat_cols:
    print("." * 100)
    print(name, ":", data[name].nunique())
    print(data[name].unique())

print(rule)

print("Summary of numerical columns")
print("-" * 30)
# Descriptive statistics of the numeric columns are shown as the cell output
data.describe()


Total Columns (w/o target variable): 13
Categorical Columns: 4, ['City', 'Waste_Type', 'Disposal_Method', 'Landfill_Name']
Numerical Columns: 9 ['Waste_Generated', 'Population_Density', 'Municipal_Efficiency_Score', 'Cost_of_Waste_Management', 'Awareness_Campaigns_Count', 'Landfill_Capacity', 'Year', 'Landfill_Lat', 'Landfill_Long']
____________________________________________________________________________________________________

Summary of categorical columns
....................................................................................................
City : 34
['Mumbai' 'Delhi' 'Bengaluru' 'Chennai' 'Kolkata' 'Hyderabad' 'Pune'
 'Ahmedabad' 'Jaipur' 'Lucknow' 'Surat' 'Kanpur' 'Nagpur' 'Patna' 'Bhopal'
 'Thiruvananthapuram' 'Indore' 'Vadodara' 'Guwahati' 'Coimbatore' 'Ranchi'
 'Amritsar' 'Jodhpur' 'Varanasi' 'Ludhiana' 'Agra' 'Meerut' 'Nashik'
 'Rajkot' 'Madurai' 'Jabalpur' 'Allahabad' 'Visakhapatnam' 'Gwalior']
...............................................................

Unnamed: 0,Waste_Generated,Recycling_Rate,Population_Density,Municipal_Efficiency_Score,Cost_of_Waste_Management,Awareness_Campaigns_Count,Landfill_Capacity,Year,Landfill_Lat,Landfill_Long
count,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0
mean,5262.249412,57.076471,13489.705882,7.4,2778.458824,9.904706,58934.617647,2021.0,21.671862,83.638241
std,2786.984735,16.129994,6631.081494,1.722162,1276.32563,6.070772,19413.627292,1.415046,8.754971,7.745195
min,511.0,30.0,2335.0,5.0,503.0,0.0,22690.0,2019.0,8.9824,68.7432
25%,2865.75,43.0,7927.0,6.0,1647.5,5.0,45575.0,2020.0,13.4978,77.0472
50%,5283.0,56.0,12579.5,7.0,2853.0,10.0,61038.5,2021.0,19.5881,84.5308
75%,7757.25,71.0,19087.0,9.0,3855.0,15.0,71127.0,2022.0,30.6193,89.516
max,9980.0,85.0,24032.0,10.0,4999.0,20.0,98646.0,2023.0,35.9642,95.8263


In [11]:
# The landfill name appears to be just the city name followed by the word
# "Landfill". Confirm this by rebuilding the name from City and counting the
# rows where it differs from Landfill_Name.
copy_df = data.assign(Redundancy_Check=data["City"] + " Landfill")
mismatch_mask = copy_df["Redundancy_Check"].ne(copy_df["Landfill_Name"])
print("Count_of_Mismatch:", mismatch_mask.sum())


Count_of_Mismatch: 0


So there is no mismatch, which confirms that one of these variables is redundant and we can drop one of them. We will drop Landfill Name.

In [12]:
# Landfill_Name carries the same information as City, so remove it.
data = data.drop(columns=["Landfill_Name"])

In [13]:
# Feature engineering
# Approximate coordinates for the cities present in the dataset, keyed by the
# value of the City column. Each tuple is ordered (latitude, longitude), the
# same order as the (Landfill_Lat, Landfill_Long) tuple it is compared with
# in calculate_distance.
city_coords = {
    'Mumbai': (19.08, 72.88),
    'Delhi': (28.65, 77.23),
    'Bengaluru': (12.97, 77.59),
    'Chennai': (13.08, 80.27),
    'Kolkata': (22.57, 88.36),
    'Hyderabad': (17.39, 78.49),
    'Pune': (18.52, 73.86),
    'Ahmedabad': (23.02, 72.57),
    'Jaipur': (26.91, 75.79),
    'Lucknow': (26.85, 80.95),
    'Surat': (21.17, 72.83),
    'Kanpur': (26.47, 80.33),
    'Nagpur': (21.15, 79.08),
    'Patna': (25.59, 85.14),
    'Bhopal': (23.26, 77.41),
    'Thiruvananthapuram': (8.52, 76.94),
    'Indore': (22.72, 75.88),
    'Vadodara': (22.31, 73.18),
    'Guwahati': (26.18, 91.75),
    'Coimbatore': (11.02, 76.96),
    'Ranchi': (23.34, 85.31),
    'Amritsar': (31.63, 74.87),
    'Jodhpur': (26.28, 73.02),
    'Varanasi': (25.32, 82.97),
    'Ludhiana': (30.90, 75.85),
    'Agra': (27.18, 78.01),
    'Meerut': (28.98, 77.71),
    'Nashik': (20.00, 73.78),
    'Rajkot': (22.30, 70.80),
    'Madurai': (9.93, 78.12),
    'Jabalpur': (23.17, 79.94),
    'Allahabad': (25.44, 81.85),
    'Visakhapatnam': (17.69, 83.22),
    'Gwalior': (26.22, 78.18),
}


def calculate_distance(row):
    """Return the geodesic distance, in km, between a row's city and its landfill.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "City", "Landfill_Lat" and "Landfill_Long".

    Returns
    -------
    float
        Distance in kilometres, or ``np.nan`` when the city has no entry in
        ``city_coords`` or either landfill coordinate is missing.
    """
    city_coordinates = city_coords.get(row["City"])
    if city_coordinates is None:
        return np.nan

    landfill_lat, landfill_long = row["Landfill_Lat"], row["Landfill_Long"]
    # No meaningful distance can be computed from a missing coordinate, so
    # report it the same way as an unknown city; the null check below
    # surfaces both cases.
    if pd.isna(landfill_lat) or pd.isna(landfill_long):
        return np.nan

    return geodesic(city_coordinates, (landfill_lat, landfill_long)).km


# Row-wise application: each row supplies its own city and landfill position
data["Distance_to_Landfill_km"] = data.apply(calculate_distance, axis=1)


# Check for any NaN values in the new column
data["Distance_to_Landfill_km"].isnull().sum()

np.int64(0)

We have engineered a new feature which measures the distance of the landfill from the city.
This is important because in real life the recycling rate might be affected by how far the landfill is located from the city: the distance adds to cost and influences real-world decision making.

In [14]:
# Add a temporal feature: the number of years elapsed since 2019, the base
# (earliest) year in the data, to help capture the trend over the years.

print("Years in data:",data["Year"].unique())

# Derive the feature from the working frame `data`, like every other
# engineered column, instead of reaching back into the raw frame `df`.
data["Years_Since_2019"] = data["Year"] - 2019

data.tail()

Years in data: [2019 2020 2021 2022 2023]


Unnamed: 0,City,Waste_Type,Waste_Generated,Recycling_Rate,Population_Density,Municipal_Efficiency_Score,Disposal_Method,Cost_of_Waste_Management,Awareness_Campaigns_Count,Landfill_Capacity,Year,Landfill_Lat,Landfill_Long,Distance_to_Landfill_km,Years_Since_2019
845,Gwalior,Plastic,6842,42,11280,8,Recycling,3546,15,54460,2023,10.9566,91.6565,2204.733124,4
846,Gwalior,Organic,5233,38,11280,10,Recycling,1146,5,54460,2023,10.9566,91.6565,2204.733124,4
847,Gwalior,E-Waste,9903,41,11280,7,Landfill,3260,4,54460,2023,10.9566,91.6565,2204.733124,4
848,Gwalior,Construction,7540,77,11280,6,Composting,4220,7,54460,2023,10.9566,91.6565,2204.733124,4
849,Gwalior,Hazardous,2480,58,11280,6,Composting,1081,19,54460,2023,10.9566,91.6565,2204.733124,4


In [15]:
# Encoding categorical features
# Feature columns exclude the target (Recycling_Rate) and Year, which is
# represented by Years_Since_2019.
cols = [col for col in data.columns if col != "Recycling_Rate" and col !="Year"]
cat_cols = [col for col in cols if data[col].dtype =='O']
num_cols = [col for col in cols if data[col].dtype !='O']

print("Total Columns (w/o target variable):", len(cols))
print(f"Categorical Columns: {len(cat_cols)}, {cat_cols}")
print(f"Numerical Columns: {len(num_cols)}", num_cols)

# One indicator column per category; dense output so it can be wrapped in a
# DataFrame directly.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cat = encoder.fit_transform(data[cat_cols])
# Reuse data's index: pd.concat(axis=1) aligns rows by index label, so a
# default RangeIndex here would misalign rows (and introduce NaNs) whenever
# `data` does not carry a plain 0..n-1 index.
data_encoded = pd.DataFrame(
    encoded_cat,
    columns=encoder.get_feature_names_out(cat_cols),
    index=data.index,
)
df_encoded = pd.concat([data.drop(columns=cat_cols), data_encoded], axis=1)
# Year, Landfill_Lat and Landfill_Long are removed from the modelling frame
# only; they remain available in `data`.
df_encoded = df_encoded.drop(columns=["Year", "Landfill_Lat", "Landfill_Long"])
df_encoded.head()

Total Columns (w/o target variable): 13
Categorical Columns: 3, ['City', 'Waste_Type', 'Disposal_Method']
Numerical Columns: 10 ['Waste_Generated', 'Population_Density', 'Municipal_Efficiency_Score', 'Cost_of_Waste_Management', 'Awareness_Campaigns_Count', 'Landfill_Capacity', 'Landfill_Lat', 'Landfill_Long', 'Distance_to_Landfill_km', 'Years_Since_2019']


Unnamed: 0,Waste_Generated,Recycling_Rate,Population_Density,Municipal_Efficiency_Score,Cost_of_Waste_Management,Awareness_Campaigns_Count,Landfill_Capacity,Distance_to_Landfill_km,Years_Since_2019,City_Agra,...,City_Visakhapatnam,Waste_Type_Construction,Waste_Type_E-Waste,Waste_Type_Hazardous,Waste_Type_Organic,Waste_Type_Plastic,Disposal_Method_Composting,Disposal_Method_Incineration,Disposal_Method_Landfill,Disposal_Method_Recycling
0,6610,68,11191,9,3056,14,45575,606.595268,0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,1181,56,11191,5,2778,12,45575,606.595268,0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,8162,53,11191,8,3390,13,45575,606.595268,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,8929,56,11191,5,1498,14,45575,606.595268,0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5032,44,11191,7,2221,16,45575,606.595268,0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


* We can drop the Year column as we already have Years_Since_2019, which captures the temporal trend, but we will keep it in our raw data (data).
* We are also dropping the latitude and longitude columns, as their information is already captured in another feature, Distance_to_Landfill_km.


In [16]:
# Numerical feature scaling: standardise the numeric features in place of
# their raw values. The target and the one-hot columns are not in the list.
num_cols = [
    'Waste_Generated',
    'Population_Density',
    'Municipal_Efficiency_Score',
    'Cost_of_Waste_Management',
    'Awareness_Campaigns_Count',
    'Landfill_Capacity',
    'Distance_to_Landfill_km',
    'Years_Since_2019',
]
print(f"Columns to scale ({len(num_cols)}):", num_cols)

scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded[num_cols])
df_encoded[num_cols] = scaled_values
df_encoded.head()




Columns to scale (8): ['Waste_Generated', 'Population_Density', 'Municipal_Efficiency_Score', 'Cost_of_Waste_Management', 'Awareness_Campaigns_Count', 'Landfill_Capacity', 'Distance_to_Landfill_km', 'Years_Since_2019']


Unnamed: 0,Waste_Generated,Recycling_Rate,Population_Density,Municipal_Efficiency_Score,Cost_of_Waste_Management,Awareness_Campaigns_Count,Landfill_Capacity,Distance_to_Landfill_km,Years_Since_2019,City_Agra,...,City_Visakhapatnam,Waste_Type_Construction,Waste_Type_E-Waste,Waste_Type_Hazardous,Waste_Type_Organic,Waste_Type_Plastic,Disposal_Method_Composting,Disposal_Method_Incineration,Disposal_Method_Landfill,Disposal_Method_Recycling
0,0.483872,68,-0.34686,0.929612,0.217581,0.674989,-0.688562,-1.620291,-1.414214,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,-1.465258,56,-0.34686,-1.394418,-0.00036,0.345348,-0.688562,-1.620291,-1.414214,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1.041074,53,-0.34686,0.348604,0.479424,0.510168,-0.688562,-1.620291,-1.414214,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.316444,56,-0.34686,-1.394418,-1.003829,0.674989,-0.688562,-1.620291,-1.414214,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.082665,44,-0.34686,-0.232403,-0.437026,1.00463,-0.688562,-1.620291,-1.414214,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
# Save the processed (encoded + scaled) data.
processed_csv = Path("../data/processed/cleaned_data.csv")
# to_csv does not create missing folders, so make sure the target exists
processed_csv.parent.mkdir(parents=True, exist_ok=True)
df_encoded.to_csv(processed_csv, index=False)

# Save the data that is neither scaled nor encoded.
# NOTE(review): this file is written to the current working directory rather
# than next to cleaned_data.csv -- confirm that is the intended location.
data.to_csv("prepared_data.csv", index=False)