In [5]:
import warnings
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

In [6]:
data = pd.read_csv('Datasets/lifestyle_sustainability_data.csv')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   ParticipantID                  499 non-null    int64 
 1   Age                            499 non-null    int64 
 2   Location                       499 non-null    object
 3   DietType                       499 non-null    object
 4   LocalFoodFrequency             499 non-null    object
 5   TransportationMode             499 non-null    object
 6   EnergySource                   499 non-null    object
 7   HomeType                       499 non-null    object
 8   HomeSize                       499 non-null    int64 
 9   ClothingFrequency              499 non-null    object
 10  SustainableBrands              499 non-null    bool  
 11  EnvironmentalAwareness         499 non-null    int64 
 12  CommunityInvolvement           381 non-null    object
 13  Month

In [10]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ParticipantID,499.0,250.024048,144.233925,1.0,125.5,250.0,374.5,500.0
Age,499.0,44.052104,14.904828,18.0,31.0,44.0,58.0,96.0
HomeSize,499.0,1518.046092,672.212235,407.0,951.5,1450.0,1995.0,2997.0
EnvironmentalAwareness,499.0,3.062124,1.342145,1.0,2.0,3.0,4.0,5.0
MonthlyElectricityConsumption,499.0,291.963928,120.482275,55.0,185.0,295.0,400.0,498.0
MonthlyWaterConsumption,499.0,3139.521042,1224.609306,536.0,2000.0,3350.0,4100.0,5250.0
Rating,499.0,3.430862,1.502585,1.0,2.0,4.0,5.0,5.0


In [12]:
data.head()

Unnamed: 0,ParticipantID,Age,Location,DietType,LocalFoodFrequency,TransportationMode,EnergySource,HomeType,HomeSize,ClothingFrequency,SustainableBrands,EnvironmentalAwareness,CommunityInvolvement,MonthlyElectricityConsumption,MonthlyWaterConsumption,Gender,UsingPlasticProducts,DisposalMethods,PhysicalActivities,Rating
0,1,35,Urban,Mostly Plant-Based,Often,Bike,Renewable,Apartment,800,Rarely,True,5,High,100,1500,Female,Rarely,Composting,High,5
1,2,28,Suburban,Balanced,Sometimes,Public Transit,Mixed,House,1500,Sometimes,True,4,Moderate,250,3000,Male,Sometimes,Recycling,Moderate,4
2,3,65,Rural,Mostly Animal-Based,Rarely,Car,Non-Renewable,House,2500,Often,False,2,Low,400,4500,Male,Often,Landfill,Low,1
3,4,42,Urban,Mostly Plant-Based,Often,Walk,Renewable,Apartment,950,Sometimes,True,4,Moderate,150,2000,Female,Rarely,Recycling,High,5
4,5,31,Suburban,Balanced,Sometimes,Public Transit,Mixed,House,1800,Often,True,3,Low,300,3500,Non-Binary,Sometimes,Combination,Moderate,3


In [13]:
data.isnull().sum().sort_values(ascending=False)

CommunityInvolvement             118
PhysicalActivities               108
ParticipantID                      0
Age                                0
DisposalMethods                    0
UsingPlasticProducts               0
Gender                             0
MonthlyWaterConsumption            0
MonthlyElectricityConsumption      0
EnvironmentalAwareness             0
SustainableBrands                  0
ClothingFrequency                  0
HomeSize                           0
HomeType                           0
EnergySource                       0
TransportationMode                 0
LocalFoodFrequency                 0
DietType                           0
Location                           0
Rating                             0
dtype: int64

In [16]:
data.duplicated().any()

False

In [18]:
cat_features = data.select_dtypes(include='O').columns
no_features = data.select_dtypes(include=np.number).drop(columns=['ParticipantID']).columns

In [20]:
fig = px.pie(data, names='SustainableBrands')
fig.update_layout(legend_title='SustainableBrands', title={'text': 'Distribution of SustainableBrands', 'y':0.95, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()

In [21]:
for cat in cat_features:
    fig = px.histogram(data, y=cat, color = cat).update_yaxes(categoryorder = 'total ascending')
    fig.update_layout(title={'text': f'Distribution of {cat}','y':0.95, 'x':0.5, 'xanchor': 'center', 'yanchor':'top'})
    fig.show()


In [22]:
for no in no_features:
    fig = px.histogram(data, x=no, marginal="box", hover_data=data.columns)
    fig.update_layout(title={'text': f'Distribution of {no}','y':0.95, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()

In [23]:
for no in no_features:
    fig = px.scatter(data, x = no, y = 'Rating', color = no, marginal_x="histogram", marginal_y="rug", trendline="ols").update_yaxes(categoryorder = 'total ascending')
    fig.update_layout(title={'text': f'{no} and Rating Relationship','y':0.95,'x':0.5,'xanchor':'center', 'yanchor':'top'})
    fig.show()

In [26]:
data.drop(columns = ['ParticipantID'], inplace=True)

In [28]:
for cat in cat_features:
    data[cat].fillna('Missing', inplace=True)
data[cat_features].isnull().sum()

Location                0
DietType                0
LocalFoodFrequency      0
TransportationMode      0
EnergySource            0
HomeType                0
ClothingFrequency       0
CommunityInvolvement    0
Gender                  0
UsingPlasticProducts    0
DisposalMethods         0
PhysicalActivities      0
dtype: int64

In [29]:
encoder = LabelEncoder()
for cat in cat_features:
    data[cat] = encoder.fit_transform(data[cat])

In [30]:
data['Rating'] -= 1

In [33]:
scaler = MinMaxScaler()
df = pd.concat([data['Rating'].reset_index(drop=True),
                    pd.DataFrame(scaler.fit_transform(data.iloc[:, :-1]), columns=data.columns[:-1])],
                    axis=1)

In [34]:
df.head()

Unnamed: 0,Rating,Age,Location,DietType,LocalFoodFrequency,TransportationMode,EnergySource,HomeType,HomeSize,ClothingFrequency,SustainableBrands,EnvironmentalAwareness,CommunityInvolvement,MonthlyElectricityConsumption,MonthlyWaterConsumption,Gender,UsingPlasticProducts,DisposalMethods,PhysicalActivities
0,4,0.217949,1.0,1.0,0.333333,0.0,1.0,0.0,0.151737,0.666667,1.0,1.0,0.0,0.10158,0.204497,0.0,0.666667,0.333333,0.0
1,3,0.128205,0.5,0.0,1.0,0.666667,0.0,0.5,0.422008,1.0,1.0,0.75,1.0,0.440181,0.522698,0.333333,1.0,1.0,1.0
2,0,0.602564,0.0,0.5,0.666667,0.333333,0.5,0.5,0.808108,0.333333,0.0,0.25,0.333333,0.778781,0.840899,0.333333,0.333333,0.666667,0.333333
3,4,0.307692,1.0,1.0,0.333333,1.0,1.0,0.0,0.209653,1.0,1.0,0.75,1.0,0.214447,0.310564,0.0,0.666667,1.0,0.0
4,2,0.166667,0.5,0.0,1.0,0.666667,0.0,0.5,0.537838,0.333333,1.0,0.5,0.333333,0.553047,0.628765,0.666667,1.0,0.0,1.0


In [35]:
fig = px.imshow(df.corr())
fig.update_layout(title={'text': 'Correlation Between Numerical Attributes','y':0.95, 'x':0.5,'xanchor':'center', 'yanchor': 'top'})
fig.show()

In [36]:
X = data.drop(columns=['Rating'])
y = df['Rating']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForest' : RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'SVC': SVC(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'XGBoost': XGBClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = round(accuracy_score(y_test, y_pred), 4)*100
    print(name, 'Analysis\n')
    print(f'Accuracy: {accuracy}%\n')

LogisticRegression Analysis

Accuracy: 40.0%

RandomForest Analysis

Accuracy: 72.0%

DecisionTree Analysis

Accuracy: 66.0%

SVC Analysis

Accuracy: 56.00000000000001%

KNeighborsClassifier Analysis

Accuracy: 56.99999999999999%

XGBoost Analysis

Accuracy: 66.0%



In [39]:
random_grid = {'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
               'max_features': ['sqrt', 'log2', None],
               'max_depth': [int(x) for x in np.linspace(10,110,num=11)]+[None],
               'min_samples_split': [2,5,10],
               'bootstrap': [True, False],
               'criterion': ['gini', 'entropy', 'log_loss']
               }
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [40]:
rf_best = rf_random.best_estimator_
rf_best

In [41]:
rf_pred = rf_best.predict(X_test)
print('Random Forest Hyperparameter Tuned Analysis\n')
print('Random Forest Best Parameters:', rf_random.best_params_)
accuracy = round(accuracy_score(y_test, rf_pred), 4)*100
conf_matrix = confusion_matrix(y_test, rf_pred)
classification_rep = classification_report(y_test, rf_pred)
print(f"\nAccuracy: {accuracy}%")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{classification_rep}")

Random Forest Hyperparameter Tuned Analysis

Random Forest Best Parameters: {'n_estimators': 1000, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': 30, 'criterion': 'gini', 'bootstrap': False}

Accuracy: 72.0%
Confusion Matrix:
[[24  0  1  1  1]
 [ 1  1  1  0  3]
 [ 0  0 14  1  4]
 [ 2  0  0  8  7]
 [ 1  0  3  2 25]]
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.89      0.87        27
           1       1.00      0.17      0.29         6
           2       0.74      0.74      0.74        19
           3       0.67      0.47      0.55        17
           4       0.62      0.81      0.70        31

    accuracy                           0.72       100
   macro avg       0.78      0.61      0.63       100
weighted avg       0.74      0.72      0.70       100

