# Thermal Comfort Classifier V3.0
###### by Muhammad Zhafran A F, Danang Wahyu K, and Haidar Alghazian A

What's new?
* Using all features first to evaluate models
* Feature Importance
* Model evaluation using top features in feature importance

## Data Acquisition

In [None]:
import pandas as pd

In [None]:
url = "https://raw.githubusercontent.com/mzhafranaf/Final-Project-Orbit/main/darjeeling.csv"
dataset = pd.read_csv(url)

In [None]:
dataset.head()

In [None]:
dataset.info()

## Data Cleaning

In [None]:
# Number of missing values in each column of the raw data.
dataset.isna().sum()

There is an unnamed column that should be removed from our data.

In [None]:
# Flag every column whose name contains "unnamed" (case-insensitive),
# then drop the flagged columns from `dataset` in place.
unnamed_mask = dataset.columns.str.contains('unnamed', case=False)
dataset.drop(columns=dataset.columns[unnamed_mask], inplace=True)
dataset.info()

Some numeric features are declared as object type, so we should convert them to float64.

In [None]:
data = dataset.dropna()
#data

In [None]:
# remove Location and Subject Code
data = data.drop(['Location','Subject Code'], axis=1)

In [None]:
objectFeatures = ['TP','air_ts', 'air_tp', 'comfort', 'prod', 'TA', 'Uphols', 'sh_sw']

for i in objectFeatures:
    data[i] = pd.to_numeric(data[i],errors = 'coerce')
data.info()

In [None]:
data.isnull().sum()

In [None]:
data = data.dropna()

Because we want to focus on TSV, we should know how many samples there are for each TSV class.

In [None]:
data.TSV.unique()

In [None]:
data['TSV'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Bar chart of the number of rows per TSV class, with the classes in
# ascending order. ndarray.sort() sorts in place and returns None, so it
# cannot supply `order`; sorted() returns the ordered labels instead.
sns.countplot(x='TSV', data=data, order=sorted(data.TSV.unique()))

In [None]:
# data.TSV.replace({-3 : 0,-2 : 0,-1 : 0,0 : 1,1 : 2,2 : 2,3 : 2}, inplace=True)

## Data Exploration

In [None]:
# Explore the data in three sections:
# 1. Full data   --> every parameter in the dataset
# 2. Main data   --> only the 6 parameters (ASHRAE-55)
# 3. Others data --> everything except the main parameters
main_parameter = ["Ti", "RH", "Tmr", "Va", "met", "clo"]
target_label = ["TSV"]

# Every remaining column, kept in the order it appears in `data`.
reserved_columns = main_parameter + target_label
others_parameter = [column for column in data.columns if column not in reserved_columns]

# The target column is always placed last in each view.
full_data = data[main_parameter + others_parameter + target_label]
main_data = data[main_parameter + target_label]
others_data = data[others_parameter + target_label]

### Statistics

In [None]:
# Summary statistics of the full data set (all parameters plus TSV).
full_data.describe()

In [None]:
# Summary statistics of the six main parameters plus TSV.
main_data.describe()

In [None]:
# Summary statistics of the remaining parameters plus TSV.
others_data.describe()

#### Outlier Analysis

In [None]:
plt.figure(figsize=(12,8))
plt.title("Full Data Boxplot", size=18)
sns.boxplot(data=full_data)

In [None]:
plt.figure(figsize=(12,8))
plt.title("Main Data Boxplot", size=18)
sns.boxplot(data=main_data)

In [None]:
plt.figure(figsize=(12,8))
plt.title("Others Data Boxplot", size=18)
sns.boxplot(data=others_data)

### Correlation Matrix

#### Full

In [None]:
plt.figure(figsize=(20,12))
cor = full_data.corr()
sns.heatmap(cor,annot=True,cmap='rocket_r')

In [None]:
#Correlation with output variable
cor_target = abs(cor["TSV"])

#Selecting highly correlated features
# relevant_features = cor_target[cor_target>0.5]
relevant_features = cor_target[cor_target>0.3]
relevant_features

#### Main

In [None]:
plt.figure(figsize=(12,6))
cor = main_data.corr()
sns.heatmap(cor,annot=True,cmap='rocket_r')

In [None]:
#Correlation with output variable
cor_target = abs(cor["TSV"])

#Selecting highly correlated features
# relevant_features = cor_target[cor_target>0.5]
relevant_features = cor_target[cor_target>0.3]
relevant_features

#### Others

In [None]:
plt.figure(figsize=(16,9))
cor = others_data.corr()
sns.heatmap(cor,annot=True,cmap='rocket_r')

In [None]:
#Correlation with output variable
cor_target = abs(cor["TSV"])

#Selecting highly correlated features
# relevant_features = cor_target[cor_target>0.5]
relevant_features = cor_target[cor_target>0.3]
relevant_features

# Classification

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.inspection import permutation_importance

## Standardization

In [None]:
scaler = StandardScaler()

#Full data
y_full = full_data[['TSV']].values.ravel()
X_full = full_data.drop(['TSV'], axis=1)

X_full = pd.DataFrame(scaler.fit_transform(X),columns = X.columns)


#Main data
y_main = main_data[['TSV']].values.ravel()
X_main = main_data.drop(['TSV'], axis=1)

X_main = pd.DataFrame(scaler.fit_transform(X),columns = X.columns)


#Others data
y_main = others_data[['TSV']].values.ravel()
X_main = others_data.drop(['TSV'], axis=1)

X_others = pd.DataFrame(scaler.fit_transform(X),columns = X.columns)

## Splitting Data

In [None]:
X_full_train, X_full_test, y_full_train, y_full_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(X_full_train.shape, X_full_test.shape)
print(y_full_train.shape, y_full_test.shape)

In [None]:
X_main_train, X_main_test, y_main_train, y_main_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(X_main_train.shape, X_main_test.shape)
print(y_main_train.shape, y_main_test.shape)

In [None]:
X_others_train, X_others_test, y_others_train, y_others_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(X_others_train.shape, X_others_test.shape)
print(y_others_train.shape, y_others_test.shape)

## Modeling

In [None]:
# One classifier instance is refitted for each feature set below.
# The seed is fixed so that the reports and permutation importances are
# reproducible between runs, as the data split and PermutationImportance
# already are.
model = RandomForestClassifier(random_state=42)

### Full Data

In [None]:
model.fit(X_full_train, y_full_train)
y_full_pred = model.predict(X_full_test)

In [None]:
print(classification_report(y_full_test, y_full_pred))

In [None]:
cm = confusion_matrix(y_full_test, y_pred_full)
sns.heatmap(cm, cmap ='Blues', annot =True, fmt='g')

#### Permutation Importance

In [None]:
#pip install eli5
from eli5 import show_weights
from eli5.sklearn import PermutationImportance

In [None]:
# Permutation importance of the model fitted on the full feature set,
# measured on the test split and shown with the feature names.
perm = PermutationImportance(model, random_state=1)
perm.fit(X_full_test, y_full_test)
show_weights(perm, feature_names=X_full_test.columns.tolist())

### Main Data

In [None]:
model.fit(X_main_train, y_main_train)
y_main_pred = model.predict(X_main_test)

In [None]:
print(classification_report(y_main_test, y_main_pred))

In [None]:
cm = confusion_matrix(y_main_test, y_main_pred)
sns.heatmap(cm, cmap ='Blues', annot =True, fmt='g')

#### Permutation Importance

In [None]:
# Permutation importance of the model fitted on the main parameters,
# measured on the test split and shown with the feature names.
perm = PermutationImportance(model, random_state=1)
perm.fit(X_main_test, y_main_test)
show_weights(perm, feature_names=X_main_test.columns.tolist())

### Others Data

In [None]:
model.fit(X_others_train, y_others_train)
y_others_pred = model.predict(X_others_test)

In [None]:
print(classification_report(y_others_test, y_others_pred))

In [None]:
cm = confusion_matrix(y_others_test, y_others_pred)
sns.heatmap(cm, cmap ='Blues', annot =True, fmt='g')

#### Permutation Importance

In [None]:
# Permutation importance of the model fitted on the remaining parameters,
# measured on the test split and shown with the feature names.
perm = PermutationImportance(model, random_state=1)
perm.fit(X_others_test, y_others_test)
show_weights(perm, feature_names=X_others_test.columns.tolist())