# Machine learning - Wellbeing in Shanghai

## Import and preparation

In [1]:
import pandas as pd
from sklearn import tree
import numpy as np
import math
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import pickle

In [2]:
df = pd.read_pickle("C:/Users/jeane/Documents/Travail/UTSEUS/Urban Data Hackathon/pickles/meters/data4hapiness_Scaled.pk")

In [3]:
nameColumn = []
for column in df:
    if column != "clean" and column != "smell" and column != "noise" and column != "hapiness":
        nameColumn.append(column)

In [4]:
X = df[nameColumn]
Y = df['hapiness']
subcat_diff = list(set(df['hapiness']))
subcat_diff

['DirtyNotsmellyNoisy',
 'DirtyNotsmellyQuite',
 'DirtySmellyQuite',
 'CleanSmellyQuite',
 'DirtySmellyNoisy',
 'CleanSmellyNoisy',
 'CleanNotsmellyNoisy',
 'CleanNotsmellyQuite']

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

## Machine learning models

There are 8 different classes :

    - CleanNotsmellyNoisy
    - CleanNotsmellyQuite
    - CleanSmellyNoisy
    - CleanSmellyQuite
    - DirtyNotsmellyNoisy
    - DirtyNotsmellyQuite
    - DirtySmellyNoisy
    - DirtySmellyQuite
    
So with a random distribution, we get a score of 12.5.
Our goal; having a higher prediction.

### Random Forest Classifier

In [6]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=40, random_state=0, n_estimators = 100)
clf = clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)
param_grid = {'max_depth': [17], "n_estimators" : [15]}
search = GridSearchCV(RandomForestClassifier(random_state=0), param_grid, cv= ShuffleSplit(n_splits=5))
search.fit(X,Y)
np.mean(prediction == Y_test)

0.3697478991596639

In [7]:
scores = cross_val_score(clf, X, Y, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.4111111111111111

In [8]:
print("Classification report:")
print(classification_report(Y_test, prediction))

Classification report:
                     precision    recall  f1-score   support

CleanNotsmellyNoisy       0.00      0.00      0.00         7
CleanNotsmellyQuite       0.42      0.54      0.47        56
   CleanSmellyQuite       0.00      0.00      0.00         2
DirtyNotsmellyNoisy       0.00      0.00      0.00        14
DirtyNotsmellyQuite       0.31      0.40      0.35        35
   DirtySmellyQuite       0.00      0.00      0.00         5

           accuracy                           0.37       119
          macro avg       0.12      0.16      0.14       119
       weighted avg       0.29      0.37      0.32       119



  'precision', 'predicted', average, warn_for)


### Passive Agressive Classifier

In [9]:
from sklearn.linear_model import PassiveAggressiveClassifier

clf = PassiveAggressiveClassifier()
clf = clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)
search = GridSearchCV(PassiveAggressiveClassifier(), {}, cv= ShuffleSplit(n_splits=5))
search.fit(X,Y)
np.mean(prediction == Y_test)

0.35294117647058826

In [10]:
search.best_estimator_

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=1000, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [11]:
scores = cross_val_score(clf, X, Y, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.3111111111111111

In [12]:
print("Classification report:")
print(classification_report(Y_test, prediction))

Classification report:
                     precision    recall  f1-score   support

CleanNotsmellyNoisy       0.00      0.00      0.00         7
CleanNotsmellyQuite       0.54      0.45      0.49        56
   CleanSmellyNoisy       0.00      0.00      0.00         0
   CleanSmellyQuite       0.00      0.00      0.00         2
DirtyNotsmellyNoisy       0.20      0.21      0.21        14
DirtyNotsmellyQuite       0.33      0.40      0.36        35
   DirtySmellyNoisy       0.00      0.00      0.00         0
   DirtySmellyQuite       0.00      0.00      0.00         5

           accuracy                           0.35       119
          macro avg       0.13      0.13      0.13       119
       weighted avg       0.38      0.35      0.36       119



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### Ada Booste Classifier

In [13]:
from sklearn.ensemble import AdaBoostClassifier


clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf = clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)
param_grid = {"n_estimators" : [12, 13, 15]}
search = GridSearchCV(clf, param_grid, cv= ShuffleSplit(n_splits=5))
search.fit(X_train, Y_train)
np.mean(prediction == Y_test)

0.4789915966386555

In [14]:
search.best_params_

{'n_estimators': 13}

In [15]:
scores = cross_val_score(clf, X_train, Y_train, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.3583333333333334

In [16]:
print("Classification report:")
print(classification_report(Y_test, prediction))

Classification report:
                     precision    recall  f1-score   support

CleanNotsmellyNoisy       0.00      0.00      0.00         7
CleanNotsmellyQuite       0.47      0.98      0.64        56
   CleanSmellyQuite       0.00      0.00      0.00         2
DirtyNotsmellyNoisy       0.00      0.00      0.00        14
DirtyNotsmellyQuite       0.67      0.06      0.11        35
   DirtySmellyQuite       0.00      0.00      0.00         5

           accuracy                           0.48       119
          macro avg       0.19      0.17      0.12       119
       weighted avg       0.42      0.48      0.33       119



  'precision', 'predicted', average, warn_for)


In [17]:
model = AdaBoostClassifier(n_estimators=100, random_state=0).fit(X, Y) 
pickle.dump(model, open("C:/Users/jeane/Documents/Travail/UTSEUS/Urban Data Hackathon/pickles/meters/Rendu_final_US01.pkl", 'wb'))

### Linear SVC

In [18]:
from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=0)
clf = clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)
param_grid = {'C': [10000]}
search = GridSearchCV(LinearSVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X_train, Y_train)
np.mean(prediction == Y_test)



0.35294117647058826

In [19]:
scores = cross_val_score(clf, X, Y, cv=ShuffleSplit(n_splits=5))
np.mean(scores)



0.3611111111111111

In [20]:
print("Classification report:")
print(classification_report(Y_test, prediction))

Classification report:
                     precision    recall  f1-score   support

CleanNotsmellyNoisy       0.12      0.14      0.13         7
CleanNotsmellyQuite       0.55      0.46      0.50        56
   CleanSmellyNoisy       0.00      0.00      0.00         0
   CleanSmellyQuite       0.00      0.00      0.00         2
DirtyNotsmellyNoisy       0.18      0.14      0.16        14
DirtyNotsmellyQuite       0.31      0.37      0.34        35
   DirtySmellyQuite       0.00      0.00      0.00         5

           accuracy                           0.35       119
          macro avg       0.17      0.16      0.16       119
       weighted avg       0.38      0.35      0.36       119



  'recall', 'true', average, warn_for)


### Decision Tree Classifier

In [21]:
from sklearn.tree import DecisionTreeClassifier

clf = tree.DecisionTreeClassifier(max_depth=2)
clf = clf.fit(X_train, Y_train)
tree.plot_tree(clf.fit(X_train, Y_train))

[Text(248.0, 308.0, 'X[131] <= 0.463\nentropy = 0.664\nsamples = 239\nvalue = [20, 102, 2, 1, 22, 89, 1, 2]'),
 Text(124.0, 184.79999999999998, 'X[28] <= 0.353\nentropy = 0.673\nsamples = 221\nvalue = [20, 87, 2, 1, 21, 87, 1, 2]'),
 Text(62.0, 61.599999999999966, 'entropy = 0.661\nsamples = 197\nvalue = [16, 85, 1, 1, 18, 73, 1, 2]'),
 Text(186.0, 61.599999999999966, 'entropy = 0.608\nsamples = 24\nvalue = [4, 2, 1, 0, 3, 14, 0, 0]'),
 Text(372.0, 184.79999999999998, 'X[25] <= 0.836\nentropy = 0.29\nsamples = 18\nvalue = [0, 15, 0, 0, 1, 2, 0, 0]'),
 Text(310.0, 61.599999999999966, 'entropy = 0.117\nsamples = 16\nvalue = [0, 15, 0, 0, 1, 0, 0, 0]'),
 Text(434.0, 61.599999999999966, 'entropy = 0.0\nsamples = 2\nvalue = [0, 0, 0, 0, 0, 2, 0, 0]')]

In [22]:
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)
param_grid = {'max_depth': [2, 4, 6, 8, 10, 20, 40], 'criterion': ['gini', 'entropy']}
search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X_train, Y_train)
np.mean(scores)

0.3611111111111111

In [23]:
search.best_params_

{'criterion': 'gini', 'max_depth': 4}

In [24]:
scores = cross_val_score(clf, X, Y, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.3833333333333333

In [25]:
print("Classification report:")
print(classification_report(Y_test, prediction))

Classification report:
                     precision    recall  f1-score   support

CleanNotsmellyNoisy       0.00      0.00      0.00         7
CleanNotsmellyQuite       0.38      0.38      0.38        56
   CleanSmellyQuite       0.00      0.00      0.00         2
DirtyNotsmellyNoisy       0.00      0.00      0.00        14
DirtyNotsmellyQuite       0.21      0.37      0.27        35
   DirtySmellyQuite       0.00      0.00      0.00         5

           accuracy                           0.29       119
          macro avg       0.10      0.12      0.11       119
       weighted avg       0.24      0.29      0.26       119



  'precision', 'predicted', average, warn_for)


We can see that the best Machine learning model is Ada Boost Classifier if we favor the "accuracy" score (0.48), and the Linear SVC if we favor the "weighted avg" score (0.36).