# Classification Approach

In [1]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (confusion_matrix, classification_report,
ConfusionMatrixDisplay,PrecisionRecallDisplay,RocCurveDisplay)
from sklearn.pipeline import Pipeline

#Grid Search
from sklearn.model_selection import GridSearchCV

#Model(s)
from sklearn.ensemble import RandomForestClassifier


## Workflow
0. [ ] Clean and Engineer Data for X and y
1. [ ] Split Data into Train/Test for X and y
2. [ ] Scale Training X & Test X
3. [ ] Create Model(s)
4. [ ] Create Pipeline and Hyperparameters
5. [ ] Fit/Train Model(s) on X Train
6. [ ] Evaluate Model(s) on X Test
7. [ ] Adjust Params as Necessary
8. [ ] Bonus: Save Model(s)

### PreProcess


In [3]:
# Column labels for the tab-separated file: seven Wi-Fi signal columns
# ('wifi 1' .. 'wifi 7') followed by the target column 'room'.
names = [f'wifi {idx}' for idx in range(1, 8)] + ['room']

# Passing `names` supplies the header, so every line of the file is read as data.
df = pd.read_csv(
    '../wifi_localization.txt',
    sep='\t',
    names=names,
)
df

Unnamed: 0,wifi 1,wifi 2,wifi 3,wifi 4,wifi 5,wifi 6,wifi 7,room
0,-64,-56,-61,-66,-71,-82,-81,1
1,-68,-57,-61,-65,-71,-85,-85,1
2,-63,-60,-60,-67,-76,-85,-84,1
3,-61,-60,-68,-62,-77,-90,-80,1
4,-63,-65,-60,-63,-77,-81,-87,1
...,...,...,...,...,...,...,...,...
1995,-59,-59,-48,-66,-50,-86,-94,4
1996,-59,-56,-50,-62,-47,-87,-90,4
1997,-62,-59,-46,-65,-45,-87,-88,4
1998,-62,-58,-52,-61,-41,-90,-85,4


#### Split Data into Train/Test for X and y


In [4]:
# Target is the 'room' label; every remaining column is a feature.
y = df['room']
X = df.drop(columns=['room'])

# Hold out 33% of the rows for testing. Stratifying on y keeps the room
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=1099,
)


### Model

In [6]:
# Base estimator for the search. The forest is seeded (same seed as the
# train/test split) so that bootstrapping and feature sampling are repeatable;
# without a seed, re-running the grid search can select different "best"
# hyperparameters from one run to the next.
rfc = RandomForestClassifier(random_state=1099)

# Single-step pipeline; the step name 'rfc' is the prefix used by the
# 'rfc__<param>' keys of the hyperparameter grid.
pipe = Pipeline([('rfc', rfc)])

#### Create Model(s)

In [7]:
# Print the estimator's built-in documentation (constructor signature and
# parameter descriptions) — presumably used to pick the hyperparameters
# tuned in the following cells.
help(rfc)

Help on RandomForestClassifier in module sklearn.ensemble._forest object:

class RandomForestClassifier(ForestClassifier)
 |  RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None, monotonic_cst=None)
 |
 |  A random forest classifier.
 |
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  Trees in the forest use the best split strategy, i.e. equivalent to passing
 |  `splitter="best"` to the underlying :class:`~sklearn.tree.DecisionTreeRegressor`.
 |  The sub-sample size is controlled with the `max_samples` parameter i

In [8]:
# Candidate values for each random-forest hyperparameter explored by the grid search.
n_estimators = [*range(100, 700, 50)]  # 100, 150, ..., 650 (12 values)
criterion = ['gini', 'entropy', 'log_loss']
max_depth = [1, 2, 4, 6, None]  # None is the estimator's default setting
max_features = ['sqrt', 'log2']

In [9]:
# Search grid for GridSearchCV. Each key has the form '<step>__<parameter>',
# where 'rfc' is the name of the pipeline step holding the random forest.
hyp_param = dict(
    rfc__n_estimators=n_estimators,
    rfc__criterion=criterion,
    rfc__max_depth=max_depth,
    rfc__max_features=max_features,
)

In [14]:
# Exhaustive grid search over the pipeline, scored by accuracy with 5-fold
# cross-validation. The grid has 12 * 3 * 5 * 2 = 360 candidates, i.e. 1800
# independent fits, so n_jobs=-1 spreads them over all available CPU cores
# instead of running them one after another. This changes only how the fits
# are scheduled, not which candidates are evaluated or how they are scored.
full_model = GridSearchCV(
    estimator=pipe,
    param_grid=hyp_param,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1)

#### Train Model

In [15]:
# Run the grid search on the training partition only; the test partition
# stays untouched until evaluation.
full_model.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


  _data = np.array(data, dtype=dtype, copy=copy,


### Evaluate model

In [16]:
# Predict room labels for the held-out test rows using the fitted search object.
y_pred = full_model.predict(X=X_test)

#### Test On data

In [17]:
# Draw the confusion matrix for the held-out test set.
# Constructing ConfusionMatrixDisplay alone only creates the object (the cell
# would show just its repr); .plot() is what actually renders the figure.
# `labels=` pins the row/column order of the matrix to the same class order
# used for the tick labels. The trailing ';' suppresses the object repr.
ConfusionMatrixDisplay(
    confusion_matrix(y_test, y_pred, labels=full_model.classes_),
    display_labels=full_model.classes_,
).plot();

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fdd68774fb0>

In [18]:
# Per-class precision / recall / F1 and overall accuracy on the test set.
# print() is needed here because the report is returned as a preformatted string.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.98      0.99      0.98       165
           2       0.99      0.94      0.96       165
           3       0.93      0.98      0.95       165
           4       0.99      0.99      0.99       165

    accuracy                           0.97       660
   macro avg       0.97      0.97      0.97       660
weighted avg       0.97      0.97      0.97       660



### Final Model(s)


In [30]:
# Show the hyperparameter combination selected by the grid search
# (the search was scored on cross-validated accuracy).
full_model.best_params_

{'rfc__criterion': 'log_loss',
 'rfc__max_depth': None,
 'rfc__max_features': 'log2',
 'rfc__n_estimators': 300}

In [31]:
# Build the final classifier from whatever the grid search actually selected,
# instead of hand-copying the values from a previous run's output (which can
# silently drift from the search result when the notebook is re-executed).
# best_params_ keys look like 'rfc__n_estimators'; strip the pipeline-step
# prefix to get plain RandomForestClassifier keyword arguments.
best_rfc_params = {
    name.split('__', 1)[1]: value
    for name, value in full_model.best_params_.items()
}
final_model = RandomForestClassifier(**best_rfc_params)

#### Train on all Data 

In [32]:
# Refit on the complete dataset (training and test rows together).
# Note: the evaluation metrics reported earlier were computed by a model that
# never saw the test rows, so they do not directly describe this refit model.
final_model.fit(X=X, y=y)

#### Save with joblib

In [33]:
import joblib

# Persist the fitted final model to disk in the current working directory.
# NOTE(review): the file name 'random farest' looks like a typo for
# 'random forest' and has no extension — confirm nothing loads it by this
# exact name before renaming.
joblib.dump(final_model, 'random farest')

['random farest']

# Congratulations!!!

#### Created and trained by Matin1099.
