# 天气图像分类

分类天气图像，根据之前提取之前的特征分类处理

In [1]:
# import modules
import numpy as np
import  matplotlib.pyplot as pl 

## 读取numpy保存的二进制数据

直接使用numpy读取保存的'.bin'数据，并调整对应的矩阵维度

In [23]:
# features_path = "../datasets/classification/weather_classification/features.bin"
# labels_path = "../datasets/classification/weather_classification/labels.bin"

features_path = '../datasets/MWD/features.bin'
labels_path = '../datasets/MWD/labels.bin'

In [24]:
features = np.fromfile(features_path, dtype=np.float32)
labels = np.fromfile(labels_path, dtype=np.uint8)

# 更改矩阵维度
labels = labels.reshape((labels.shape[0], -1))
features = features.reshape((labels.shape[0], -1))

In [25]:
features.shape

(50768, 200)

In [26]:
labels.shape[0]

50768

In [27]:
y_0 = (labels == 0)
y_0.sum()

20302

In [28]:
y_1 = (labels == 1)
y_1.sum()

10214

In [29]:
y_2 = (labels == 2)
y_2.sum()

20252

## 机器学习分类尝试

### 数据集划分

首先是将数据集划分为训练集和测试集

In [30]:
# import modules
from sklearn.model_selection import train_test_split

In [31]:
X_train, X_valid, y_train, y_valid = train_test_split(features, labels, test_size = 0.2)

In [32]:
X_train.shape

(40614, 200)

In [33]:
X_valid.shape

(10154, 200)

### 机器学习模型拟合分类

In [34]:
# DecisionTree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

dt_model = DecisionTreeClassifier(random_state = 42)
dt_model.fit(X_train, y_train)
dt_prediction = dt_model.predict(X_valid)
dt_conf = confusion_matrix(y_valid, dt_prediction)
print('decission tree valid confusion matrix:\n{}'.format(dt_conf))
print('decision tree valid accuracy:{}'.format(accuracy_score(y_valid, dt_prediction)))

decission tree valid confusion matrix:
[[2808  636  650]
 [ 571 1014  407]
 [ 709  480 2879]]
decision tree valid accuracy:0.6599369706519598


In [35]:
# randomForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

rfc_model = RandomForestClassifier(n_estimators=178,n_jobs=-1)
rfc_model.fit(X_train, y_train.ravel())

rfc_prediction = rfc_model.predict(X_valid)
rfc_conf = confusion_matrix(y_valid, rfc_prediction)
print('random forest classifier confusion matrix:\n{}'.format(rfc_conf))
print('random forest classifier accuracy:{}'.format(accuracy_score(y_valid, rfc_prediction)))

random forest classifier confusion matrix:
[[2984  526  584]
 [ 439 1228  325]
 [ 441  316 3311]]
random forest classifier accuracy:0.7408902895410676


In [36]:
# AdaBoost
# from sklearn.ensemble import AdaBoostClassifier

# ada_clf = AdaBoostClassifier(
#     DecisionTreeClassifier(), n_estimators=100,
#     algorithm="SAMME.R", learning_rate=0.5)
# ada_clf.fit(X_train, y_train)

# ada_prediction = ada_clf.predict(X_valid)
# ada_conf = confusion_matrix(y_valid, rfc_prediction)
# print('adaboost classifier confusion matrix:\n{}'.format(ada_conf))
# print('adaboost classifier accuracy:{}'.format(accuracy_score(y_valid, ada_prediction)))

In [46]:
# xgboost
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

xgb_model = XGBClassifier(n_estimators=1000, n_jobs=-1, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train.ravel(), early_stopping_rounds=5, eval_set=[(X_valid, y_valid.ravel())])

# xgb_model = XGBClassifier(n_jobs=-1)
# xgb_model.fit(X_train, y_train.ravel())

prediction = xgb_model.predict(X_valid)
conf = confusion_matrix(y_valid, prediction)
print('xgboost classifier confusion matrix:\n{}'.format(conf))
print('xgboost classifier accuracy:{}'.format(accuracy_score(y_valid, prediction)))

[0]	validation_0-mlogloss:1.00009
[1]	validation_0-mlogloss:0.93696
[2]	validation_0-mlogloss:0.89222
[3]	validation_0-mlogloss:0.86002
[4]	validation_0-mlogloss:0.83502
[5]	validation_0-mlogloss:0.81628
[6]	validation_0-mlogloss:0.80173
[7]	validation_0-mlogloss:0.78787
[8]	validation_0-mlogloss:0.77883
[9]	validation_0-mlogloss:0.77078
[10]	validation_0-mlogloss:0.76347
[11]	validation_0-mlogloss:0.75758
[12]	validation_0-mlogloss:0.75222
[13]	validation_0-mlogloss:0.74748
[14]	validation_0-mlogloss:0.74353
[15]	validation_0-mlogloss:0.73917
[16]	validation_0-mlogloss:0.73551
[17]	validation_0-mlogloss:0.73181
[18]	validation_0-mlogloss:0.72682
[19]	validation_0-mlogloss:0.72441
[20]	validation_0-mlogloss:0.72244
[21]	validation_0-mlogloss:0.71999
[22]	validation_0-mlogloss:0.71814
[23]	validation_0-mlogloss:0.71416
[24]	validation_0-mlogloss:0.71177
[25]	validation_0-mlogloss:0.70933
[26]	validation_0-mlogloss:0.70794
[27]	validation_0-mlogloss:0.70518
[28]	validation_0-mlogloss:0.7

In [38]:
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(y_valid, prediction)

0.5791475564856916

In [39]:
# SVM too slow to train
# from sklearn.svm import SVC
# from sklearn.metrics import classification_report
# svc_model = SVC(kernel='rbf', random_state=42, gamma=0.10, C=0.4)
# svc_model.fit(X_train, y_train.ravel())

# svc_prediction = svc_model.predict(X_valid)
# print(classification_report(X_valid, svc_prediction))

## 保存模型

对比了以上的三个模型，其中准确率最高的是xgboost分类器，决定保存该模型进行后续的处理

In [52]:
# save xgboost classifier model
import pickle

savemodel_path = '../models/xgb_clf.pickle.dat'
pickle.dump(xgb_model, open(savemodel_path, "wb"))

In [53]:
# load model
loaded_model = pickle.load(open(savemodel_path, 'rb'))
y_pred = loaded_model.predict(X_valid)

In [54]:
(y_pred.ravel() == y_valid.ravel()).sum()/len(y_valid)

0.747488674414024

In [55]:
confusion_matrix(y_pred, y_valid)

array([[2995,  447,  461],
       [ 531, 1254,  266],
       [ 568,  291, 3341]])