# 浅橙模型

In [1]:
# load modules
import pandas as pd
import numpy as np

In [2]:
# features
# Window tags that appear in the "<entity>.<window>.anomaly/value" column
# names, in the order the columns are listed in the feature matrix.
_WINDOWS = (u'15m', u'1d', u'1h', u'1m', u'5m', u'6h', u'7d')


def _windowed(entity):
    """Return the anomaly/value column-name pair for every window of `entity`."""
    names = []
    for window in _WINDOWS:
        names.append(u'%s.%s.anomaly' % (entity, window))
        names.append(u'%s.%s.value' % (entity, window))
    return names


# Column order matters: positional outputs such as the feature-importance
# table index into this list.
cols = (
    [u'mcid_counts', u'imei_counts', u'mac_counts',
     u'aid_counts', u'idfa_counts', u'idfv_counts']
    + [u'cracked.value']
    + _windowed(u'did')
    + [u'idcIP.anomaly', u'idcIP.value']
    + _windowed(u'ipGeo')
    + _windowed(u'ipSeg24')
    + _windowed(u'maxentID')
    + [u'proxyIP.anomaly', u'proxyIP.value']
    + [u'uaMismatch.value']
)

In [3]:
# Build a "label ~ f1+f2+..." formula string from the feature list, swapping
# the dots in the column names for underscores.
# NOTE(review): the DataFrame columns loaded below keep their dotted names, so
# whatever consumes this formula presumably needs the columns renamed the same
# way -- confirm.
formula = "label ~ " + "+".join(name.replace(".", "_") for name in cols)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(formula)

label ~ mcid_counts+imei_counts+mac_counts+aid_counts+idfa_counts+idfv_counts+cracked_value+did_15m_anomaly+did_15m_value+did_1d_anomaly+did_1d_value+did_1h_anomaly+did_1h_value+did_1m_anomaly+did_1m_value+did_5m_anomaly+did_5m_value+did_6h_anomaly+did_6h_value+did_7d_anomaly+did_7d_value+idcIP_anomaly+idcIP_value+ipGeo_15m_anomaly+ipGeo_15m_value+ipGeo_1d_anomaly+ipGeo_1d_value+ipGeo_1h_anomaly+ipGeo_1h_value+ipGeo_1m_anomaly+ipGeo_1m_value+ipGeo_5m_anomaly+ipGeo_5m_value+ipGeo_6h_anomaly+ipGeo_6h_value+ipGeo_7d_anomaly+ipGeo_7d_value+ipSeg24_15m_anomaly+ipSeg24_15m_value+ipSeg24_1d_anomaly+ipSeg24_1d_value+ipSeg24_1h_anomaly+ipSeg24_1h_value+ipSeg24_1m_anomaly+ipSeg24_1m_value+ipSeg24_5m_anomaly+ipSeg24_5m_value+ipSeg24_6h_anomaly+ipSeg24_6h_value+ipSeg24_7d_anomaly+ipSeg24_7d_value+maxentID_15m_anomaly+maxentID_15m_value+maxentID_1d_anomaly+maxentID_1d_value+maxentID_1h_anomaly+maxentID_1h_value+maxentID_1m_anomaly+maxentID_1m_value+maxentID_5m_anomaly+maxentID_5m_value+maxentID_6h_

In [4]:
# load data
data = pd.read_csv("qiancheng_all_sample_with_did_counts.csv")

# Recode the flag columns to 0/1 by truthiness. Series.map is used instead of
# the builtin map(), which returns a lazy iterator (not a list) on Python 3.
# NOTE(review): this runs before fillna, so a missing (NaN) flag is truthy and
# is recoded to 1 rather than 0 -- confirm that is intended.
for flag_col in ["cracked.value", "proxyIP.value", "idcIP.value", "uaMismatch.value"]:
    data[flag_col] = data[flag_col].map(lambda x: 1 if x else 0)

# Every remaining missing value is replaced by 0.
data = data.fillna(0)

In [5]:
# Partition the samples by the "os" column. reset_index() gives each subset a
# fresh 0..n-1 index and keeps the previous index as an "index" column.
is_ios = data["os"] == 'ios'
is_android = data["os"] == 'android'
ios = data[is_ios].reset_index()
android = data[is_android].reset_index()

In [6]:
from sklearn.model_selection import train_test_split

# Features and target are taken from the iOS subset only.
X = ios[cols]
y = ios["label"]

# Hold out 33% of the rows for testing; the seed is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Run RandomForest

In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Forest of depth-2 trees; random_state is pinned so re-runs reproduce.
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

# confusion_matrix rows are the true classes and columns the predicted ones.
A = confusion_matrix(y_test, clf.predict(X_test))
# A[1, 1] / sum(A[1, :]) is TP / (TP + FN), i.e. the recall of the class at
# index 1 -- not precision, which would divide by the column sum A[:, 1].
recall_pct = A[1, 1] * 1.0 / np.sum(A[1, :]) * 100
print("The recall is %.2f %%" % recall_pct)

The presion is 98.30 %


# Feature importance

In [8]:
# Pair every training column with the importance reported by the fitted model.
# Built from an explicit dict (column order pinned via `columns=`) rather than
# from zip(), which is a one-shot iterator on Python 3.
feature_importances = pd.DataFrame(
    {"features": list(X_train.columns), "value": clf.feature_importances_},
    columns=["features", "value"],
)
# Keep only the features with a non-zero importance.
feature_importances_position = feature_importances[feature_importances.value > 0]
# Last expression of the cell: displayed sorted from most to least important.
feature_importances_position.sort_values("value", ascending=False)

Unnamed: 0,features,value
26,ipGeo.1d.value,0.371689
28,ipGeo.1h.value,0.252711
36,ipGeo.7d.value,0.096218
34,ipGeo.6h.value,0.086516
50,ipSeg24.7d.value,0.057138
25,ipGeo.1d.anomaly,0.048337
33,ipGeo.6h.anomaly,0.042953
48,ipSeg24.6h.value,0.025096
40,ipSeg24.1d.value,0.005482
23,ipGeo.15m.anomaly,0.005187


In [9]:
# Preview the first rows of X (the iOS feature matrix selected by `cols`).
X.head()

Unnamed: 0,mcid_counts,imei_counts,mac_counts,aid_counts,idfa_counts,idfv_counts,cracked.value,did.15m.anomaly,did.15m.value,did.1d.anomaly,...,maxentID.1m.value,maxentID.5m.anomaly,maxentID.5m.value,maxentID.6h.anomaly,maxentID.6h.value,maxentID.7d.anomaly,maxentID.7d.value,proxyIP.anomaly,proxyIP.value,uaMismatch.value
0,2,0,0,0,2,2,0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,3.0,1.0,1,0
1,1,0,0,0,1,1,0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1,0
2,1,0,0,0,1,1,0,1.0,1.0,1.0,...,1.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,1,0
3,2,0,0,0,2,6,0,1.0,1.0,1.0,...,1.0,1.0,1.0,2.0,2.0,3.0,4.0,1.0,1,0
4,1,0,0,0,1,1,0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,0


# Random Forest Cross Validation

In [10]:
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold

# 20 consecutive (unshuffled) folds over the iOS feature matrix.
kf = KFold(n_splits=20)
for train, test in kf.split(X):
    # kf.split yields positional row numbers, so rows are selected with .iloc
    # (.ix has been removed from pandas).
    _X_train, _y_train = X.iloc[train], y.iloc[train]
    _X_test, _y_test = X.iloc[test], y.iloc[test]

    # Fit an unfitted copy with the same parameters on each fold so the model
    # already trained in `clf` is not overwritten by the last fold.
    fold_clf = clone(clf)
    fold_clf.fit(_X_train, _y_train)

    # Rows of the confusion matrix are true classes, columns predicted ones.
    A = confusion_matrix(_y_test, fold_clf.predict(_X_test))
    # TP / (TP + FN): recall of the class at index 1 (precision would divide
    # by the column sum A[:, 1] instead).
    print("The recall is %.2f %%" % (A[1, 1] * 1.0 / np.sum(A[1, :]) * 100))

The precision is 94.31 %
The precision is 100.00 %
The precision is 99.05 %
The precision is 100.00 %
The precision is 100.00 %
The precision is 99.00 %
The precision is 100.00 %
The precision is 98.85 %
The precision is 98.32 %
The precision is 98.17 %
The precision is 95.35 %
The precision is 100.00 %
The precision is 97.41 %
The precision is 97.40 %
The precision is 97.09 %
The precision is 100.00 %
The precision is 99.31 %
The precision is 98.02 %
The precision is 99.05 %
The precision is 91.84 %


# Try Decision Tree

In [11]:
from sklearn import tree

# max_features=10 is fewer than the number of feature columns, so the tree
# samples candidate features at each split; random_state is pinned so the
# result is reproducible across runs.
clf = tree.DecisionTreeClassifier(
    max_depth=3, max_features=10, max_leaf_nodes=8, random_state=0
)
clf.fit(X_train, y_train)
# score() on a classifier returns mean accuracy, so it is reported as such.
# NOTE(review): the random-forest cell reports a different metric, so the two
# numbers are not directly comparable.
print("The accuracy is %.2f %%" % (clf.score(X_test, y_test) * 100))

The precision is 95.51 %


# Decision Tree Cross Validation

In [12]:
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold

# 20 consecutive (unshuffled) folds over the iOS feature matrix.
kf = KFold(n_splits=20)
for train, test in kf.split(X):
    # kf.split yields positional row numbers, so rows are selected with .iloc
    # (.ix has been removed from pandas).
    _X_train, _y_train = X.iloc[train], y.iloc[train]
    _X_test, _y_test = X.iloc[test], y.iloc[test]

    # Fit an unfitted copy with the same parameters on each fold so the tree
    # already trained in `clf` is not overwritten by the last fold.
    fold_clf = clone(clf)
    fold_clf.fit(_X_train, _y_train)

    # Rows of the confusion matrix are true classes, columns predicted ones.
    A = confusion_matrix(_y_test, fold_clf.predict(_X_test))
    # TP / (TP + FN): recall of the class at index 1 (precision would divide
    # by the column sum A[:, 1] instead).
    print("The recall is %.2f %%" % (A[1, 1] * 1.0 / np.sum(A[1, :]) * 100))

The precision is 86.99 %
The precision is 98.84 %
The precision is 98.10 %
The precision is 95.65 %
The precision is 86.57 %
The precision is 100.00 %
The precision is 100.00 %
The precision is 95.40 %
The precision is 84.87 %
The precision is 96.33 %
The precision is 94.19 %
The precision is 98.31 %
The precision is 93.97 %
The precision is 93.51 %
The precision is 96.12 %
The precision is 97.03 %
The precision is 97.93 %
The precision is 98.02 %
The precision is 94.29 %
The precision is 93.88 %
