# PART3: Multilabel-Classification using scikit-learn Linear models

## Load Data

In [1]:
import numpy as np
import pandas as pd 

# Root directory that holds the input CSV files.
data_root = '/media/sf_Yelp/input/'

# Photo -> business mapping, kept both as a flat table and indexed by photo_id.
# The indexed view is derived from the table already in memory so the CSV is
# only parsed once.
train_photos = pd.read_csv(data_root+'train_photo_to_biz_ids.csv')
train_photo_to_biz = train_photos.set_index('photo_id')

# Per-business tables. The 'label' and 'feature vector' columns are stored as
# strings and are parsed into arrays further down in this cell, so no
# intermediate y_train / X_train / X_test assignment is made here.
train_df = pd.read_csv(data_root+"train_biz_fc7features_extra2.csv")
test_df  = pd.read_csv(data_root+"test_biz_fc7features_extra2.csv")

def convert_label_to_array(str_label):
    """Parse a bracketed, comma-separated label string into a list of ints.

    The first and last characters (the enclosing brackets) are dropped, the
    remainder is split on commas, and every non-empty piece is converted with
    ``int``. For example ``"(0, 6, 8)"`` gives ``[0, 6, 8]`` and ``"()"``
    gives ``[]``.
    """
    labels = []
    for token in str_label[1:-1].split(','):
        if token:  # skip the empty piece left by "()" or by a trailing comma
            labels.append(int(token))
    return labels

def convert_feature_to_vector(str_feature):
    """Parse a bracketed, comma-separated feature string into a list of floats.

    The first and last characters (the enclosing brackets) are dropped and
    every comma-separated piece is converted with ``float``. Unlike the label
    parser, empty pieces are not skipped, so a string such as ``"[]"`` raises
    ``ValueError``.
    """
    inner = str_feature[1:-1]
    return list(map(float, inner.split(',')))

# Parse the stringified columns of the business tables into numpy arrays:
# one list of integer labels and one list of floats per row.
y_train = np.array(list(map(convert_label_to_array, train_df['label'])))
X_train = np.array(list(map(convert_feature_to_vector, train_df['feature vector'])))
X_test = np.array(list(map(convert_feature_to_vector, test_df['feature vector'])))


In [2]:
print "X_train: ", X_train.shape
print "y_train: ", y_train.shape
print "X_test: ", X_test.shape
print "train_df:"
train_df[0:5]

X_train:  (1996, 8192)
y_train:  (1996,)
X_test:  (10000, 8192)
train_df:


Unnamed: 0,business,label,feature vector
0,1000.0,"(1, 2, 3, 4, 5, 6, 7)","[0.20032024, 0.44084537, 0.23249489, 0.3600976..."
1,1001.0,"(0, 1, 6, 8)","[0.0013769998, 0.59398097, 0.55060995, 0.18394..."
2,100.0,"(1, 2, 4, 5, 6, 7)","[0.11435749, 0.033177156, 0.12572332, 0.539482..."
3,1006.0,"(1, 2, 4, 5, 6)","[0.075851507, 0.052600037, 0.059594199, 0.7067..."
4,1010.0,"(0, 6, 8)","[0.39024171, 0.28424361, 0.0, 0.1655795, 0.460..."


## Train a one-vs-rest XGBoost classifier using cross-validation and assess performance (F1-score)

In [11]:
from sklearn import svm, datasets
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import xgboost as xgb
from sklearn import linear_model
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn import cross_validation
from sklearn.metrics import f1_score

import time
t=time.time()

mlb = MultiLabelBinarizer()
y_ptrain= mlb.fit_transform(y_train)  #Convert list of labels to binary matrix

SEED = 0
random_state = np.random.RandomState(SEED)
#X_ptrain, X_ptest, y_ptrain, y_ptest = train_test_split(X_train, y_ptrain, test_size=.2,random_state=random_state)
classifier = OneVsRestClassifier(xgb.XGBClassifier(max_depth=4, n_estimators=500))
#classifier.fit(X_ptrain, y_ptrain)
#y_ppredict = classifier.predict(X_ptest)
y_ppredict = cross_validation.cross_val_predict(classifier, X_train, y_ptrain, cv=2, verbose =1)

print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"

Time passed:  797.2 sec


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 13.3min finished


In [12]:
print "Samples of predicted labels (in binary matrix):\n", y_ppredict[0:3]
print "\nSamples of predicted labels:\n", mlb.inverse_transform(y_ppredict[0:3])

Samples of predicted labels (in binary matrix):
[[0 1 1 1 1 1 1 1 0]
 [1 0 0 0 0 0 0 0 1]
 [0 1 1 0 1 1 1 1 0]]

Samples of predicted labels:
[(1, 2, 3, 4, 5, 6, 7), (0, 8), (1, 2, 4, 5, 6, 7)]


In [13]:
# Per-attribute summary of the cross-validated predictions: how many
# businesses were predicted positive for each attribute, and that count as a
# percentage of all businesses.
# The number of attribute columns is taken from the prediction matrix instead
# of being hard-coded, so the table always matches the data it summarises.
n_attributes = y_ppredict.shape[1]
statistics = pd.DataFrame(
    columns=["attribuite "+str(i) for i in range(n_attributes)]+['num_biz'],
    index=["biz count", "biz ratio"])
statistics.loc["biz count"] = np.append(np.sum(y_ppredict, axis=0), len(y_ppredict))
# NOTE: this is a global pandas display option; it stays in effect for every
# DataFrame rendered after this cell.
pd.options.display.float_format = '{:.0f}%'.format
statistics.loc["biz ratio"] = statistics.loc["biz count"]*100/len(y_ppredict)
statistics

Unnamed: 0,attribuite 0,attribuite 1,attribuite 2,attribuite 3,attribuite 4,attribuite 5,attribuite 6,attribuite 7,attribuite 8,num_biz
biz count,579,943,967,943,488,1233,1399,556,1265,1996
biz ratio,29%,47%,48%,47%,24%,62%,70%,28%,63%,100%


In [14]:
from sklearn.metrics import f1_score

print "F1 score: ", f1_score(y_ptrain, y_ppredict, average='micro') 
print "Individual Class F1 score: ", f1_score(y_ptrain, y_ppredict, average=None)

F1 score:  0.823743541569
Individual Class F1 score:  [ 0.6688      0.82954545  0.86603111  0.65981501  0.76135266  0.88074134
  0.92352302  0.75531915  0.88054335]


## Re-Train an ensemble using all training data, and make predictions on test set

In [3]:
## Imports repeated here so this section can run on its own if the previous training section was skipped
from sklearn import svm
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import time
import xgboost as xgb
from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import AdaBoostClassifier

t = time.time()

mlb = MultiLabelBinarizer()
y_train= mlb.fit_transform(y_train)  #Convert list of labels to binary matrix

SEED = 0
random_state = np.random.RandomState(SEED)
classifier1 = OneVsRestClassifier(linear_model.LogisticRegressionCV())
classifier1.fit(X_train, y_train)
y_predict1 = classifier1.predict_proba(X_test)
print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"

rfc = RandomForestClassifier(n_estimators=1000, criterion='entropy', max_depth=None,
                                min_samples_split=4, min_samples_leaf=3,
                                max_features='auto', bootstrap=True,
                                oob_score=True, n_jobs=-1,
                                random_state=SEED, verbose=0)
classifier2 = OneVsRestClassifier(rfc)
classifier2.fit(X_train, y_train)
y_predict2 = classifier2.predict_proba(X_test)
print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"

classifier3 = OneVsRestClassifier(xgb.XGBClassifier(max_depth=4, n_estimators=500))
classifier3.fit(X_train, y_train)
y_predict3 = classifier3.predict_proba(X_test)

classifier3 = OneVsRestClassifier(xgb.XGBClassifier(max_depth=4, n_estimators=500))
classifier3.fit(X_train, y_train)
y_predict3 = classifier3.predict_proba(X_test)

print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"

Time passed:  342.0 sec
Time passed:  770.5 sec
Time passed:  1838.5 sec


In [6]:
print "Samples of predicted labels (in binary matrix):\n", y_predict1[0:3]
print "Samples of predicted labels (in binary matrix):\n", y_predict2[0:3]
print "Samples of predicted labels (in binary matrix):\n", y_predict3[0:3]

ensemble_predict = (y_predict1 + y_predict2+ y_predict3)/3
ensemble_predict = ensemble_predict >0.5
print "Samples of ensemble predict labels (in binary matrix):\n", ensemble_predict[0:3]
#print list(mlb.classes_)
y_predict_label = mlb.inverse_transform(ensemble_predict) #Convert binary matrix back to labels
print "Samples of y_predict_label (in binary matrix):\n", y_predict_label[0:3]

Samples of predicted labels (in binary matrix):
[[ 0.16210729  0.8767039   0.83367635  0.63187958  0.25629034  0.94028566
   0.94268152  0.31519181  0.40275532]
 [ 0.17267556  0.70747467  0.77021101  0.59990199  0.18312109  0.85268099
   0.91039362  0.26223796  0.77149978]
 [ 0.17874441  0.46318122  0.61745119  0.56113699  0.03348489  0.84580967
   0.80912967  0.26228757  0.58660279]]
Samples of predicted labels (in binary matrix):
[[ 0.07612857  0.85367857  0.9418      0.79181667  0.60460595  0.96516071
   0.97937778  0.56252619  0.25453193]
 [ 0.06363333  0.76885556  0.89991667  0.82049286  0.47321429  0.96256667
   0.95740714  0.55124762  0.46209877]
 [ 0.17049405  0.61960079  0.68812857  0.63968611  0.21865     0.82090476
   0.85648333  0.28507897  0.50713413]]
Samples of predicted labels (in binary matrix):
[[  6.37073768e-04   9.83949482e-01   9.98724163e-01   9.95703757e-01
    1.24399580e-01   9.99744117e-01   9.99948740e-01   6.73743486e-01
    1.70630366e-01]
 [  1.05992127e-

In [7]:
# Display the dimensions of the test feature matrix.
X_test.shape

(10000, 8192)

In [8]:
# Build the submission file: one row per test business with its predicted
# labels written as a space-separated string.
test_data_frame  = pd.read_csv(data_root+"test_biz_fc7features.csv")

# NOTE(review): business ids are read from a different CSV than the one the
# test features were loaded from; rows are paired purely by position, so both
# files are assumed to list businesses in the same order -- TODO confirm.
# Fail loudly rather than write a truncated or misaligned submission.
assert len(test_data_frame) == len(y_predict_label), \
    "number of test businesses does not match number of predictions"

# Build both columns in one pass instead of growing the frame row by row.
# Labels are joined with a single space ("1 2 3"); the previous
# str(tuple)-based formatting produced double spaces between labels and a
# trailing space for single-label rows.
df = pd.DataFrame({
    'business_id': [str(biz) for biz in test_data_frame['business']],
    'labels': [" ".join(str(l) for l in label) for label in y_predict_label],
}, columns=['business_id', 'labels'])

df.to_csv(data_root+"../submissions/ensemble_submission_fc7.csv", index=False)
    

In [11]:
# Per-attribute summary of the ensemble's test-set predictions: how many
# businesses were predicted positive for each attribute, and that count as a
# percentage of all businesses.
# The number of attribute columns is taken from the prediction matrix instead
# of being hard-coded, so the table always matches the data it summarises.
n_attributes = ensemble_predict.shape[1]
statistics = pd.DataFrame(
    columns=["attribuite "+str(i) for i in range(n_attributes)]+['num_biz'],
    index=["biz count", "biz ratio"])
statistics.loc["biz count"] = np.append(np.sum(ensemble_predict, axis=0), len(ensemble_predict))
# NOTE: this is a global pandas display option; it stays in effect for every
# DataFrame rendered after this cell.
pd.options.display.float_format = '{:.0f}%'.format
statistics.loc["biz ratio"] = statistics.loc["biz count"]*100/len(ensemble_predict)
statistics

Unnamed: 0,attribuite 0,attribuite 1,attribuite 2,attribuite 3,attribuite 4,attribuite 5,attribuite 6,attribuite 7,attribuite 8,num_biz
biz count,677,8108,8510,6583,1754,9082,9364,1734,5232,10000
biz ratio,7%,81%,85%,66%,18%,91%,94%,17%,52%,100%


In [None]:
#LB score: 0.76437 (use fc7 layer)
#LB score: 0.73053 (use prob layer)

In [15]:
# Persist each model's test-set probability matrix as a comma-delimited text
# file, one file per model.
for _out_path, _proba in (
        (data_root+"../submissions/ensemble/LRcv_submission_fc7.csv", y_predict1),
        (data_root+"../submissions/ensemble/RF_submission_fc7.csv", y_predict2),
        (data_root+"../submissions/ensemble/XGB_submission_fc7.csv", y_predict3)):
    np.savetxt(_out_path, _proba, delimiter=",")