# PART3: Multilabel-Classification using scikit-learn Linear models

## Load Data

In [26]:
import numpy as np
import pandas as pd 

# Root folder holding the input CSV files. The trailing slash matters because
# every path below is built by plain string concatenation.
# Alternative location: '/media/sf_Yelp/input/'
data_root = '/home/long/Desktop/Yelp/input/'

# Photo-to-business mapping, loaded twice from the same file: once indexed by
# 'photo_id' and once with the default integer index.
train_photo_to_biz = pd.read_csv(data_root + 'train_photo_to_biz_ids.csv',
                                 index_col='photo_id')
train_photos = pd.read_csv(data_root + 'train_photo_to_biz_ids.csv')

# fc7 feature tables (columns used below: 'label' and 'feature vector').
test_df = pd.read_csv(data_root + "test_biz_fc7features.csv")
train_df = pd.read_csv(data_root + "train_biz_fc7features.csv")

# Raw column values as read from the CSV files; the conversion cell later
# replaces these with parsed numeric arrays.
X_test = test_df['feature vector'].values
X_train, y_train = train_df['feature vector'].values, train_df['label'].values

# fc8 gender feature tables.
test_gender_df = pd.read_csv(data_root + "test_biz_fc8_gender_features.csv")
train_gender_df = pd.read_csv(data_root + "train_biz_fc8_gender_features.csv")

# fc8 age feature tables.
test_age_df = pd.read_csv(data_root + "test_biz_fc8_age_features.csv")
train_age_df = pd.read_csv(data_root + "train_biz_fc8_age_features.csv")


In [27]:
def convert_label_to_array(str_label):
    """Parse a stringified label collection into a list of ints.

    The first and last characters (the enclosing brackets/parentheses) are
    dropped and the remainder is split on commas.

    Args:
        str_label: text such as "(1, 2, 3)", "(3,)" or "()".

    Returns:
        List of ints in their original order. Empty and whitespace-only
        tokens (e.g. after the trailing comma of a 1-tuple) are skipped.

    Raises:
        ValueError: if a non-blank token is not a valid integer.
    """
    inner = str_label[1:-1]
    # Strip before filtering so a token like " " (from "(3, )") is ignored
    # instead of being handed to int().
    tokens = (token.strip() for token in inner.split(','))
    return [int(token) for token in tokens if token]

import re
def convert_feature_to_vector(str_feature):
    """Parse a stringified feature vector into a flat list of floats.

    All occurrences of the characters ``[ ] ! @ # $`` are removed and the rest
    is split on commas, so several bracketed vectors joined by commas
    (e.g. "[1, 2],[3]") come back as one flat list.

    Args:
        str_feature: text such as "[-1.75, 2.5]" or "[0.1],[0.2, 0.3]".

    Returns:
        List of floats in their original order. Empty and whitespace-only
        tokens (empty vector, trailing comma) are skipped.

    Raises:
        ValueError: if a non-blank token is not a valid float.
    """
    # Raw string: '\]' and '\[' are invalid escapes in a normal string literal.
    cleaned = re.sub(r'[\]\[!@#$]', '', str_feature)
    tokens = (token.strip() for token in cleaned.split(','))
    return [float(token) for token in tokens if token]

print sum(train_df['business'] - train_age_df['business'])
print sum(train_df['business'] - train_gender_df['business'])
train_df['feature vector'] = train_age_df['feature vector'] + ',' + train_gender_df['feature vector'] + ',' + train_df['feature vector'] 
test_df['feature vector'] = test_age_df['feature vector'] + ',' + test_gender_df['feature vector'] + ',' + test_df['feature vector'] 

y_train = np.array([convert_label_to_array(y) for y in train_df['label']])
X_train = np.array([convert_feature_to_vector(x) for x in train_df['feature vector']])
X_test = np.array([convert_feature_to_vector(x) for x in test_df['feature vector']])

0.0
0.0


In [3]:
print "X_train: ", X_train.shape
print "y_train: ", y_train.shape
print "X_test: ", X_test.shape
print "train_df:"
print train_df[0:5]
print y_train[0:3]

X_train:  (1996, 4106)
y_train:  (1996,)
X_test:  (10000, 4106)
train_df:
   business                  label  \
0    1000.0  (1, 2, 3, 4, 5, 6, 7)   
1    1001.0           (0, 1, 6, 8)   
2     100.0     (1, 2, 4, 5, 6, 7)   
3    1006.0        (1, 2, 4, 5, 6)   
4    1010.0              (0, 6, 8)   

                                      feature vector  
0  [-1.7518281, -2.5073791, -1.1439633, -1.309718...  
1  [-2.8106494, -2.1684897, -1.1240553, -1.027255...  
2  [-0.44846466, -1.6965714, -0.61568594, -1.9498...  
3  [-0.98842466, -2.3486288, -1.0032905, -1.95205...  
4  [-2.5805631, -2.7594094, -1.7486759, -1.669193...  
[[1, 2, 3, 4, 5, 6, 7] [0, 1, 6, 8] [1, 2, 4, 5, 6, 7]]


## Train a one-vs-rest linear model using cross-validation and assess performance (F1 score)

In [16]:
from sklearn import svm, datasets
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import xgboost as xgb
from sklearn import linear_model
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import cross_validation
from sklearn.metrics import f1_score

import time
t=time.time()

mlb = MultiLabelBinarizer()
y_ptrain= mlb.fit_transform(y_train)  #Convert list of labels to binary matrix

# bgc = BaggingClassifier(base_estimator = linear_model.LogisticRegressionCV(),
#                         n_estimators=5, max_samples = 0.2, bootstrap  = 0, 
#                          n_jobs=-1, verbose = 1)

classifier = OneVsRestClassifier(linear_model.LinearRegression())
#classifier.fit(X_ptrain, y_ptrain)
#y_ppredict = classifier.predict(X_ptest)
y_ppredict = cross_validation.cross_val_predict(classifier, X_train, y_ptrain, cv=2, verbose =1)

print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"

Time passed:  8.4 sec


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    8.4s finished


In [17]:
print "Samples of predicted labels (in binary matrix):\n", y_ppredict[0:3]
print "\nSamples of predicted labels:\n", mlb.inverse_transform(y_ppredict[0:3])

Samples of predicted labels (in binary matrix):
[[0 1 1 1 1 1 1 1 0]
 [0 1 1 0 0 1 1 1 0]
 [0 1 1 0 1 1 1 1 0]]

Samples of predicted labels:
[(1, 2, 3, 4, 5, 6, 7), (1, 2, 5, 6, 7), (1, 2, 4, 5, 6, 7)]


In [18]:
# Per-label summary of the cross-validated predictions: how many businesses
# were assigned each label, and that count as a percentage of all businesses.
# The number of label columns is taken from the prediction matrix instead of
# being hard-coded.
n_attributes = y_ppredict.shape[1]
statistics = pd.DataFrame(columns=[ "attribuite "+str(i) for i in range(n_attributes)]+['num_biz'], index = ["biz count", "biz ratio"])
statistics.loc["biz count"] = np.append(np.sum(y_ppredict, axis=0), len(y_ppredict))
# NOTE: this is a global pandas display option; it stays in effect for every
# float shown after this cell runs.
pd.options.display.float_format = '{:.0f}%'.format
statistics.loc["biz ratio"] = statistics.loc["biz count"]*100/len(y_ppredict) 
statistics

Unnamed: 0,attribuite 0,attribuite 1,attribuite 2,attribuite 3,attribuite 4,attribuite 5,attribuite 6,attribuite 7,attribuite 8,num_biz
biz count,631,999,1062,1108,625,1213,1385,644,1152,1996
biz ratio,32%,50%,53%,56%,31%,61%,69%,32%,58%,100%


In [19]:
from sklearn.metrics import f1_score

print "F1 score: ", f1_score(y_ptrain, y_ppredict, average='micro') 
print "Individual Class F1 score: ", f1_score(y_ptrain, y_ppredict, average=None)

F1 score:  0.762787504291
Individual Class F1 score:  [ 0.59447005  0.76606426  0.81130268  0.60729512  0.67918089  0.83021933
  0.89763206  0.66118421  0.81506276]


## Re-train with XGBoost on stratified folds of the training data, and make predictions on the test set

In [3]:
## Uncomment if skip previous train
from sklearn import svm
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import time
import xgboost as xgb
from sklearn.svm import LinearSVC
from sklearn import linear_model
#from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

t = time.time()

mlb = MultiLabelBinarizer()
y_train= mlb.fit_transform(y_train)  #Convert list of labels to binary matrix

from sklearn.cross_validation import StratifiedKFold
print train_df['label'].shape
skf = StratifiedKFold(train_df['label'], n_folds=5, random_state = 0)
y_predict = np.zeros((X_test.shape[0],9))
for train_index, test_index in skf:
    X_train_i = X_train[train_index]
    y_train_i = y_train[train_index]
    classifier = OneVsRestClassifier(xgb.XGBClassifier(max_depth=4, n_estimators=1000))
    classifier.fit(X_train_i, y_train_i)
    y_predict_i = classifier.predict(X_test)
    y_predict = y_predict + y_predict_i
    print X_train_i.shape
    print y_train_i.shape
    print y_predict_i[1:3]

y_predict = y_predict/5
#y_predict = classifier.predict_proba(X_test)
np.savetxt(data_root+"../submissions/ensemble/y_predict_skf_XGBd4n1000_submission_CNfc7_AGfc8.csv", y_predict, delimiter=",")
y_predict = y_predict > 0.5

#print list(mlb.classes_)
y_predict_label = mlb.inverse_transform(y_predict) #Convert binary matrix back to labels

print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"

(1996,)




(1500, 4106)
(1500, 9)
[[0 1 1 1 0 1 1 1 1]
 [0 1 1 1 0 1 1 0 0]]
(1571, 4106)
(1571, 9)
[[0 1 1 1 0 1 1 1 0]
 [0 1 1 1 0 1 1 0 0]]
(1617, 4106)
(1617, 9)
[[0 1 1 1 1 1 1 1 1]
 [0 1 1 1 0 1 1 0 0]]
(1640, 4106)
(1640, 9)
[[0 1 1 1 0 1 1 1 1]
 [0 1 0 1 0 1 1 0 0]]
(1656, 4106)
(1656, 9)
[[0 1 1 1 0 1 1 1 1]
 [0 1 1 1 0 1 1 0 0]]
Time passed:  2032.3 sec


In [7]:
# Spot-check the decoded label tuples at positions 1 and 2 of the test
# predictions (the same rows printed as indicator vectors inside the fold loop).
print y_predict_label[1:3]

[(1, 2, 3, 5, 6, 7, 8), (1, 2, 3, 5, 6)]


In [8]:
# Re-read the test feature table to get the business ids in row order.
test_data_frame  = pd.read_csv(data_root+"test_biz_fc7features.csv")

# Predictions are matched to businesses purely by row position, so the two
# must have the same length.
assert len(test_data_frame) == len(y_predict_label), \
    "number of predictions does not match number of test businesses"

# Build the submission in one step instead of growing it row by row with
# df.loc[i], which is quadratic. Labels are joined with a single space;
# str(tuple)[1:-1].replace(",", " ") left double spaces between labels and a
# trailing space after 1-tuples.
df = pd.DataFrame(
    {
        'business_id': [str(biz) for biz in test_data_frame['business']],
        'labels': [" ".join(str(lab) for lab in labels) for labels in y_predict_label],
    },
    columns=['business_id', 'labels'],
)

with open(data_root+"../submissions/skf_XGBd4n1000_submission_CNfc7_AGfc8.csv",'w') as f:
    df.to_csv(f, index=False)
    

In [9]:
# Per-label summary of the thresholded test predictions: how many businesses
# were assigned each label, and that count as a percentage of all businesses.
# The number of label columns is taken from the prediction matrix instead of
# being hard-coded.
n_attributes = y_predict.shape[1]
statistics = pd.DataFrame(columns=[ "attribuite "+str(i) for i in range(n_attributes)]+['num_biz'], index = ["biz count", "biz ratio"])
statistics.loc["biz count"] = np.append(np.sum(y_predict, axis=0), len(y_predict))
# NOTE: this is a global pandas display option; it stays in effect for every
# float shown after this cell runs.
pd.options.display.float_format = '{:.0f}%'.format
statistics.loc["biz ratio"] = statistics.loc["biz count"]*100/len(y_predict) 
statistics

Unnamed: 0,attribuite 0,attribuite 1,attribuite 2,attribuite 3,attribuite 4,attribuite 5,attribuite 6,attribuite 7,attribuite 8,num_biz
biz count,729,8125,8541,6848,1897,9066,9354,2186,5529,10000
biz ratio,7%,81%,85%,68%,19%,91%,94%,22%,55%,100%


## Ensemble of previous models

In [28]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import MultiLabelBinarizer

#data_root = '/media/sf_Yelp/input/'
data_root = '/home/long/Desktop/Yelp/input/'
mlb = MultiLabelBinarizer()
y_train= mlb.fit_transform(y_train)  #Convert list of labels to binary matrix


y_predict_BR  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_0.828991940194_BayesianRidge__submission_fc7_4096.csv",header=None)
y_predict_BR2  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_skf_BR_submission_CNfc7_AGfc8.csv",header=None)
y_predict_LoR  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_0.824536875218_LogisticRegressionCV_submission_fc7_4096.csv",header=None)
y_predict_XGB  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_0.823321762664_XGB_d4_n500_submission_fc7_4096.csv",header=None)
y_predict_XGB2  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_skf_XGBd4n1000_submission_CNfc7_AGfc8.csv",header=None)
y_predict_RF  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_0.807488855869_RF_d4_n1000_submission_fc7_4096.csv",header=None)
y_predict_RCcv  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_0.825110002933_RCcv_norm_submission_fc7_4096.csv",header=None)
y_predict_RCcv2  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_skf_RCcv_norm_submission_CNfc7_AGfc8.csv",header=None)
y_predict_RC  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_0.820068600663_RC_norm_submission_fc7_4096.csv",header=None)
y_predict_RC2  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_skf_RC_submission_fc7_4096.csv",header=None)
y_predict_PAC  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_0.812068670507_PAC_submission_fc7_4096.csv",header=None)
y_predict_PAC2  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_skf_PAC_n50_submission_fc7_4096.csv",header=None)

y_predict_Per  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_skf_Perceptron_submission_fc7_4096.csv",header=None)
y_predict_LSVC  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_skf_LinearSVC_submission_fc7_4096.csv",header=None)
y_predict_SGD  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_skf_SGD_submission_fc7_4096.csv",header=None)

#y_predict_LR  = pd.read_csv(data_root+"../submissions/ensemble/y_predict_0.762498569958_LR_submission_fc7_4096.csv",header=None)


ensemble_predict = (y_predict_BR + y_predict_BR2 + y_predict_LoR + y_predict_XGB + y_predict_XGB2 + 
                    y_predict_RF+ y_predict_RCcv+  y_predict_RCcv2 + y_predict_RC+ y_predict_RC2+
                    y_predict_PAC + y_predict_PAC2 + y_predict_Per + y_predict_LSVC + y_predict_SGD)/11
ensemble_predict = ensemble_predict >0.5
ensemble_predict = pd.DataFrame.as_matrix(ensemble_predict)
print "Samples of ensemble predict labels (in binary matrix):\n", ensemble_predict[0:3]
#print list(mlb.classes_)
y_predict_label = mlb.inverse_transform(ensemble_predict) #Convert binary matrix back to labels
print "Samples of y_predict_label (in binary matrix):\n", y_predict_label[0:3]


test_data_frame  = pd.read_csv(data_root+"test_biz_fc7features.csv")
df = pd.DataFrame(columns=['business_id','labels'])

for i in range(len(test_data_frame)):
    biz = test_data_frame.loc[i]['business']
    label = y_predict_label[i]
    label = str(label)[1:-1].replace(",", " ")
    df.loc[i] = [str(biz), label]

with open(data_root+"../submissions/ensemble_15models_scaled_correct_age_gender.csv",'w') as f:
    df.to_csv(f, index=False)  
    
    

Samples of ensemble predict labels (in binary matrix):
[[False  True  True  True False  True  True False False]
 [False  True  True  True False  True  True False  True]
 [False  True False  True False  True  True False  True]]
Samples of y_predict_label (in binary matrix):
[(1, 2, 3, 5, 6), (1, 2, 3, 5, 6, 8), (1, 3, 5, 6, 8)]
