# PART3: Multilabel-Classification using scikit-learn SVM

## Load Data

In [1]:
import numpy as np
import pandas as pd 

# Root directory that holds the competition CSV files. This is a
# machine-specific absolute path; adjust it to the local data location.
data_root = '/home/ncchen/Kaggle-Yelp/input/'

# Photo -> business mapping: once with the default integer index and once
# keyed by photo_id. The keyed frame is derived from the first one so the
# CSV is parsed only a single time.
train_photos = pd.read_csv(data_root+'train_photo_to_biz_ids.csv')
train_photo_to_biz = train_photos.set_index('photo_id')

# Per-business tables. The 'label' and 'feature vector' columns are read as
# strings and are parsed into numeric data further down in this cell, which
# is where y_train, X_train and X_test are built.
train_df = pd.read_csv(data_root+"train_biz_fc7features.csv")
test_df  = pd.read_csv(data_root+"test_biz_fc7features.csv")

def convert_label_to_array(str_label):
    """Parse a stringified label tuple such as "(1, 2, 6)" into a list of ints.

    The first and last characters (the enclosing brackets) are dropped and the
    remainder is split on commas. Tokens that are empty or contain only
    whitespace are skipped, so "()", "( )", "(3,)" and "(1, 2, )" are all
    accepted.

    Parameters
    ----------
    str_label : str
        Label text wrapped in one opening and one closing bracket character.

    Returns
    -------
    list of int
        The labels in their original order; empty when no label is present.

    Raises
    ------
    ValueError
        If a non-blank token cannot be converted with int().
    """
    tokens = str_label[1:-1].split(',')
    # Filter on the stripped token: a bare len() check lets " " through and
    # int(" ") raises ValueError.
    return [int(token) for token in tokens if token.strip()]

def convert_feature_to_vector(str_feature):
    """Turn a stringified feature list like "[0.1, 0.2]" into a list of floats.

    The enclosing bracket characters (first and last character) are removed,
    the remainder is split on commas and every piece is passed to float().
    A ValueError from float() propagates for pieces that are not numeric,
    which includes the single empty piece produced by "[]".
    """
    inner = str_feature[1:-1]
    return list(map(float, inner.split(',')))

# Parse the stringified columns into numeric data.
#
# Each business has a variable number of labels, so y_train is a 1-D object
# array holding one Python list per row. It is filled element by element
# because np.array() on ragged nested lists raises ValueError on recent NumPy
# releases unless dtype=object is given, and dtype=object alone would produce
# a 2-D array if every row happened to have the same length.
label_lists = [convert_label_to_array(y) for y in train_df['label']]
y_train = np.empty(len(label_lists), dtype=object)
for row, labels in enumerate(label_lists):
    y_train[row] = labels

# Feature vectors are stacked into 2-D float arrays (one row per business).
X_train = np.array([convert_feature_to_vector(x) for x in train_df['feature vector']])
X_test = np.array([convert_feature_to_vector(x) for x in test_df['feature vector']])


In [2]:
print "X_train: ", X_train.shape
print "y_train: ", y_train.shape
print "X_test: ", X_test.shape
print "train_df:"
train_df[0:5]

X_train:  (1996, 4096)
y_train:  (1996,)
X_test:  (10000, 1000)
train_df:


Unnamed: 0,business,label,feature vector
0,1000,"(1, 2, 3, 4, 5, 6, 7)","[0.19977248, 0.43287012, 0.22732441, 0.3551769..."
1,1001,"(0, 1, 6, 8)","[0.0, 0.58892941, 0.53906041, 0.17221935, 0.01..."
2,100,"(1, 2, 4, 5, 6, 7)","[0.11154944, 0.034822457, 0.1202566, 0.5201095..."
3,1006,"(1, 2, 4, 5, 6)","[0.078059368, 0.054452561, 0.056381688, 0.6942..."
4,1010,"(0, 6, 8)","[0.39656404, 0.279632, 0.0, 0.1720508, 0.36192..."


## Train an SVM on 80% of the training data and assess performance (F1 score)

In [3]:
from sklearn import svm, datasets
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (available since 0.18). Fall back to the
# old module only for installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# Convert the per-business label lists into a binary indicator matrix
# (one column per class found in y_train).
mlb = MultiLabelBinarizer()
y_ptrain= mlb.fit_transform(y_train)  #Convert list of labels to binary matrix

# Hold out 20% of the businesses for evaluation. The RandomState is seeded so
# the split is the same on every run. Note that y_ptrain is rebound here from
# the full indicator matrix to its 80% training part.
random_state = np.random.RandomState(0)
X_ptrain, X_ptest, y_ptrain, y_ptest = train_test_split(X_train, y_ptrain, test_size=.2,random_state=random_state)

# One linear-kernel SVC per label column (one-vs-rest).
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
classifier.fit(X_ptrain, y_ptrain)

# Binary indicator predictions for the held-out businesses.
y_ppredict = classifier.predict(X_ptest)


In [4]:
print "Samples of predicted labels (in binary matrix):\n", y_ppredict[0:3]
print "\nSamples of predicted labels:\n", mlb.inverse_transform(y_ppredict[0:3])

Samples of predicted labels (in binary matrix):
[[0 0 0 0 0 1 1 0 1]
 [0 0 1 0 0 0 1 0 1]
 [0 1 1 0 0 1 1 0 1]]

Samples of predicted labels:
[(5, 6, 8), (2, 6, 8), (1, 2, 5, 6, 8)]


In [5]:
# Summarise the hold-out predictions per attribute: how many businesses were
# predicted to have it ("biz count") and that count as a percentage of all
# hold-out businesses ("biz ratio").
# The number of attribute columns is taken from the prediction matrix instead
# of being hard-coded, so the row assignment below always has a matching length.
n_attributes = y_ppredict.shape[1]
n_biz = len(y_ppredict)
statistics = pd.DataFrame(columns=[ "attribuite "+str(i) for i in range(n_attributes)]+['num_biz'], index = ["biz count", "biz ratio"])
statistics.loc["biz count"] = np.append(np.sum(y_ppredict, axis=0), n_biz)
# NOTE: this sets a pandas-wide display option, so it also affects how floats
# are rendered in every DataFrame displayed after this cell.
pd.options.display.float_format = '{:.0f}%'.format
statistics.loc["biz ratio"] = statistics.loc["biz count"]*100/n_biz
statistics

Unnamed: 0,attribuite 0,attribuite 1,attribuite 2,attribuite 3,attribuite 4,attribuite 5,attribuite 6,attribuite 7,attribuite 8,num_biz
biz count,133,200,204,208,116,248,265,110,244,400
biz ratio,33%,50%,51%,52%,29%,62%,66%,28%,61%,100%


In [6]:
from sklearn.metrics import f1_score

print "F1 score: ", f1_score(y_ptest, y_ppredict, average='micro') 
print "Individual Class F1 score: ", f1_score(y_ptest, y_ppredict, average=None)

F1 score:  0.796372147455
Class-wise F1 score:  [ 0.66911765  0.78680203  0.84848485  0.62468514  0.76363636  0.8313253
  0.91353383  0.72972973  0.85420945]


## Retrain an SVM on all of the training data and make predictions on the test set

In [3]:
## Uncomment the imports below if the previous (80/20) training cell was skipped
#from sklearn import svm
#from sklearn.preprocessing import label_binarize
#from sklearn.multiclass import OneVsRestClassifier
#from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
# Keep the raw label lists in y_train and store the indicator matrix under its
# own name. Overwriting y_train would make this cell non-idempotent: a second
# run (or running the 80/20 cell afterwards) would binarize the already-binary
# matrix instead of the original labels.
y_train_bin = mlb.fit_transform(y_train)  #Convert list of labels to binary matrix

random_state = np.random.RandomState(0)  # not passed to any estimator in this cell
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
classifier.fit(X_train, y_train_bin)

# Binary indicator predictions for every test business.
y_predict = classifier.predict(X_test)

#print list(mlb.classes_)
y_predict_label = mlb.inverse_transform(y_predict) #Convert binary matrix back to labels

In [8]:
# Display the dimensions of the test feature matrix as the cell output.
np.shape(X_test)

(10000, 4096)

In [6]:
# Build the submission file: one row per test business with its predicted
# labels written as a single-space-separated string.
test_data_frame  = pd.read_csv(data_root+"test_biz_fc7features.csv")

# The frame is built in one step instead of appending rows with df.loc[i],
# which re-allocates on every insert. Labels are joined explicitly rather than
# by slicing str(tuple): that form yields double spaces, a trailing "5 " for a
# single label, and embeds the element repr (e.g. "np.int64(5)" on NumPy 2).
df = pd.DataFrame(
    {
        'business_id': test_data_frame['business'].astype(str),
        'labels': [" ".join(str(int(l)) for l in labels) for labels in y_predict_label],
    },
    columns=['business_id', 'labels'],
)

df.to_csv(data_root+"submission_fc7.csv", index=False)
    

In [7]:
# Summarise the test-set predictions per attribute: how many businesses were
# predicted to have it ("biz count") and that count as a percentage of all
# test businesses ("biz ratio").
# The number of attribute columns is taken from the prediction matrix instead
# of being hard-coded, so the row assignment below always has a matching length.
n_attributes = y_predict.shape[1]
n_biz = len(y_predict)
statistics = pd.DataFrame(columns=[ "attribuite "+str(i) for i in range(n_attributes)]+['num_biz'], index = ["biz count", "biz ratio"])
statistics.loc["biz count"] = np.append(np.sum(y_predict, axis=0), n_biz)
# NOTE: this sets a pandas-wide display option, so it also affects how floats
# are rendered in every DataFrame displayed after this cell.
pd.options.display.float_format = '{:.0f}%'.format
statistics.loc["biz ratio"] = statistics.loc["biz count"]*100/n_biz
statistics

Unnamed: 0,attribuite 0,attribuite 1,attribuite 2,attribuite 3,attribuite 4,attribuite 5,attribuite 6,attribuite 7,attribuite 8,num_biz
biz count,1408,6211,6991,5238,2034,8293,9043,1635,6104,10000
biz ratio,14%,62%,70%,52%,20%,83%,90%,16%,61%,100%


In [None]:
# Leaderboard (LB) score: 0.76437 (using fc7-layer features)
# Leaderboard (LB) score: 0.73053 (using prob-layer features)