In [1]:
import pandas as pd
import numpy as np
import re

In [3]:
# Read the labelled training set from the working directory and preview its first two rows.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
train_data.head(n=2)

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2


In [4]:
# Read the unlabelled test set from the working directory and preview its first two rows.
test_data = pd.read_csv(filepath_or_buffer='test.csv')
test_data.head(n=2)

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2
0,ANSL_75005,2005-08-17 00:00:00,2017-09-07 15:35:00,0.0,Black,0.87,42.73,0,7
1,ANSL_76663,2018-11-15 00:00:00,2019-05-08 17:24:00,1.0,Orange Tabby,0.06,6.71,0,1


In [5]:
# Replace each colour name in the training frame with the integer code of its category.
train_data['color_type'] = train_data['color_type'].astype('category').cat.codes

In [6]:

# Encode the test colours with the vocabulary of the TRAINING file so that a given colour
# name maps to the same integer in both frames. Building the categories from the test
# frame alone numbers them independently, so the codes stop lining up whenever the two
# files do not contain exactly the same set of colours.
# train_data['color_type'] may already hold integer codes by the time this cell runs,
# so the colour names are re-read from train.csv.
train_color_names = pd.read_csv('train.csv', usecols=['color_type'])['color_type']
train_color_categories = pd.Categorical(train_color_names).categories
# Colours that never occur in the training file receive code -1 (the code also used for NaN).
test_data['color_type'] = pd.Categorical(
    test_data['color_type'], categories=train_color_categories
).codes

In [7]:
# Fill missing 'condition' values in BOTH frames with the median of the training data.
# Using a separate median per frame would make the imputed value differ between the
# data the model is fitted on and the data it is applied to.
condition_median = train_data['condition'].median()
train_data['condition'] = train_data['condition'].fillna(condition_median)
test_data['condition'] = test_data['condition'].fillna(condition_median)

In [8]:
def add_datepart(df, fldname, drop=True):
    """Expand a date column of ``df`` into numeric calendar features, in place.

    The column is parsed with ``pd.to_datetime`` when it is not already a
    datetime column. A trailing ``date``/``Date`` is stripped from ``fldname``
    to form the prefix of the new columns, e.g. ``'issue_date'`` produces
    ``issue_Year``, ``issue_Month``, ``issue_Week``, ``issue_Day``,
    ``issue_Dayofweek``, ``issue_Dayofyear`` and ``issue_Elapsed``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; new columns are added to this object.
    fldname : str
        Name of the column holding the dates.
    drop : bool, default True
        When true, remove the original ``fldname`` column afterwards.

    Returns
    -------
    None
        ``df`` is mutated in place.
    """
    fld = df[fldname]
    # is_datetime64_any_dtype also recognises timezone-aware columns, whose
    # extension dtype np.issubdtype cannot interpret.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear'):
        attr = n.lower()
        if attr == 'week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week no longer exists in pandas 2.x; isocalendar().week
            # is its replacement. Cast to the dtype of .dt.day so the column
            # has the same kind of dtype as the other date parts.
            values = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            values = getattr(fld.dt, attr)
        df[targ_pre + n] = values
    # Seconds since the Unix epoch. The values are normalised to nanosecond
    # resolution first so the division is right whatever unit the column uses.
    epoch_ns = fld.values.astype('datetime64[ns]').astype(np.int64)
    df[targ_pre + 'Elapsed'] = epoch_ns // 10**9
    if drop:
        df.drop(fldname, axis=1, inplace=True)

In [9]:
# Expand both date columns into calendar features, first for the training frame and
# then for the test frame (add_datepart modifies each frame in place).
for frame in (train_data, test_data):
    for date_column in ('issue_date', 'listing_date'):
        add_datepart(frame, date_column)

In [10]:
# The two target columns to be predicted jointly.
train_labels = train_data.loc[:, ['breed_category', 'pet_category']]

In [75]:
# Model inputs: the same ten columns are selected from the training and the test frame.
feature_columns = [
    'condition', 'color_type', 'length(m)', 'height(cm)', 'X1', 'X2',
    'issue_Dayofyear', 'issue_Elapsed',
    'listing_Dayofyear', 'listing_Elapsed',
]

train_features = train_data[feature_columns]
test_features = test_data[feature_columns]

In [76]:
from sklearn.model_selection import train_test_split,cross_val_score
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_features,
    train_labels,
    test_size=0.2,
    random_state=42,
)

In [77]:
from sklearn.datasets import make_multilabel_classification
# Generate a synthetic multilabel sample.
# NOTE(review): X and y are not referenced by any other cell visible in this notebook.
X, y = make_multilabel_classification(
    sparse=True,
    n_labels=20,
    return_indicator='sparse',
    allow_unlabeled=False,
)

In [78]:
# Display the (rows, columns) dimensions of the training split as a sanity check.
X_train.shape

(15067, 10)

In [79]:
from skmultilearn.problem_transform import BinaryRelevance,LabelPowerset,ClassifierChain
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [80]:
# Three multi-label wrappers around a random forest, so they can be compared on the
# same train/validation split. random_state pins the forests' random sampling so that
# re-running the notebook reproduces the same predictions.
clf_LabelPowerSet = LabelPowerset(RandomForestClassifier(random_state=42))

# Hyper-parameters follow the best_params_ reported by the randomised search in this notebook.
clf_ClassifierChain = ClassifierChain(RandomForestClassifier(
                            n_estimators=600,
                            min_samples_split=2,
                            min_samples_leaf=4,
                            # 'sqrt' is what 'auto' meant for classifiers; recent
                            # scikit-learn releases no longer accept 'auto'.
                            max_features='sqrt',
                            max_depth=40,
                            bootstrap=True,
                            random_state=42))

clf_BinaryRelevance = BinaryRelevance(RandomForestClassifier(random_state=42))

In [81]:
# Fit the BinaryRelevance model on the training split and predict the hold-out split.
# NOTE(review): predictioned_Label is reassigned by the ClassifierChain cell that follows
# and is not scored anywhere in the visible notebook.
clf_BinaryRelevance.fit(X_train, y_train)
predictioned_Label = clf_BinaryRelevance.predict(X_test)



In [82]:
# Fit the ClassifierChain model on the training split and predict the hold-out split.
# This rebinds predictioned_Label, replacing the BinaryRelevance predictions stored earlier.
clf_ClassifierChain.fit(X_train, y_train)
predictioned_Label = clf_ClassifierChain.predict(X_test)

In [83]:
# Fit the LabelPowerset model on the training split and predict the hold-out split.
# NOTE(review): the result goes to predictions_Label (not predictioned_Label, the name
# used by the other two models); neither name is read again in the visible notebook.
clf_LabelPowerSet.fit(X_train, y_train)
predictions_Label = clf_LabelPowerSet.predict(X_test)



In [84]:
# Predict both targets for the competition test features with the fitted ClassifierChain.
# The trailing expression displays the returned object (a sparse matrix, one column per target).
final_pred_val = clf_ClassifierChain.predict(test_features)
final_pred_val

<8072x2 sparse matrix of type '<class 'numpy.float64'>'
	with 12178 stored elements in Compressed Sparse Column format>

In [85]:
# Convert the sparse prediction matrix to a dense array and wrap it in a DataFrame
# (columns 0 and 1). The name final_pred_val is rebound to the DataFrame.
dense_predictions = final_pred_val.toarray()
final_pred_val = pd.DataFrame(dense_predictions)

In [86]:
# Copy the predicted columns onto the test frame: column 0 -> breed_category,
# column 1 -> pet_category.
for position, target in enumerate(['breed_category', 'pet_category']):
    test_data[target] = final_pred_val[position]

In [87]:
# Keep only the identifier and the two predicted targets for the submission file.
submission_columns = ['pet_id', 'breed_category', 'pet_category']
final_test_data = test_data[submission_columns]

In [88]:
# Write the submission to the working directory without the DataFrame index column.
final_test_data.to_csv(path_or_buf='Multiple_label_Rand_clf.csv', index=False)

In [52]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate values for each random-forest hyper-parameter explored by the randomised search.

# Number of trees: ten evenly spaced values from 200 to 2000.
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()
# Feature-subset strategy considered at each split.
max_features = ['auto', 'sqrt']
# Tree depth limits: 10 to 110 in eleven steps, plus None for unlimited depth.
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]
# Minimum samples needed to split a node / to remain in a leaf.
min_sample_slit = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
# Whether each tree is grown on a bootstrap sample.
bootStrap = [True, False]

random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_sample_slit,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootStrap,
)

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [53]:
from sklearn.ensemble import RandomForestRegressor
# Randomised hyper-parameter search over random_grid: 100 sampled settings, each
# evaluated with 3-fold cross-validation (300 fits in total) on all available cores.
# NOTE(review): the search tunes a RandomForestRegressor although both targets are
# category labels and the chosen settings are later given to a RandomForestClassifier;
# options such as max_features='auto' may not mean the same thing for the two estimator
# types - confirm against the scikit-learn documentation before reusing the result.
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

rf_random.fit(train_features, train_labels)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 38.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 86.2min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [54]:
# Display the hyper-parameter combination that scored best in the randomised search.
rf_random.best_params_

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 40,
 'bootstrap': True}