In [25]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_regression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot
import numpy as np

In [26]:
# get a list of models to evaluate
def get_models():
	models = dict()
	models['lr'] = LogisticRegression()
	models['knn'] = KNeighborsClassifier()
	models['bayes'] = GaussianNB()
	models['cart'] = DecisionTreeClassifier()
	models['svm'] = SVC()
	return models


In [27]:
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
	scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
	return scores


In [28]:
# reading the files
X = pd.read_csv('Train.csv', sep=r'\s*,\s*')
X_test = pd.read_csv('Test.csv', sep=r'\s*,\s*')
X_sample = pd.read_csv('Test.csv', sep=r'\s*,\s*')

In [29]:
# data filling and using encoding for some fields
#X.fillna(X.mean(),inplace=True)
Y= X.breed_category
Y2 = X.pet_category
X.drop(['breed_category','pet_category','issue_date','listing_date','pet_id'],axis=1,inplace=True)
###
X_test.drop(['issue_date','listing_date','pet_id'],axis=1,inplace=True)

Col_to_encode = ['color_type']

X[Col_to_encode]= X[Col_to_encode].apply(lambda col:LabelEncoder().fit_transform(col))
###
X_test[Col_to_encode]= X_test[Col_to_encode].apply(lambda col:LabelEncoder().fit_transform(col))
#X_test.fillna(X_test.mean(), inplace=True)

In [35]:
#checking the model accuracy and mae scores
models = get_models()
results,names = list(),list()
for name, model in models.items():
	scores = evaluate_model(model, X, Y)
	results.append(scores)
	names.append(name)
	print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

>lr 0.759 (0.010)
>knn 0.779 (0.008)
>bayes 0.810 (0.009)
>cart 0.888 (0.007)
>svm 0.803 (0.009)


In [30]:
#predicting missing values for train data set
jcols = ['condition']
notnans = X[jcols].notnull().all(axis=1)
df_notnans = X[notnans]
icols = ['color_type','length','height','X1','X2']

train_X,val_X,train_Y,val_Y = train_test_split(df_notnans[icols],df_notnans[jcols],train_size=0.8,test_size=0.2,random_state=0)

#if multiple missing values then
#regr_multirf = MultiOutputRegressor(RandomForestRegressor(max_depth=30,random_state=0))
model_random = RandomForestRegressor(n_estimators=100, random_state=1)
model_random.fit(train_X,train_Y)
preds = model_random.predict(val_X)

df_nans = X.loc[~notnans].copy()
df_nans[jcols] = model_random.predict(df_nans[icols])
X.loc[~notnans] = df_nans.round().astype(int)
df_nans.round().astype(int).head()

Unnamed: 0,condition,color_type,length,height,X1,X2
2,2,15,0,41,15,4
21,1,15,0,14,11,4
35,1,53,0,22,13,9
38,1,15,1,8,0,6
43,1,2,0,30,9,2


In [31]:
#predicting missing values for test data set
jcols = ['condition']
notnans = X_test[jcols].notnull().all(axis=1)
df_notnans = X_test[notnans]
icols = ['color_type','length','height','X1','X2']

train_X,val_X,train_Y,val_Y = train_test_split(df_notnans[icols],df_notnans[jcols],train_size=0.8,test_size=0.2,random_state=0)

#if multiple missing values then
#regr_multirf = MultiOutputRegressor(RandomForestRegressor(max_depth=30,random_state=0))
model_random = RandomForestRegressor(n_estimators=100, random_state=1)
model_random.fit(train_X,train_Y)
preds = model_random.predict(val_X)

df_nans = X_test.loc[~notnans].copy()
df_nans[jcols] = model_random.predict(df_nans[icols])
X_test.loc[~notnans] = df_nans.round().astype(int)
df_nans.round().astype(int).head()

Unnamed: 0,condition,color_type,length,height,X1,X2
19,2,14,1,34,15,4
27,1,28,0,24,15,4
31,1,14,1,6,15,4
35,2,28,0,19,18,4
40,1,2,1,35,16,9


In [32]:
#after checking all the models best model is used and checking the mae scores

train_X,val_X,train_Y,val_Y = train_test_split(X,Y,train_size=0.8,test_size=0.2,random_state=0)

model_random = GradientBoostingClassifier()
model_random.fit(train_X,train_Y)
preds = model_random.predict(val_X)

me = mean_absolute_error(val_Y,preds)
r_score = r2_score(val_Y,preds)
print(me,r_score)

0.10061056543668702 0.7327310653947021


In [33]:
#predicting the test dataset and creating another test file for extra field prediction

model_random.fit(X,Y)
preds = model_random.predict(X_test)
output = pd.DataFrame({'pet_id':X_sample.pet_id,'condition':X_test.condition,'color_type':X_test.color_type,'length(m)':X_test.length,'height(cm)':X_test.height,'X1':X_test.X1,'X2':X_test.X2,'breed_category':preds.astype(int)})
output.to_csv('new_test.csv', index=False)

In [34]:
#now predicting for pet_category

X_test = pd.read_csv('new_test.csv')
X_sample = pd.read_csv('new_test.csv')
X_test.drop(['pet_id','breed_category'],axis=1,inplace=True)

# X2 = pd.read_csv('train.csv')
# X2.fillna(X.mean(),inplace=True)
# Y2 = X2.pet_category
# X2.drop(['pet_category','issue_date','listing_date','pet_id'],axis=1,inplace=True)
# Col_to_encode = ['color_type']
# X2[Col_to_encode]= X[Col_to_encode].apply(lambda col:LabelEncoder().fit_transform(col))

model_random.fit(X,Y2)
preds = model_random.predict(X_test)
output = pd.DataFrame({'pet_id':X_sample.pet_id,'breed_category':X_sample.breed_category,'pet_category':preds.astype(int)})
output.to_csv('submission.csv', index=False)