In [2]:
#quality>5 = good
#quality<6 = bad
##Dataset from:
##P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
##Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.
##Author: Nishank Raisinghani
##Project: Using XGBoost and SciKit Learn to predict the quality of wine using a classifier


##Import
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn import tree
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler


##Load in data
df = pd.read_csv("winequality-white.csv")


##Make label data binary: turn 0-10 scale into good and bad
d = {0:0, 1:0, 2:0, 3:0, 4:0, 5:0, 6:1, 7:1, 8:1, 9:1, 10:1}
df['quality'] = df['quality'].map(d)

##Define your features
features = list(df.columns[:11])

##split data up into training and testing
from sklearn.model_selection import train_test_split
traindf, testdf = train_test_split(df, test_size=0.2)

##define your features and label values
scaler = StandardScaler()
x = traindf[features]
scaler.fit(x)
x=pd.DataFrame(scaler.transform(x))
y = traindf['quality']
testx = testdf[features]
scaler.fit(testx)
testx=pd.DataFrame(scaler.transform(testx))
testy = testdf['quality']

##Turn into DMatrix; this is how XGBoost likes its data
training = xgb.DMatrix(x, label =y)
testing = xgb.DMatrix(testx, label=testy)

##Enter in your parameters; I am training 2 models, one is a softmax classification and one is a binary logistic classification
param = {
    'max_depth': 4,
    'eta': 0.1,
    'objective': 'multi:softmax',
    'num_class': 2} 
epochs = 10 

model = xgb.train(param, training, epochs)

param2 = {
    'max_depth': 4,
    'eta': 0.2,
    'objective': 'binary:logistic'} 
epochs = 8 

model2 = xgb.train(param2, training, epochs)

##Run your model on the testing data
predictions = model.predict(testing)
predictions2 = model2.predict(testing)
predictionDF = DataFrame({'predictions':predictions})
predictionDF2 = DataFrame({'predictions':predictions2})

##Since the binary logistic will give you a value between 0 and 1, and you only want a value of 0 or 1 you need to map this data to make it 0s and 1s
predictionDF2['predictions'] = predictionDF2['predictions'].map(lambda x: 0.0 if x <0.5 else 1.0)

##Use these 0 and 1 values to turn the predictions into 'good' and 'bad' so it looks presentable
d = {1:'good', 0:'bad'}
predictionDF['predictions']=predictionDF['predictions'].map(d)
predictionDF2['predictions']=predictionDF2['predictions'].map(d)
testdf['quality']=testdf['quality'].map(d)

##return the 
testdf['predictions2'] = predictionDF2['predictions'].values
testdf['predictions']=predictionDF['predictions'].values
testdf.head(50)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdf['quality']=testdf['quality'].map(d)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdf['predictions2'] = predictionDF2['predictions'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdf['predictions']=predictionDF['predictions'].values


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,predictions2,predictions
1379,6.9,0.46,0.2,0.9,0.054,5.0,126.0,0.992,3.1,0.42,10.4,good,bad,bad
2254,6.2,0.26,0.19,3.4,0.049,47.0,172.0,0.9924,3.14,0.43,10.4,good,good,good
1159,7.8,0.3,0.29,16.85,0.054,23.0,135.0,0.9998,3.16,0.38,9.0,good,bad,bad
2518,7.1,0.12,0.3,3.1,0.018,15.0,37.0,0.99004,3.02,0.52,11.9,good,good,good
3700,7.0,0.3,0.27,1.5,0.076,24.0,145.0,0.99344,3.1,0.52,10.1,bad,bad,bad
726,6.4,0.24,0.31,2.8,0.038,41.0,114.0,0.99155,3.37,0.66,11.7,good,good,good
1002,6.4,0.28,0.43,7.1,0.045,60.0,221.0,0.9952,3.09,0.45,9.4,good,bad,bad
2251,7.4,0.18,0.29,1.4,0.042,34.0,101.0,0.99384,3.54,0.6,10.5,good,good,good
388,6.7,0.21,0.32,5.4,0.047,29.0,140.0,0.995,3.39,0.46,9.7,good,good,good
149,6.9,0.25,0.3,4.1,0.054,23.0,116.0,0.994,2.99,0.38,9.4,good,bad,good


In [8]:
##Use this to check the amount of error in the softmax classification
from sklearn.metrics import accuracy_score, jaccard_score
print(accuracy_score(testy, predictions))


0.7438775510204082
