In [1]:
#quality>5 = good
#quality<6 = bad
##Dataset from:
##P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
##Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.
##Author: Nishank Raisinghani
##Project: Using XGBoost and SciKit Learn to predict the quality of wine using a classifier


##Import
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn import tree
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler


##Load in data
df = pd.read_csv("winequality-white1.csv", sep=";")


##Make label data binary: turn 0-10 scale into good and bad
d = {0:0, 1:0, 2:0, 3:0, 4:0, 5:0, 6:1, 7:1, 8:1, 9:1, 10:1}
df['quality'] = df['quality'].map(d)

##Define your features
features = list(df.columns[:11])

##split data up into training and testing
from sklearn.model_selection import train_test_split
traindf, testdf = train_test_split(df, test_size=0.2)

##define your features and label values
scaler = StandardScaler()
x = traindf[features]
scaler.fit(x)
x=pd.DataFrame(scaler.transform(x))
y = traindf['quality']
testx = testdf[features]
scaler.fit(testx)
testx=pd.DataFrame(scaler.transform(testx))
testy = testdf['quality']

##Turn into DMatrix; this is how XGBoost likes its data
training = xgb.DMatrix(x, label =y)
testing = xgb.DMatrix(testx, label=testy)

##Enter in your parameters; I am training 2 models, one is a softmax classification and one is a binary logistic classification
param = {
    'max_depth': 4,
    'eta': 0.3,
    'objective': 'multi:softmax',
    'num_class': 2} 
epochs = 10 

model = xgb.train(param, training, epochs)

param2 = {
    'max_depth': 4,
    'eta': 0.2,
    'objective': 'binary:logistic'} 
epochs = 8 

model2 = xgb.train(param2, training, epochs)

##Run your model on the testing data
predictions = model.predict(testing)
predictions2 = model2.predict(testing)
predictionDF = DataFrame({'predictions':predictions})
predictionDF2 = DataFrame({'predictions':predictions2})

##Since the binary logistic will give you a value between 0 and 1, and you only want a value of 0 or 1 you need to map this data to make it 0s and 1s
predictionDF2['predictions'] = predictionDF2['predictions'].map(lambda x: 0.0 if x <0.5 else 1.0)

##Use these 0 and 1 values to turn the predictions into 'good' and 'bad' so it looks presentable
d = {1:'good', 0:'bad'}
predictionDF['predictions']=predictionDF['predictions'].map(d)
predictionDF2['predictions']=predictionDF2['predictions'].map(d)
testdf['quality']=testdf['quality'].map(d)

##return the 
testdf['predictions2'] = predictionDF2['predictions'].values
testdf['predictions']=predictionDF['predictions'].values
testdf.head(50)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdf['quality']=testdf['quality'].map(d)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdf['predictions2'] = predictionDF2['predictions'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdf['predictions']=predictionDF['predictions'].values


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,predictions2,predictions
4488,6.7,0.48,0.49,2.9,0.03,28.0,122.0,0.98926,3.13,0.4,13.0,good,good,good
3862,6.5,0.27,0.19,6.6,0.045,98.0,175.0,0.99364,3.16,0.34,10.1,good,good,good
2109,7.7,0.23,0.31,10.7,0.038,59.0,186.0,0.9969,3.12,0.55,9.5,good,good,good
4054,7.2,0.17,0.28,17.55,0.05,33.0,154.0,0.99971,2.94,0.43,9.0,good,good,good
4515,7.1,0.28,0.26,2.8,0.039,50.0,118.0,0.9908,3.06,0.59,11.2,good,good,good
4102,6.9,0.37,0.23,9.5,0.057,54.0,166.0,0.99568,3.23,0.42,10.0,bad,bad,bad
4796,6.4,0.105,0.29,1.1,0.035,44.0,140.0,0.99142,3.17,0.55,10.7,good,good,good
1787,7.2,0.26,0.3,2.1,0.033,50.0,158.0,0.9909,3.33,0.43,12.1,good,good,good
2475,7.4,0.56,0.09,1.5,0.071,19.0,117.0,0.99496,3.22,0.53,9.8,bad,bad,bad
3904,5.0,0.455,0.18,1.9,0.036,33.0,106.0,0.98746,3.21,0.83,14.0,good,good,good


In [2]:
##Use this to check the amount of error in the softmax classification
from sklearn.metrics import accuracy_score, jaccard_score
print(accuracy_score(testy, predictions))


0.7755102040816326
