In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# 7.1a

In [None]:
# Load the coffee data, keep only the modelling columns, and drop incomplete rows.
select_lst = [
    'Total.Cup.Points', 'Species', 'Country.of.Origin', 'Processing.Method',
    'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance',
    'Uniformity', 'Moisture', 'altitude_mean_meters',
]
# The target column is listed first because the next cell picks it
# positionally via df.columns[0].
df = pd.read_csv('Coffee-modified.csv').filter(select_lst).dropna()
df.info()

In [None]:
# Split the frame into the target (first column) and the predictors (all remaining columns).
# reset_index(drop=True) gives both frames a fresh 0..n-1 index after the earlier dropna().
Y = df[[df.columns[0]]].reset_index(drop=True)
X = df[df.columns[1:]].reset_index(drop=True)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line.
X.info()
Y.info()

# 7.1b

## Process Y from values to Coffee Bean Grade
## define Bean_Grade = [1,2,3] 


In [None]:
# Convert the numeric cup score into three ordinal grades, using the 75th and
# 90th percentiles of the score distribution as cut-offs:
#   grade 1: score <  p75
#   grade 2: p75 <= score < p90
#   grade 3: score >= p90
scores = Y.iloc[:, 0].to_numpy(dtype=float)
rating_pctile = np.percentile(scores, [75, 90])

# np.digitize returns 0, 1 or 2 for the three half-open intervals above; +1 maps
# them to grades 1..3. This replaces a row-by-row loop that called float() on
# one-element arrays, a conversion NumPy >= 1.25 deprecates.
grade_codes = np.digitize(scores, rating_pctile) + 1
Bean_grade = grade_codes.tolist()

# Per-grade sample counts, ordered [grade 1, grade 2, grade 3].
NSamples = np.bincount(grade_codes, minlength=4)[1:4].tolist()
print(NSamples)



In [None]:
# Bar chart of the number of samples in each bean grade (1, 2, 3).
grade_axis = [1, 2, 3]
fig = px.bar(
    x=grade_axis,
    y=NSamples,
    color=NSamples,
    range_y=[0.0, 1000],
)
fig.show()

In [None]:
# Swap the raw cup score for the derived grade so Y holds only the class label.
Y = Y.assign(Bean_grade=Bean_grade).drop(columns=["Total.Cup.Points"])
Y

# 7.1 (c)

In [None]:
# Separate the predictors by dtype: object (string) columns vs numeric columns.
X_category = X.select_dtypes("object")
X_continuous = X.select_dtypes("number")
# info() prints directly and returns None; print(...) around it only emitted "None".
X_category.info()
X_continuous.info()


In [None]:
# Drop one feature out of every highly correlated pair of numeric features.
# Only the strict lower triangle of the correlation matrix is examined, so each
# pair is inspected once and the diagonal is ignored.
CORR_THRESHOLD = 0.8
corr = X_continuous.corr()  # computed once and reused
# Compare magnitudes: a strongly negative correlation is as redundant as a
# strongly positive one, and the signed comparison never flagged it.
lower = pd.DataFrame(np.tril(corr.abs(), -1), columns=corr.columns)
to_drop = [column for column in lower.columns if (lower[column] > CORR_THRESHOLD).any()]
X_continuous = X_continuous.drop(columns=to_drop)
to_drop


In [None]:
# Standardise the numeric features and one-hot encode the categorical ones.
# NOTE(review): the scaler is fitted on every row before the train/test split
# further down, so test-set statistics influence the scaling -- consider fitting
# on the training rows only.
standard_scaler = preprocessing.StandardScaler()
scaled_values = standard_scaler.fit_transform(X_continuous)
s_X = pd.DataFrame(scaled_values, columns=X_continuous.columns)
# drop_first=True omits the first level of each categorical column.
X_Encoded = pd.get_dummies(X_category, drop_first=True, columns=X_category.columns)
X_Encoded

In [None]:
# Assemble the final design matrix: one-hot columns followed by the scaled
# numeric columns. join() aligns the two frames on their index.
X = X_Encoded.join(s_X)
# info() prints its own report and returns None, so it is not wrapped in print().
X.info()
Y.info()

In [None]:
# Hold out 30% of the samples for testing; the fixed seed makes the split reproducible.
seed = 55
# Pass the label as a 1-D Series: scikit-learn estimators expect y of shape
# (n_samples,) and emit DataConversionWarning for a single-column DataFrame.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y["Bean_grade"], test_size=0.3, random_state=seed
)

# 7.2a KNN Model

In [None]:

from sklearn.metrics import r2_score

# Candidate neighbourhood sizes; swept in the "knn loop" cell further down.
k = [1, 3, 5, 7, 9, 11, 13, 19, 25, 35]

# Baseline KNN classifier with 11 neighbours and p=2.
modelKNN = KNeighborsClassifier(p=2, n_neighbors=11)
modelKNN.fit(x_train, y_train)

# Accuracy on the held-out test split.
y_pred = modelKNN.predict(x_test)
KNNScore = accuracy_score(y_test, y_pred)
KNNScore

In [None]:
# Per-class breakdown of the predictions currently held in y_pred.
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix: ')
print(cm)
print('Classification Report: ')
print(report)

## knn loop

In [None]:
# Sweep every candidate k and record the test accuracy of each fitted KNN model.
KNN_score_list = []
for n_neighbors in k:
    modelKNN = KNeighborsClassifier(n_neighbors=n_neighbors, p=2)
    modelKNN.fit(x_train, y_train)
    y_pred = modelKNN.predict(x_test)
    KNNScore = accuracy_score(y_test, y_pred)
    KNN_score_list.append(KNNScore)

# Accuracy per k; the y-axis range is fixed to [0.7, 1.0].
fig = px.bar(
    KNN_score_list,
    x=k,
    y=KNN_score_list,
    color=KNN_score_list,
    range_y=[0.7, 1.0],
)
fig.show()


# 7.2b Decision Tree Model Training and Testing

In [None]:
# Decision-tree search space: maximum depths and split criteria to try.
maxD = [4, 5, 6, None]
ASM_function = ['entropy', 'gini']

In [None]:
# Train a baseline decision tree (entropy criterion, depth limited to 4).
# random_state is fixed because the tree permutes features at each split even
# with splitter='best', so an unseeded fit can differ between runs.
ModelDT = DecisionTreeClassifier(criterion='entropy', splitter='best',
                                 max_depth=4, random_state=seed)
ModelDT.fit(x_train, y_train)
# Evaluate on the held-out test split.
y_pred = ModelDT.predict(x_test)
DTScore = accuracy_score(y_test, y_pred)
DTScore

In [None]:
# Print the confusion matrix and classification report for the decision-tree
# predictions currently held in y_pred.
print('Confusion Matrix: ')
print(confusion_matrix(y_test, y_pred))
print('Classification Report: ')
print(classification_report(y_test, y_pred))


## loop

In [None]:
# Evaluate every (criterion, max_depth) combination and record its test accuracy.
# Scores are appended in loop order: all depths for the first criterion, then
# all depths for the next one.
DTScore_list = []
for criterion in ASM_function:
    for depth in maxD:
        # Fixed random_state so the same settings always produce the same tree.
        ModelDT = DecisionTreeClassifier(criterion=criterion, splitter='best',
                                         max_depth=depth, random_state=seed)
        ModelDT.fit(x_train, y_train)
        y_pred = ModelDT.predict(x_test)
        DTScore = accuracy_score(y_test, y_pred)
        DTScore_list.append(DTScore)
        # Label each score so it can be matched to the settings that produced it.
        print('criterion=%s max_depth=%s accuracy=%f' % (criterion, depth, DTScore))


In [None]:
# Visualize the most recently fitted decision tree (ModelDT).
from sklearn import tree

feature_names = x_train.columns
Labels = np.unique(y_train)
# Class names are passed to plot_tree as strings.
label = [str(class_value) for class_value in Labels]
print(feature_names)
print(Labels)

# Size only this figure. Assigning rcParams['figure.figsize'] would instead
# change the default size of every figure created later in the notebook.
tree_fig, tree_ax = plt.subplots(figsize=(100, 50))
# feature_names is converted to a plain list because some scikit-learn
# versions reject a pandas Index for this argument.
tree.plot_tree(ModelDT, feature_names=list(feature_names), class_names=label,
               rounded=True, filled=True, fontsize=9, ax=tree_ax)
plt.show()

# 7.2c Random forest model training 

In [None]:
# Random-forest settings: split criteria to compare, number of trees,
# number of parallel jobs, and the random seed.
ASM_function = ['entropy', 'gini']
nEstimator, nJob, rState = 100, 8, 10

In [None]:
# Fit a baseline random forest using the entropy criterion.
RandomF = RandomForestClassifier(
    n_estimators=nEstimator,
    criterion='entropy',
    random_state=rState,
    n_jobs=nJob,
)
RandomF.fit(x_train, y_train)
# Score the forest on the held-out test split.
y_pred = RandomF.predict(x_test)
RFScore = accuracy_score(y_test, y_pred)
RFScore

In [None]:
# Confusion matrix and classification report for the predictions currently in y_pred.
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix: ')
print(cm)
print('Classification Report: ')
print(report)

In [None]:
# Compare the split criteria for the random forest, recording test accuracy for each.
# RandomF stays bound to the last forest fitted here; the following cells read it.
RFScore_list = []
for criterion in ASM_function:
    RandomF = RandomForestClassifier(
        criterion=criterion,
        n_estimators=nEstimator,
        n_jobs=nJob,
        random_state=rState,
    )
    RandomF.fit(x_train, y_train)
    y_pred = RandomF.predict(x_test)
    RFScore = accuracy_score(y_test, y_pred.ravel())
    RFScore_list.append(RFScore)
    print(RFScore)

## Visualize Feature Importance Scores

In [None]:
# Rank the features by the importance scores of the current RandomF model, highest first.
importances = pd.Series(RandomF.feature_importances_, index=feature_names)
feature_imp = importances.sort_values(ascending=False)

# Horizontal bars: one per feature, length equal to its importance.
plt.figure(figsize=(15, 15))
sns.barplot(x=feature_imp, y=feature_imp.index)

## Visualize selected estimator 

In [None]:
# Draw the first five trees of the forest side by side, one per subplot.
# NOTE(review): a 100x20 inch figure at dpi=500 is a 50000x10000 pixel canvas --
# consider lowering dpi if rendering is slow or runs out of memory.
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(100, 20), dpi=500)
for index, ax in enumerate(axes):
    tree.plot_tree(RandomF.estimators_[index], class_names=label, filled=True, ax=ax)
    # Title every subplot. This call used to sit outside the loop, so only the
    # last subplot received a title.
    ax.set_title('Estimator:' + str(index), fontsize=11)

# Hyperparameter Tuning

In [None]:
# Estimators to tune, keyed by a short name.
# The tree-based models get fixed seeds so the search results are repeatable.
classification = {
    'KNN': KNeighborsClassifier(),
    'DT': DecisionTreeClassifier(random_state=seed),
    'RF': RandomForestClassifier(random_state=rState),
}

# Parameter grid for KNN.
K_list = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 35, 45]
KNN_param = dict(n_neighbors=K_list)

# Parameter grid for the decision tree.
ASM_function = ['entropy', 'gini']
maxD = [4, 5, 6, None]
# 'sqrt' replaces 'auto': scikit-learn deprecated max_features='auto' in 1.1 and
# removed it in 1.3, so every fit using it fails on current versions. For these
# classifiers 'auto' meant 'sqrt', so the search space is unchanged.
maxF = ['sqrt', 'log2', None]
minSample = [1, 2, 4]
DT_param = dict(criterion=ASM_function, max_depth=maxD,
                min_samples_leaf=minSample, max_features=maxF)

# Parameter grid for the random forest: the tree grid plus the number of trees.
nEst = [10, 30, 50, 100]
RF_param = dict(n_estimators=nEst, criterion=ASM_function, max_depth=maxD,
                min_samples_leaf=minSample, max_features=maxF)

# Show the tunable parameter names of a decision tree for reference.
DecisionTreeClassifier().get_params().keys()


## Perform GridSearchCV()

In [None]:
# List the names of the models queued for tuning.
for model_name in classification:
    print(model_name)

In [None]:
# Run a 5-fold accuracy grid search for every model, each with its own parameter grid.
param_grids = {'KNN': KNN_param, 'DT': DT_param}
# Keep every fitted search. Previously grid_result was overwritten on each
# iteration, so only the last model's search survived the loop.
grid_results = {}
for model in classification:
    # Any name without its own grid falls back to the random-forest grid, as before.
    param = param_grids.get(model, RF_param)
    grid = GridSearchCV(estimator=classification[model], n_jobs=8, verbose=10,
                        scoring='accuracy', cv=5, param_grid=param)
    grid_result = grid.fit(x_train, y_train)
    grid_results[model] = grid_result
    print(model, 'best params:', grid_result.best_params_,
          'best score:', grid_result.best_score_)
# grid and grid_result stay bound to the last search, so cells that read them keep working.

In [None]:
# Report the winning configuration of the search held in grid_result.
best_params, best_score = grid_result.best_params_, grid_result.best_score_
print('Best params: ', best_params)
print('Best score: ', best_score)

In [None]:
# Print the mean and standard deviation of the cross-validated test score for
# every parameter combination in grid_result.
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))