In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the cacao ratings CSV into the working frame `df`.
# NOTE: the column headers are used verbatim by later cells and contain literal
# line breaks (e.g. 'Bean\nType') and, for the company column, a non-breaking
# space ('Company\xa0\n(Maker-if known)').
df = pd.read_csv('../input/cacaos/flavors_of_cacao.csv')
df.head()

In [None]:
# Column dtypes and non-null counts for the freshly loaded frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Show the exact header strings; later cells must index with these verbatim,
# including the embedded '\n' / '\xa0' characters.
df.columns

In [None]:
# How many rows carry each distinct bean-type label.
df['Bean\nType'].value_counts()

In [None]:
# Number of distinct bean-type labels (nunique() does not count NaN by default).
df['Bean\nType'].nunique()

In [None]:
# Annotated correlation heatmap of the numeric columns.
# The numeric subset is selected explicitly: pandas >= 2.0 no longer skips
# text columns in DataFrame.corr() and raises on them instead. select_dtypes
# gives the same "numeric columns only" result on older versions too.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Report the cardinality of every descriptive column.
# Each pair is (label to print, raw CSV header used to index `df`); the headers
# contain embedded line breaks (and a non-breaking space for the company
# column), which is why the printed labels are spelled out separately.
print('Unique Values:')
for label, column in [
    ('Company (Maker-if known): ', 'Company\xa0\n(Maker-if known)'),
    ('Specific Bean Origin or Bar Name: ', 'Specific Bean Origin\nor Bar Name'),
    ('Company Location: ', 'Company\nLocation'),
    ('Bean Type: ', 'Bean\nType'),
    # This label previously lacked the ': ' separator used by all the others.
    ('Broad Bean Origin: ', 'Broad Bean\nOrigin'),
    ('Review Date: ', 'Review\nDate'),
    ('Cocoa Percent: ', 'Cocoa\nPercent'),
]:
    print(label, df[column].nunique())

In [None]:
# Distribution of the raw ratings: one bar per distinct rating value.
sns.countplot(x='Rating', data=df)

In [None]:
# Number of reviews per value of the review-date column.
sns.countplot(x='Review\nDate', data=df)

In [None]:
# Convert the textual cocoa percentages into integers.
# regex=False makes both replacements literal: read as a regular expression,
# '.' matches *any* character, and the default value of `regex` differs
# between pandas versions, so the intent is stated explicitly.
# NOTE: removing the decimal point means a value such as '72.5%' becomes 725;
# normalizeIt (applied in a later cell) trims such values back to two digits.
# NOTE: this cell is not re-runnable, because the column no longer holds
# strings after the cast.
df['Cocoa\nPercent'] = (
    df['Cocoa\nPercent']
    .str.replace('%', '', regex=False)
    .str.replace('.', '', regex=False)
    .astype(int)
)

In [None]:
# Wide bar chart of the cocoa-percent values as they stand at this point
# (before normalizeIt is applied), one brown bar per distinct value.
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(x='Cocoa\nPercent', data=df, color='brown', ax=ax)

In [None]:
def normalizeIt(percent):
    """Trim an over-range cocoa percentage down to its first two digits.

    Values that are not greater than 100 are returned unchanged. Anything
    above 100 is rendered as text and only its first two characters are kept
    and converted back to ``int`` (for example 725 -> 72, 101 -> 10).
    """
    return int(str(percent)[:2]) if percent > 100 else percent

In [None]:
# Apply normalizeIt element-wise so values above 100 are cut back to their
# first two digits; values of 100 or less pass through unchanged.
df['Cocoa\nPercent'] = df['Cocoa\nPercent'].apply(normalizeIt)

In [None]:
# Same bar chart as before, redrawn now that normalizeIt has been applied to
# the cocoa-percent column.
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(x='Cocoa\nPercent', data=df, color='brown', ax=ax)

In [None]:
# Rescale the ratings by 100 and store them as integers (e.g. 3.75 -> 375).
# round() comes before the cast because astype(int) truncates: a product such
# as 2.3 * 100 evaluates to 229.99999999999997 and would otherwise become 229.
# NOTE: not idempotent. Re-running this cell multiplies the column by 100 again.
df['Rating'] = (df['Rating'] * 100).round().astype(int)
df['Rating'].head(5)

In [None]:
# One-hot encode each categorical column into its own frame. drop_first=True
# leaves out the first level of every column.
# NOTE(review): no prefix is used, so a category value that occurs in more
# than one source column yields identically named dummy columns once the
# frames are concatenated.
company, sbOrigin, companyLocation, bType, bbOrigin = (
    pd.get_dummies(df[column_name], drop_first=True)
    for column_name in (
        'Company\xa0\n(Maker-if known)',
        'Specific Bean Origin\nor Bar Name',
        'Company\nLocation',
        'Bean\nType',
        'Broad Bean\nOrigin',
    )
)

In [None]:
# Append the five dummy frames to `df` column-wise (axis=1). The original
# text columns are still present after this step.
# NOTE: not idempotent. Re-running this cell appends the dummy columns again.
df = pd.concat([df, company, sbOrigin, companyLocation, bType, bbOrigin], axis = 1)

In [None]:
# Remove the five text columns that have been one-hot encoded.
df = df.drop(
    columns=[
        'Company\xa0\n(Maker-if known)',
        'Specific Bean Origin\nor Bar Name',
        'Company\nLocation',
        'Bean\nType',
        'Broad Bean\nOrigin',
    ]
)

In [None]:
# Remove duplicate columns: keep only the first column for every column name.
# columns.duplicated() flags the second and later occurrences of a name, and
# the ~ mask selects everything that is not flagged.
# NOTE(review): presumably the duplicates come from dummy columns that share a
# name across different source columns. Dropping the later ones discards their
# data, so confirm this is intended.

df = df.loc[:,~df.columns.duplicated()]


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target is the 'Rating' column; every remaining column is used as a feature.
y = df['Rating']
X = df.drop(columns='Rating')

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=7
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 200 trees, fitted on the training split.
# random_state is fixed (same seed as the train/test split) so the reported
# scores are reproducible after Restart & Run All; without it every run grows
# a different forest.
rfc = RandomForestClassifier(n_estimators=200, random_state=7)
rfc.fit(X_train, y_train)

In [None]:
# Peek at the first five values of the 'Venezuela' column.
# NOTE(review): presumably one of the one-hot columns created earlier. This is
# display only and has no effect on the model.
df['Venezuela'].head(5)

In [None]:
# Predict a rating class for every row of the held-out test split.
rfc_pred = rfc.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, accuracy_score
# Per-class precision / recall / F1 of the predictions against the true
# labels of the held-out split.
print(classification_report(y_test,rfc_pred))

In [None]:
# Overall accuracy on the held-out split, printed as a percentage.
print(accuracy_score(y_test,rfc_pred)*100)

In [None]:
# Class balance of the target: number of rows per distinct 'Rating' value
# (the x100 integer ratings when the notebook is run top to bottom).
sns.countplot(x = 'Rating', data=df)

In [None]:
def rating_to_stars(rating):
    """Bucket a x100 rating into a coarse star value.

    The input is first truncated with ``int()``. The resulting integer maps to:

    * 0          -> 0.0
    * 1 .. 199   -> 1.0
    * 200 .. 299 -> 2.0
    * 300 .. 399 -> 3.0
    * otherwise  -> 4.0 (400 and above, and also any negative value)
    """
    score = int(rating)

    if score == 0:
        return 0.0
    if 0 < score <= 199:
        return 1.0
    if 200 <= score <= 299:
        return 2.0
    if 300 <= score <= 399:
        return 3.0
    return 4.0

In [None]:
# Collapse the x100 ratings into the coarse star buckets of rating_to_stars.
# NOTE: not idempotent. Run a second time, the values 1.0-4.0 all fall in the
# 1..199 bucket and become 1.0.
df['Rating'] = df['Rating'].apply(rating_to_stars)

In [None]:
# Class balance of the target after bucketing into star values.
sns.countplot(x = 'Rating', data=df)

In [None]:
# Rebuild target and features now that 'Rating' holds the star buckets:
# the target is 'Rating', the features are all remaining columns.
y = df['Rating']
X = df.drop(columns='Rating')

In [None]:
# Re-split with the same 30% hold-out size and the same seed as the first
# split, now using the bucketed target.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=7)

In [None]:
# Larger forest (5000 trees) fitted on the bucketed star ratings.
# random_state is fixed (same seed as the split) so the scores below are
# reproducible. n_jobs=-1 builds the trees on all available cores, which
# only affects training time, not the fitted model.
rfc = RandomForestClassifier(
    n_estimators=5000,
    min_weight_fraction_leaf=0.0,
    random_state=7,
    n_jobs=-1,
)
rfc.fit(X_train, y_train)

In [None]:
# Predict star buckets for the held-out split and print per-class
# precision / recall / F1.
rfc_pred = rfc.predict(X_test)
print(classification_report(y_test,rfc_pred))

In [None]:
# Overall accuracy of the star-bucket model on the held-out split, as a percentage.
print(accuracy_score(y_test,rfc_pred)*100)