In [26]:
# Import everything

import sys
import pandas as pd
import numpy as np


import sklearn
from sklearn.model_selection import train_test_split
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

import xgboost
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import pickle
import joblib


In [27]:
df = pd.read_csv('assets/Kickstarter_FinalCleaned.csv')
df.head()

Unnamed: 0,id,backers_count,category,pledged,state,blurb_length,goal_in_usd,campaign_duration,sub_category
0,0,128,publishing,4718.0,1,17.0,5770.03,40,zines
1,1,0,publishing,0.0,0,22.0,3804.7,30,zines
2,2,1,publishing,25.0,0,20.0,1705.15,30,zines
3,3,2,publishing,120.0,0,19.0,5371.42,60,zines
4,4,0,publishing,0.0,0,16.0,9.15,30,zines


In [29]:
df.columns

Index(['id', 'backers_count', 'category', 'pledged', 'state', 'blurb_length',
       'goal_in_usd', 'campaign_duration', 'sub_category'],
      dtype='object')

In [30]:
df.shape

(8317, 9)

In [31]:
# Extracting the target and feature matrix
target = 'state'
y = df[target]
X = df.drop(columns=target)

print(X.shape)
print(y.shape)

(8317, 8)
(8317,)


In [32]:
# Splitting into train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .4)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(4990, 8) (3327, 8)
(4990,) (3327,)


In [33]:
#Baseline

print('baseline accuracy', y.value_counts(normalize=True).max())


baseline accuracy 0.6059877359624865


In [34]:
#  Random Forest

model_rf = make_pipeline(OrdinalEncoder(),
                       SimpleImputer(strategy="mean"),
                       RandomForestClassifier( n_jobs=-1, random_state=42))

In [35]:
# Decision Tree
model_dt = make_pipeline(OrdinalEncoder(),
                      SimpleImputer(strategy="mean"),
                      DecisionTreeClassifier(random_state=42))

In [36]:
# XGBoost

model_xgb = make_pipeline(OrdinalEncoder(),
                       SimpleImputer(strategy="mean"),
                       XGBClassifier(random_state=42))

In [37]:
# Gradient Boost

model_gb = make_pipeline(OrdinalEncoder(),
                       SimpleImputer(strategy="mean"),
                       GradientBoostingClassifier(random_state=42))

In [38]:
model_rf.fit(X_train,y_train)
model_dt.fit(X_train,y_train)
model_xgb.fit(X_train,y_train)
model_gb.fit(X_train,y_train)



Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['category', 'sub_category'],
                                mapping=[{'col': 'category',
                                          'data_type': dtype('O'),
                                          'mapping': art              1
film & video     2
fashion          3
publishing       4
technology       5
crafts           6
design           7
food             8
theater          9
photography     10
comics          11
music           12
dance           13
NaN             -2
dtype: int64},
                                         {'col': 'sub_category',
                                          'data_type': dtype('O'),
                                          'mapping': illustration          1
action                2
accessories           3
radio & podcasts      4
hardware              5
                   ... 
literary spaces     119
translations        120
letterpress         121
quilts              122
NaN        

In [39]:
#Check Metrics on training
print('model_dt accuracy score', accuracy_score(y_train, model_dt.predict(X_train)))
print('model_rf accuracy score', accuracy_score(y_train, model_rf.predict(X_train)))
print('model_xgb accuracy score', accuracy_score(y_train, model_xgb.predict(X_train)))
print('model_gb accuracy score', accuracy_score(y_train, model_gb.predict(X_train)))

model_dt accuracy score 1.0
model_rf accuracy score 1.0
model_xgb accuracy score 1.0
model_gb accuracy score 0.9875751503006012


In [40]:
# Metrics with test data
# print('model_dt accuracy score', accuracy_score(y_test, model_dt.predict(X_test)))
# print('model_rf accuracy score', accuracy_score(y_test, model_rf.predict(X_test)))
# print('model_xgb accuracy score', accuracy_score(y_test, model_xgb.predict(X_test)))
# print('model_gb accuracy score', accuracy_score(y_test, model_gb.predict(X_test)))

In [41]:
# saving models using pickle
saved_model_rf = pickle.dumps(model_rf)
saved_model_xgb = pickle.dumps(model_xgb)


In [42]:

joblib_file = "joblib_RF_Model.pkl"  
joblib.dump(model_rf, 'assets/model_rf')


['assets/model_rf']

In [43]:
joblib_file = "joblib_XGB_Model.pkl"  
joblib.dump(model_xgb, 'assets/model_xgb')

['assets/model_xgb']

In [44]:
#Testing if model saved and working correctly
# # Load from file
# load_xgb_model = joblib.load('assets/model_xgb')
# load_xgb_model



In [45]:
# # Use the Reloaded Joblib Model to 
# # Calculate the accuracy score and predict target values

# # Calculate the Score 
# score = load_xgb_model.score(X_test, y_test)  
# # # Print the Score
# print("Test score: {0:.2f} %".format(100 * score))  

# # # Predict the Labels using the reloaded Model
# Ypredict = load_xgb_model.predict(X_test)  

# Ypredict

In [46]:
X_train.columns

Index(['id', 'backers_count', 'category', 'pledged', 'blurb_length',
       'goal_in_usd', 'campaign_duration', 'sub_category'],
      dtype='object')

In [47]:
X_train.category.value_counts()

film & video    878
art             719
technology      592
comics          482
fashion         431
crafts          342
publishing      333
design          322
food            318
theater         203
music           133
dance           129
photography     108
Name: category, dtype: int64