In [1]:
# Import everything

import sys
import pandas as pd
import numpy as np


import sklearn
from sklearn.model_selection import train_test_split
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

import xgboost
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import pickle
import joblib


In [2]:
# Read the cleaned Kickstarter dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='assets/Kickstarter_FinalCleaned.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,backers_count,category,pledged,state,blurb_length,goal_in_usd,campaign_duration,sub_category
0,0,128,publishing,4718.0,1,17.0,5770.03,40,zines
1,1,0,publishing,0.0,0,22.0,3804.7,30,zines
2,2,1,publishing,25.0,0,20.0,1705.15,30,zines
3,3,2,publishing,120.0,0,19.0,5371.42,60,zines
4,4,0,publishing,0.0,0,16.0,9.15,30,zines


In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['id', 'backers_count', 'category', 'pledged', 'state', 'blurb_length',
       'goal_in_usd', 'campaign_duration', 'sub_category'],
      dtype='object')

In [4]:
# Drop the `id` column so it is not carried into the modelling frame.
# errors='ignore' keeps this cell idempotent: because `df` is reassigned from
# itself, re-running the cell after `id` is already gone would otherwise raise
# KeyError.
df = df.drop(columns='id', errors='ignore')

In [5]:
# Show (rows, columns) of the current frame.
df.shape

(8317, 8)

In [6]:
# Separate the label column (`state`) from the predictor columns.
# NOTE(review): `pledged` and `backers_count` look like values that are only
# known once a campaign has run; if so they leak the outcome into the features.
# Confirm they are available at prediction time.
target = 'state'
X, y = df.drop(columns=target), df[target]

# Report the shape of the feature matrix, then of the target vector.
print(X.shape, y.shape, sep='\n')

(8317, 7)
(8317,)


In [7]:
# Hold out 40% of the rows as a test set.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same fitted models and scores); 42 matches the seed
# passed to the estimators in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(4990, 7) (3327, 7)
(4990,) (3327,)


In [8]:
# Baseline: the share of the most frequent class, i.e. the accuracy obtained by
# always predicting that class. Computed over all of `y`, not only the training
# split.
majority_share = y.value_counts(normalize=True).max()
print('baseline accuracy', majority_share)


baseline accuracy 0.6059877359624865


In [9]:
# Random forest pipeline: encode categorical columns as integers, fill missing
# values with the column mean, then fit the forest (n_jobs=-1 requests all cores).
model_rf = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    RandomForestClassifier(random_state=42, n_jobs=-1),
)

In [10]:
# Decision tree pipeline: encode categorical columns as integers, fill missing
# values with the column mean, then fit a single seeded tree.
model_dt = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    DecisionTreeClassifier(random_state=42),
)

In [11]:
# XGBoost pipeline: encode categorical columns as integers, fill missing values
# with the column mean, then fit a seeded XGBClassifier.
model_xgb = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    XGBClassifier(random_state=42),
)

In [12]:
# Gradient boosting pipeline: encode categorical columns as integers, fill
# missing values with the column mean, then fit a seeded
# GradientBoostingClassifier.
model_gb = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    GradientBoostingClassifier(random_state=42),
)

In [13]:
# Fit every pipeline on the training split, in the order rf, dt, xgb, gb.
for _pipe in (model_rf, model_dt, model_xgb):
    _pipe.fit(X_train, y_train)
# Kept as the final expression so the cell still displays the fitted
# gradient-boosting pipeline.
model_gb.fit(X_train, y_train)



Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['category', 'sub_category'],
                                mapping=[{'col': 'category',
                                          'data_type': dtype('O'),
                                          'mapping': art              1
dance            2
fashion          3
film & video     4
publishing       5
comics           6
photography      7
design           8
food             9
crafts          10
technology      11
music           12
theater         13
NaN             -2
dtype: int64},
                                         {'col': 'sub_category',
                                          'data_type': dtype('O'),
                                          'mapping': painting          1
performances      2
accessories       3
thrillers         4
fiction           5
               ... 
calendars       120
quilts          121
translations    122
letterpress     123
NaN              -2
Length: 124, dtype: int64}])),


In [14]:
# Accuracy of each fitted pipeline on the same data it was trained on.
# Insertion order of the dict fixes the print order: dt, rf, xgb, gb.
_fitted = {
    'model_dt': model_dt,
    'model_rf': model_rf,
    'model_xgb': model_xgb,
    'model_gb': model_gb,
}
for _name, _pipe in _fitted.items():
    print(f'{_name} accuracy score', accuracy_score(y_train, _pipe.predict(X_train)))

model_dt accuracy score 1.0
model_rf accuracy score 1.0
model_xgb accuracy score 1.0
model_gb accuracy score 0.9889779559118237


In [15]:
# Metrics with test data
# Score every fitted pipeline on the held-out split. Training accuracy alone
# cannot show how the models generalise, so this evaluation is run before the
# models are saved.
print('model_dt accuracy score', accuracy_score(y_test, model_dt.predict(X_test)))
print('model_rf accuracy score', accuracy_score(y_test, model_rf.predict(X_test)))
print('model_xgb accuracy score', accuracy_score(y_test, model_xgb.predict(X_test)))
print('model_gb accuracy score', accuracy_score(y_test, model_gb.predict(X_test)))

In [16]:
# Serialize the random-forest and XGBoost pipelines to in-memory byte strings.
# pickle.dumps returns bytes; nothing is written to disk in this cell.
saved_model_rf, saved_model_xgb = (
    pickle.dumps(fitted) for fitted in (model_rf, model_xgb)
)


In [17]:

# Persist the fitted random-forest pipeline to disk with joblib.
# NOTE(review): `joblib_file` is assigned but not passed to the dump call below,
# which writes to its own hard-coded path.
joblib_file = "joblib_RF_Model.pkl"
joblib.dump(value=model_rf, filename='assets/model_rf')


['assets/model_rf']

In [18]:
# Persist the fitted XGBoost pipeline to disk with joblib.
# NOTE(review): `joblib_file` is assigned but not passed to the dump call below,
# which writes to its own hard-coded path.
joblib_file = "joblib_XGB_Model.pkl"
joblib.dump(value=model_xgb, filename='assets/model_xgb')

['assets/model_xgb']

In [19]:
# Test that the model was saved and loads correctly
# # Load from file
# load_xgb_model = joblib.load('assets/model_xgb')
# load_xgb_model



In [20]:
# # Use the reloaded joblib model to
# # calculate the accuracy score and predict target values

# # Calculate the Score 
# score = load_xgb_model.score(X_test, y_test)  
# # # Print the Score
# print("Test score: {0:.2f} %".format(100 * score))  

# # # Predict the Labels using the reloaded Model
# Ypredict = load_xgb_model.predict(X_test)  

# Ypredict

In [21]:
# Show the feature column labels of the training split.
X_train.columns

Index(['backers_count', 'category', 'pledged', 'blurb_length', 'goal_in_usd',
       'campaign_duration', 'sub_category'],
      dtype='object')

In [22]:
# List the distinct sub-category values present in the training split.
X_train['sub_category'].unique()

array(['painting', 'performances', 'accessories', 'thrillers', 'fiction',
       'comic books', 'places', 'comedy', 'graphic design',
       'civic design', 'documentary', 'drinks', 'graphic novels',
       'webseries', 'anthologies', 'woodworking', 'illustration',
       'footwear', 'performance art', 'software', 'world music', 'sound',
       'animation', 'farms', 'wearables', 'photobooks', 'gadgets',
       'fine art', 'product design', 'restaurants', 'plays',
       'architecture', 'digital art', 'sculpture', 'jewelry', 'textiles',
       '3d printing', 'public art', "children's books", 'apparel',
       'science fiction', 'pop', 'drama', 'cookbooks', 'narrative film',
       'typography', 'nonfiction', 'web', 'r&b', 'webcomics', 'shorts',
       'childrenswear', 'events', 'musical', 'installations', 'toys',
       'fantasy', 'zines', 'interactive design', 'action', 'experimental',
       'horror', 'robots', 'conceptual art', "farmer's markets", 'diy',
       'punk', 'festivals', '

In [23]:
# Largest pledged amount in the training split.
X_train['pledged'].max()

9286475.0

In [24]:
# Show the feature column labels of the training split again (same expression
# as an earlier cell).
X_train.columns

Index(['backers_count', 'category', 'pledged', 'blurb_length', 'goal_in_usd',
       'campaign_duration', 'sub_category'],
      dtype='object')