In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
#from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import plot_tree
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score


RSEED=42

In [2]:
df = pd.read_csv('/Users/annelahann/neue-fische/kickstarter-ml-project/data/data_clean.csv')

In [3]:
df[['launched_at_day','launched_at_month']]=df[['launched_at_day','launched_at_month']].astype(object)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177522 entries, 0 to 177521
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   blurb              177514 non-null  object 
 1   country            177522 non-null  object 
 2   goal               177522 non-null  float64
 3   name               177522 non-null  object 
 4   state              177522 non-null  object 
 5   main_category      177522 non-null  object 
 6   sub_category       177522 non-null  object 
 7   location_type      177522 non-null  object 
 8   duration           177522 non-null  int64  
 9   deadline_month     177522 non-null  int64  
 10  deadline_day       177522 non-null  int64  
 11  launched_at_month  177522 non-null  object 
 12  launched_at_day    177522 non-null  object 
 13  target             177522 non-null  int64  
 14  baseline           177522 non-null  int64  
dtypes: float64(1), int64(5), object(9)
memory usage: 20

In [4]:
# Define predictors and target variable
X = df[['main_category','location_type','duration','goal','country','launched_at_month','launched_at_day']]
y = df['target']
print(f"We have {X.shape[0]} observations in our dataset and {X.shape[1]} features")
print(f"Our target vector has also {y.shape[0]} values")

We have 177522 observations in our dataset and 7 features
Our target vector has also 177522 values


In [5]:
# Creating list for categorical predictors/features 
# (dates are also objects so if you have them in your data you would deal with them first)
cat_features = list(X.columns[X.dtypes==object])
cat_features

['main_category',
 'location_type',
 'country',
 'launched_at_month',
 'launched_at_day']

In [6]:
# Creating list for numerical predictors/features
# Since 'Survived' is our target variable we will exclude this feature from this list of numerical predictors 
num_features = list(X.columns[X.dtypes!=object])
num_features

['duration', 'goal']

In [7]:
# Split into train and test set 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RSEED)

In [8]:
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (142017, 7)
X_test shape: (35505, 7)
y_train shape: (142017,)
y_test shape: (35505,)


In [9]:
X.describe()
X['goal'] = np.log(X['goal'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['goal'] = np.log(X['goal'])


In [10]:
from sklearn.pipeline import Pipeline

# Pipeline for numerical features
# Initiating Pipeline and calling one step after another
# each step is built as a list of (key, value)
# key is the name of the processing step
# value is an estimator object (processing step)
num_pipeline = Pipeline([
    ('imputer_num', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

# Pipeline for categorical features 
cat_pipeline = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
    ('1hot', OneHotEncoder(handle_unknown='ignore'))
])

In [11]:
from sklearn.compose import ColumnTransformer

# Complete pipeline for numerical and categorical features
# 'ColumnTransformer' applies transformers (num_pipeline/ cat_pipeline)
# to specific columns of an array or DataFrame (num_features/cat_features)
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

In [26]:
data_prep=preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])
print(data_prep)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer_num',
                                                  SimpleImputer(strategy='median')),
                                                 ('std_scaler',
                                                  StandardScaler())]),
                                 ['duration', 'goal']),
                                ('cat',
                                 Pipeline(steps=[('imputer_cat',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('1hot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['main_category', 'location_type', 'country',
                                  'launched_at_month', 'launched_at_day'])])


In [None]:
dectr1=DecisionTreeClassifier(max_depth=10)

In [19]:
# Building a full pipeline with our preprocessor and a LogisticRegression Classifier
pipe_dectr = Pipeline([
    ('preprocessor', preprocessor),
    ('dectr', DecisionTreeClassifier())
])
  


In [20]:
# Making predictions on the training set using cross validation as well as calculating the probabilities
# cross_val_predict expects an estimator (model), X, y and nr of cv-splits (cv)
y_train_predicted = cross_val_predict(pipe_dectr, X_train, y_train, cv=5)
y_test_predicted = cross_val_predict(pipe_dectr, X_test, y_test, cv=5)

In [21]:
# Calculating the accuracy for the LogisticRegression Classifier 
print('Cross validation scores:')
print('-------------------------')
print("Accuracy: {:.4f}".format(accuracy_score(y_train, y_train_predicted)))
print("Recall: {:.4f}".format(recall_score(y_train, y_train_predicted)))
print("Precision: {:.4f}".format(precision_score(y_train, y_train_predicted)))
print(confusion_matrix(y_train, y_train_predicted))

print("Accuracy: {:.4f}".format(accuracy_score(y_test, y_test_predicted)))
print("Recall: {:.4f}".format(recall_score(y_test, y_test_predicted)))
print("Precision: {:.4f}".format(precision_score(y_test, y_test_predicted)))
print(confusion_matrix(y_test, y_test_predicted))


Cross validation scores:
-------------------------
Accuracy: 0.5966
Recall: 0.6131
Precision: 0.6230
[[38389 28040]
 [29248 46340]]
Accuracy: 0.5912
Recall: 0.6104
Precision: 0.6150
[[ 9506  7188]
 [ 7328 11483]]


In [34]:
# Defining parameter space for grid-search. Since we want to access the classifier step (called 'logreg') in our pipeline 
# we have to add 'logreg__' in front of the corresponding hyperparameters. 
param_dectr = {'dectr__criterion':('gini','entropy'),
                'dectr__max_depth': [5, 10, 20],
                'dectr__max_features': ['auto', 'sqrt']
               }

grid_dectr = GridSearchCV(pipe_dectr, param_grid=param_dectr, cv=5, scoring='accuracy', 
                           verbose=5, n_jobs=-1)

In [35]:
grid_dectr.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




[CV 1/5] END dectr__criterion=gini, dectr__max_depth=5, dectr__max_features=auto;, score=0.556 total time=   0.3s
[CV 2/5] END dectr__criterion=gini, dectr__max_depth=5, dectr__max_features=auto;, score=0.578 total time=   0.3s
[CV 3/5] END dectr__criterion=gini, dectr__max_depth=5, dectr__max_features=auto;, score=0.588 total time=   0.3s
[CV 4/5] END dectr__criterion=gini, dectr__max_depth=5, dectr__max_features=auto;, score=0.631 total time=   0.3s
[CV 5/5] END dectr__criterion=gini, dectr__max_depth=5, dectr__max_features=auto;, score=0.628 total time=   0.3s
[CV 1/5] END dectr__criterion=gini, dectr__max_depth=5, dectr__max_features=sqrt;, score=0.573 total time=   0.3s
[CV 2/5] END dectr__criterion=gini, dectr__max_depth=5, dectr__max_features=sqrt;, score=0.609 total time=   0.3s
[CV 3/5] END dectr__criterion=gini, dectr__max_depth=5, dectr__max_features=sqrt;, score=0.570 total time=   0.3s
[CV 4/5] END dectr__criterion=gini, dectr__max_depth=5, dectr__max_features=sqrt;, score



[CV 5/5] END dectr__criterion=gini, dectr__max_depth=10, dectr__max_features=auto;, score=0.635 total time=   0.3s
[CV 4/5] END dectr__criterion=gini, dectr__max_depth=10, dectr__max_features=auto;, score=0.627 total time=   0.4s
[CV 1/5] END dectr__criterion=gini, dectr__max_depth=10, dectr__max_features=sqrt;, score=0.633 total time=   0.3s
[CV 2/5] END dectr__criterion=gini, dectr__max_depth=10, dectr__max_features=sqrt;, score=0.579 total time=   0.4s
[CV 3/5] END dectr__criterion=gini, dectr__max_depth=10, dectr__max_features=sqrt;, score=0.627 total time=   0.4s




[CV 4/5] END dectr__criterion=gini, dectr__max_depth=10, dectr__max_features=sqrt;, score=0.623 total time=   0.3s
[CV 5/5] END dectr__criterion=gini, dectr__max_depth=10, dectr__max_features=sqrt;, score=0.630 total time=   0.3s




[CV 3/5] END dectr__criterion=gini, dectr__max_depth=20, dectr__max_features=auto;, score=0.652 total time=   0.8s
[CV 2/5] END dectr__criterion=gini, dectr__max_depth=20, dectr__max_features=auto;, score=0.647 total time=   0.8s
[CV 4/5] END dectr__criterion=gini, dectr__max_depth=20, dectr__max_features=auto;, score=0.641 total time=   0.8s
[CV 1/5] END dectr__criterion=gini, dectr__max_depth=20, dectr__max_features=auto;, score=0.640 total time=   0.9s
[CV 1/5] END dectr__criterion=gini, dectr__max_depth=20, dectr__max_features=sqrt;, score=0.652 total time=   0.7s
[CV 5/5] END dectr__criterion=gini, dectr__max_depth=20, dectr__max_features=auto;, score=0.653 total time=   0.8s
[CV 3/5] END dectr__criterion=gini, dectr__max_depth=20, dectr__max_features=sqrt;, score=0.642 total time=   0.8s
[CV 2/5] END dectr__criterion=gini, dectr__max_depth=20, dectr__max_features=sqrt;, score=0.649 total time=   0.8s
[CV 1/5] END dectr__criterion=entropy, dectr__max_depth=5, dectr__max_features=a



[CV 3/5] END dectr__criterion=entropy, dectr__max_depth=5, dectr__max_features=auto;, score=0.570 total time=   0.3s
[CV 4/5] END dectr__criterion=entropy, dectr__max_depth=5, dectr__max_features=auto;, score=0.555 total time=   0.3s
[CV 5/5] END dectr__criterion=entropy, dectr__max_depth=5, dectr__max_features=auto;, score=0.637 total time=   0.3s




[CV 1/5] END dectr__criterion=entropy, dectr__max_depth=5, dectr__max_features=sqrt;, score=0.569 total time=   0.3s
[CV 3/5] END dectr__criterion=entropy, dectr__max_depth=5, dectr__max_features=sqrt;, score=0.576 total time=   0.3s
[CV 2/5] END dectr__criterion=entropy, dectr__max_depth=5, dectr__max_features=sqrt;, score=0.583 total time=   0.4s
[CV 4/5] END dectr__criterion=gini, dectr__max_depth=20, dectr__max_features=sqrt;, score=0.654 total time=   0.8s
[CV 5/5] END dectr__criterion=gini, dectr__max_depth=20, dectr__max_features=sqrt;, score=0.650 total time=   0.8s
[CV 4/5] END dectr__criterion=entropy, dectr__max_depth=5, dectr__max_features=sqrt;, score=0.625 total time=   0.3s
[CV 5/5] END dectr__criterion=entropy, dectr__max_depth=5, dectr__max_features=sqrt;, score=0.605 total time=   0.3s




[CV 2/5] END dectr__criterion=entropy, dectr__max_depth=10, dectr__max_features=auto;, score=0.626 total time=   0.3s
[CV 1/5] END dectr__criterion=entropy, dectr__max_depth=10, dectr__max_features=auto;, score=0.629 total time=   0.4s
[CV 3/5] END dectr__criterion=entropy, dectr__max_depth=10, dectr__max_features=auto;, score=0.624 total time=   0.3s
[CV 4/5] END dectr__criterion=entropy, dectr__max_depth=10, dectr__max_features=auto;, score=0.646 total time=   0.3s
[CV 5/5] END dectr__criterion=entropy, dectr__max_depth=10, dectr__max_features=auto;, score=0.604 total time=   0.3s
[CV 1/5] END dectr__criterion=entropy, dectr__max_depth=10, dectr__max_features=sqrt;, score=0.640 total time=   0.3s
[CV 2/5] END dectr__criterion=entropy, dectr__max_depth=10, dectr__max_features=sqrt;, score=0.642 total time=   0.4s
[CV 3/5] END dectr__criterion=entropy, dectr__max_depth=10, dectr__max_features=sqrt;, score=0.638 total time=   0.4s
[CV 4/5] END dectr__criterion=entropy, dectr__max_depth=



[CV 1/5] END dectr__criterion=entropy, dectr__max_depth=20, dectr__max_features=auto;, score=0.640 total time=   0.7s
[CV 3/5] END dectr__criterion=entropy, dectr__max_depth=20, dectr__max_features=auto;, score=0.651 total time=   0.7s
[CV 2/5] END dectr__criterion=entropy, dectr__max_depth=20, dectr__max_features=auto;, score=0.648 total time=   0.7s
[CV 5/5] END dectr__criterion=entropy, dectr__max_depth=20, dectr__max_features=auto;, score=0.654 total time=   0.8s
[CV 2/5] END dectr__criterion=entropy, dectr__max_depth=20, dectr__max_features=sqrt;, score=0.658 total time=   0.6s
[CV 4/5] END dectr__criterion=entropy, dectr__max_depth=20, dectr__max_features=auto;, score=0.643 total time=   0.8s
[CV 1/5] END dectr__criterion=entropy, dectr__max_depth=20, dectr__max_features=sqrt;, score=0.653 total time=   0.8s
[CV 3/5] END dectr__criterion=entropy, dectr__max_depth=20, dectr__max_features=sqrt;, score=0.650 total time=   0.6s
[CV 4/5] END dectr__criterion=entropy, dectr__max_depth=

In [36]:
# Show best parameters
print('Best score:\n{:.2f}'.format(grid_dectr.best_score_))
print("Best parameters:\n{}".format(grid_dectr.best_params_))


Best score:
0.65
Best parameters:
{'dectr__criterion': 'entropy', 'dectr__max_depth': 20, 'dectr__max_features': 'sqrt'}


In [41]:
# Save best model (including fitted preprocessing steps) as best_model 
best_model = grid_dectr.best_estimator_
best_model
if 'dectr' in best_model.named_steps:
    feature_importance=best_model.named_steps['dectr'].feature_importances_
    print("Feature Importance: ", feature_importance)
else:
    print("best model is not a decision tree")

Feature Importance:  [1.83836348e-01 2.55559531e-01 4.68250655e-03 1.52170762e-03
 5.60138019e-03 2.73087413e-02 2.15929657e-03 5.42765886e-03
 1.59433789e-02 6.04273105e-02 1.71127142e-02 1.51511114e-02
 4.16303220e-02 1.91446699e-02 5.43592387e-02 3.03543994e-02
 2.16950979e-03 9.25002938e-05 7.64210638e-03 0.00000000e+00
 5.89066516e-04 1.69728026e-03 4.27146581e-05 4.18754026e-03
 6.39027548e-03 1.12230263e-03 2.03285740e-03 7.97912454e-04
 2.12352749e-03 5.74946905e-04 5.31897512e-03 8.76870381e-04
 5.40796460e-03 1.46896084e-03 2.98475983e-03 2.23351064e-03
 9.00814896e-03 2.66896525e-03 8.29702269e-04 3.76322499e-03
 1.49128207e-03 3.56721082e-04 3.31476592e-03 1.54081978e-03
 6.70247512e-04 9.16188852e-04 1.86782637e-03 6.41992460e-04
 2.01066816e-02 5.90107639e-03 5.13964866e-03 5.89680129e-03
 5.82603359e-03 5.53112265e-03 5.09092520e-03 9.89284243e-03
 5.74830684e-03 5.44601189e-03 5.48962817e-03 6.01572988e-03
 6.41070878e-03 5.41585777e-03 3.42131307e-03 3.71038438e-03
 2.

In [None]:
index_feat=range(1,len(feature_importance)+1)


In [39]:
#Calculating the accuracy, recall and precision for the test set with the optimized model
y_test_predicted = best_model.predict(X_test)
y_train_predicted = best_model.predict(X_train)

print("Accuracy: {:.2f}".format(accuracy_score(y_test, y_test_predicted)))
print("Recall: {:.2f}".format(recall_score(y_test, y_test_predicted)))
print("Precision: {:.2f}".format(precision_score(y_test, y_test_predicted)))
print(confusion_matrix(y_test, y_test_predicted))

print("Accuracy: {:.2f}".format(accuracy_score(y_train, y_train_predicted)))
print("Recall: {:.2f}".format(recall_score(y_train, y_train_predicted)))
print("Precision: {:.2f}".format(precision_score(y_train, y_train_predicted)))
print(confusion_matrix(y_train, y_train_predicted))

AttributeError: 'Pipeline' object has no attribute 'feature_importance_'