In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
#from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score


RSEED=42

In [4]:
df = pd.read_csv('/Users/annelahann/neue-fische/kickstarter-ml-project/data/data_clean.csv')

In [15]:
df[['launched_at_day','launched_at_month']]=df[['launched_at_day','launched_at_month']].astype(object)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177522 entries, 0 to 177521
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   blurb              177514 non-null  object 
 1   country            177522 non-null  object 
 2   goal               177522 non-null  float64
 3   name               177522 non-null  object 
 4   state              177522 non-null  object 
 5   main_category      177522 non-null  object 
 6   sub_category       177522 non-null  object 
 7   location_type      177522 non-null  object 
 8   duration           177522 non-null  int64  
 9   deadline_month     177522 non-null  int64  
 10  deadline_day       177522 non-null  int64  
 11  launched_at_month  177522 non-null  object 
 12  launched_at_day    177522 non-null  object 
 13  target             177522 non-null  int64  
 14  baseline           177522 non-null  int64  
dtypes: float64(1), int64(5), object(9)
memory usage: 20

In [16]:
# Define predictors and target variable
X = df[['main_category','location_type','duration','goal','country','launched_at_month','launched_at_day']]
y = df['target']
print(f"We have {X.shape[0]} observations in our dataset and {X.shape[1]} features")
print(f"Our target vector has also {y.shape[0]} values")

We have 177522 observations in our dataset and 7 features
Our target vector has also 177522 values


In [17]:
# Creating list for categorical predictors/features 
# (dates are also objects so if you have them in your data you would deal with them first)
cat_features = list(X.columns[X.dtypes==object])
cat_features

['main_category',
 'location_type',
 'country',
 'launched_at_month',
 'launched_at_day']

In [18]:
# Creating list for numerical predictors/features
# Since 'Survived' is our target variable we will exclude this feature from this list of numerical predictors 
num_features = list(X.columns[X.dtypes!=object])
num_features

['duration', 'goal']

In [19]:
# Split into train and test set 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RSEED)

In [20]:
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (142017, 7)
X_test shape: (35505, 7)
y_train shape: (142017,)
y_test shape: (35505,)


In [25]:
X.describe()
X['goal'] = np.log(X['goal'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['goal'] = np.log(X['goal'])


In [29]:
from sklearn.pipeline import Pipeline

# Pipeline for numerical features
# Initiating Pipeline and calling one step after another
# each step is built as a list of (key, value)
# key is the name of the processing step
# value is an estimator object (processing step)
num_pipeline = Pipeline([
    ('imputer_num', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

# Pipeline for categorical features 
cat_pipeline = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
    ('1hot', OneHotEncoder(handle_unknown='ignore'))
])

In [31]:
from sklearn.compose import ColumnTransformer

# Complete pipeline for numerical and categorical features
# 'ColumnTransformer' applies transformers (num_pipeline/ cat_pipeline)
# to specific columns of an array or DataFrame (num_features/cat_features)
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

In [32]:
# Building a full pipeline with our preprocessor and a LogisticRegression Classifier
pipe_logreg = Pipeline([
    ('preprocessor', preprocessor),
    ('logreg', LogisticRegression(max_iter=1000))
])

In [52]:
# Making predictions on the training set using cross validation as well as calculating the probabilities
# cross_val_predict expects an estimator (model), X, y and nr of cv-splits (cv)
y_train_predicted = cross_val_predict(pipe_logreg, X_train, y_train, cv=5)
y_test_predicted = cross_val_predict(pipe_logreg, X_test, y_test, cv=5)

In [54]:
# Calculating the accuracy for the LogisticRegression Classifier 
print('Cross validation scores:')
print('-------------------------')
print("Accuracy: {:.4f}".format(accuracy_score(y_train, y_train_predicted)))
print("Recall: {:.4f}".format(recall_score(y_train, y_train_predicted)))
print("Precision: {:.4f}".format(precision_score(y_train, y_train_predicted)))
print(confusion_matrix(y_train, y_train_predicted))

print("Accuracy: {:.4f}".format(accuracy_score(y_test, y_test_predicted)))
print("Recall: {:.4f}".format(recall_score(y_test, y_test_predicted)))
print("Precision: {:.4f}".format(precision_score(y_test, y_test_predicted)))
print(confusion_matrix(y_test, y_test_predicted))


Cross validation scores:
-------------------------
Accuracy: 0.6551
Recall: 0.7649
Precision: 0.6494
[[35214 31215]
 [17770 57818]]
Accuracy: 0.6566
Recall: 0.7668
Precision: 0.6489
[[ 8889  7805]
 [ 4386 14425]]


In [44]:
# Defining parameter space for grid-search. Since we want to access the classifier step (called 'logreg') in our pipeline 
# we have to add 'logreg__' in front of the corresponding hyperparameters. 
param_logreg = {'logreg__penalty':('l1','l2'),
                'logreg__C': [0.001, 0.01, 0.1, 1, 10],
                'logreg__solver': ['liblinear', 'lbfgs', 'sag']
               }

grid_logreg = GridSearchCV(pipe_logreg, param_grid=param_logreg, cv=5, scoring='recall', 
                           verbose=5, n_jobs=-1)

In [45]:
grid_logreg.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END logreg__C=0.001, logreg__penalty=l1, logreg__solver=liblinear;, score=0.789 total time=   0.5s
[CV 1/5] END logreg__C=0.001, logreg__penalty=l1, logreg__solver=lbfgs;, score=nan total time=   0.4s
[CV 2/5] END logreg__C=0.001, logreg__penalty=l1, logreg__solver=lbfgs;, score=nan total time=   0.4s
[CV 2/5] END logreg__C=0.001, logreg__penalty=l1, logreg__solver=liblinear;, score=0.777 total time=   0.6s
[CV 3/5] END logreg__C=0.001, logreg__penalty=l1, logreg__solver=lbfgs;, score=nan total time=   0.4s
[CV 5/5] END logreg__C=0.001, logreg__penalty=l1, logreg__solver=liblinear;, score=0.789 total time=   0.6s
[CV 4/5] END logreg__C=0.001, logreg__penalty=l1, logreg__solver=liblinear;, score=0.783 total time=   0.6s
[CV 3/5] END logreg__C=0.001, logreg__penalty=l1, logreg__solver=liblinear;, score=0.786 total time=   0.6s
[CV 4/5] END logreg__C=0.001, logreg__penalty=l1, logreg__solver=lbfgs;, score=nan total tim



[CV 1/5] END logreg__C=10, logreg__penalty=l2, logreg__solver=sag;, score=0.767 total time=  46.6s




[CV 3/5] END logreg__C=10, logreg__penalty=l2, logreg__solver=sag;, score=0.762 total time=  45.3s




[CV 2/5] END logreg__C=10, logreg__penalty=l2, logreg__solver=sag;, score=0.760 total time=  45.9s




[CV 4/5] END logreg__C=10, logreg__penalty=l2, logreg__solver=sag;, score=0.770 total time=  33.6s
[CV 5/5] END logreg__C=10, logreg__penalty=l2, logreg__solver=sag;, score=0.766 total time=  28.0s


50 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/annelahann/neue-fische/kickstarter-ml-project/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/annelahann/neue-fische/kickstarter-ml-project/.venv/lib/python3.11/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/annelahann/neue-fische/kickstarter-ml-project/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solv

In [50]:
# Show best parameters
print('Best score:\n{:.2f}'.format(grid_logreg.best_score_))
print("Best parameters:\n{}".format(grid_logreg.best_params_))


Best score:
0.78
Best parameters:
{'logreg__C': 0.001, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear'}
Cross validation scores:
-------------------------
Accuracy: 0.66
Recall: 0.76
Precision: 0.65
[[35214 31215]
 [17770 57818]]


In [47]:
# Save best model (including fitted preprocessing steps) as best_model 
best_model = grid_logreg.best_estimator_
best_model

In [51]:
#Calculating the accuracy, recall and precision for the test set with the optimized model
y_test_predicted = best_model.predict(X_test)
y_train_predicted = best_model.predict(X_train)

print("Accuracy: {:.2f}".format(accuracy_score(y_test, y_test_predicted)))
print("Recall: {:.2f}".format(recall_score(y_test, y_test_predicted)))
print("Precision: {:.2f}".format(precision_score(y_test, y_test_predicted)))
print(confusion_matrix(y_test, y_test_predicted))

print("Accuracy: {:.2f}".format(accuracy_score(y_train, y_train_predicted)))
print("Recall: {:.2f}".format(recall_score(y_train, y_train_predicted)))
print("Precision: {:.2f}".format(precision_score(y_train, y_train_predicted)))
print(confusion_matrix(y_train, y_train_predicted))

Accuracy: 0.62
Recall: 0.79
Precision: 0.61
[[ 7163  9531]
 [ 3871 14940]]
Accuracy: 0.62
Recall: 0.79
Precision: 0.61
[[28486 37943]
 [15890 59698]]
