In [1]:
import pandas as pd

# Location of the raw campaign export.
DATA = '/kaggle/input/campaign-data/campaign_data.csv'

# Load the campaigns keyed by CampaignID and recode the two flag columns as
# booleans: VideoIncluded is True where the text is 'Yes', IsSuccessful is
# True where the stored value equals 1.
df = pd.read_csv(filepath_or_buffer=DATA, index_col=['CampaignID']).assign(
    VideoIncluded=lambda frame: frame['VideoIncluded'].eq('Yes'),
    IsSuccessful=lambda frame: frame['IsSuccessful'].eq(1),
)
df.head()

Unnamed: 0_level_0,GoalAmount,RaisedAmount,DurationDays,NumBackers,Category,LaunchMonth,Country,Currency,OwnerExperience,VideoIncluded,SocialMediaPresence,NumUpdates,IsSuccessful
CampaignID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
CID_00000,76159,73518.028955,15,1903,Film,September,Australia,GBP,11,True,41758,3,False
CID_00001,69394,60068.683288,49,469,Art,October,Australia,GBP,7,True,79112,48,False
CID_00002,18652,22958.15347,58,148,Film,February,USA,EUR,17,False,3942,9,True
CID_00003,19487,24229.04839,39,780,Film,October,Germany,AUD,10,False,22047,35,True
CID_00004,26734,17984.630311,68,1622,Games,September,Australia,AUD,12,True,72568,17,False


In [2]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, CID_00000 to CID_99999
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   GoalAmount           100000 non-null  int64  
 1   RaisedAmount         100000 non-null  float64
 2   DurationDays         100000 non-null  int64  
 3   NumBackers           100000 non-null  int64  
 4   Category             100000 non-null  object 
 5   LaunchMonth          100000 non-null  object 
 6   Country              100000 non-null  object 
 7   Currency             100000 non-null  object 
 8   OwnerExperience      100000 non-null  int64  
 9   VideoIncluded        100000 non-null  bool   
 10  SocialMediaPresence  100000 non-null  int64  
 11  NumUpdates           100000 non-null  int64  
 12  IsSuccessful         100000 non-null  bool   
dtypes: bool(2), float64(1), int64(6), object(4)
memory usage: 9.3+ MB


Is our target class balanced?

In [3]:
# Name of the label column; later cells refer to it through this constant.
TARGET = 'IsSuccessful'

# Fraction of rows in each class, shown as a plain dict.
class_shares = df[TARGET].value_counts(normalize=True)
class_shares.to_dict()

{True: 0.50094, False: 0.49906}

In [4]:
import arrow
from umap import UMAP

# Feature columns: every bool / float64 / int64 column except the label.
COLUMNS = [
    name
    for name, kind in df.dtypes.items()
    if name != TARGET and str(kind) in {'bool', 'float64', 'int64'}
]

# Embed the features with UMAP and store the two resulting coordinates on the
# frame as 'x' and 'y'; the wall-clock time is reported because this is slow.
time_start = arrow.now()
umap = UMAP(
    n_epochs=500,
    low_memory=False,
    n_jobs=1,
    verbose=True,
    random_state=2024,
)
embedding = umap.fit_transform(X=df[COLUMNS])
df[['x', 'y']] = embedding
print(f'done with UMAP in {arrow.now() - time_start}')

2024-07-23 19:00:46.999697: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-23 19:00:46.999846: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-23 19:00:47.156539: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Tue Jul 23 19:00:59 2024 Construct fuzzy simplicial set
Tue Jul 23 19:00:59 2024 Finding Nearest Neighbors
Tue Jul 23 19:00:59 2024 Building RP forest with 21 trees
Tue Jul 23 19:01:07 2024 NN descent for 17 iterations
	 1  /  17
	 2  /  17
	Stopping threshold met -- exiting after 2 iterations
Tue Jul 23 19:01:33 2024 Finished Nearest Neighbor Search
Tue Jul 23 19:01:39 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Tue Jul 23 19:05:15 2024 Finished embedding
done with UMAP in 0:04:16.353762


In [5]:
import warnings
from plotly import express

# Suppress FutureWarning messages before plotting.
warnings.filterwarnings(action='ignore', category=FutureWarning)

# Scatter a seeded 3000-row sample of the embedding, coloured and faceted by
# the label so the two classes can be compared side by side.
embedding_sample = df.sample(n=3000, random_state=2024)
express.scatter(
    data_frame=embedding_sample,
    x='x',
    y='y',
    color=TARGET,
    facet_col=TARGET,
)

It looks like there is a smallish region where successful campaigns can be distinguished from unsuccessful ones, and a much larger region where the two classes look randomly mixed. Let's build a model.

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    df[COLUMNS],
    df[TARGET],
    test_size=0.2,
    random_state=2024,
    stratify=df[TARGET],
)

# Fit the classifier and report how many solver iterations it needed.
model = LogisticRegression(max_iter=10000, tol=1e-3)
model.fit(X_train, y_train)
print(f'model fit in {model.n_iter_[0]} iterations')

# Accuracy on the held-out rows.
test_accuracy = accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))
print(f'accuracy: {test_accuracy:5.4f}')

model fit in 154 iterations
accuracy: 1.0000


In [7]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out rows.
test_predictions = model.predict(X=X_test)
report = classification_report(y_true=y_test, y_pred=test_predictions, zero_division=0)
print(report)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00      9981
        True       1.00      1.00      1.00     10019

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000



That's amazing, and a perfect score on held-out data is also suspicious. Which features are the most important?

In [8]:
from plotly import express

# Raw logistic-regression coefficients are expressed per unit of each feature,
# so columns measured on very different scales cannot be compared directly.
# Multiplying each coefficient by its feature's standard deviation on the
# training split expresses every effect per standard deviation instead.
# Indexing by COLUMNS keeps the order aligned with model.coef_, and the cast
# to float lets the boolean columns take part in the std computation.
feature_std = X_train[COLUMNS].astype(float).std().to_numpy()
scaled_coefficients = model.coef_[0] * feature_std

# One value per feature is a bar chart, not a histogram; label it so the
# figure can be read on its own.
express.bar(
    x=COLUMNS,
    y=scaled_coefficients,
    labels={'x': 'feature', 'y': 'coefficient × feature std'},
    title='Logistic regression coefficients scaled by feature standard deviation',
)

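Yeah, if we added a feature that was just the raised amount as a percentage of the goal amount, we probably wouldn't need any of the other features, would we? In the sample rows shown at the top, every campaign whose RaisedAmount exceeds its GoalAmount is successful and every other one is not, so the target appears to be determined by those two columns alone.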