In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Configuraciones generales de Pandas
pd.options.display.float_format = '{:.4f}'.format
pd.set_option("display.max_rows", 30)
pd.set_option("display.max_columns", 30)
pd.set_option('display.latex.repr', True)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.width', None)

In [3]:
df_train = pd.read_csv('../../datasets/z_train_mini.csv', nrows=10000)
df_target = pd.read_csv('../../datasets/z_test_mini.csv', nrows=10000)
# df_train = pd.read_csv('../../datasets/z_train_mini.csv')
# df_target = pd.read_csv('../../datasets/z_test_mini.csv')

In [4]:
# Explorando las columnas
df_train.state.value_counts()

failed        5070
successful    3446
canceled      1218
undefined      115
live           102
suspended       49
Name: state, dtype: int64

In [5]:
# el 52% han sido fallidos y el 35% exitoso, las otras clasificaciones no aportan al modelo por lo que las eliminaremos
df_classify = df_train[df_train.state.isin(['failed','successful'])]
df_classify.state.value_counts(normalize=True)

failed       0.5953
successful   0.4047
Name: state, dtype: float64

In [6]:
# Revisamos el balance de la variable dependiente
total_rows_0, total_rows_1 = df_classify.state.value_counts()
print(total_rows_0)
print(total_rows_1)

5070
3446


In [7]:
# Dividimos el dataset 2 para cada tipo de datos
df_class_0 = df_classify[df_classify.state == 'failed']
df_class_1 = df_classify[df_classify.state == 'successful']
print(df_class_0.state.count())
print(df_class_1.state.count())

5070
3446


In [8]:
# creamos un dataset balanceado utilizando undersample
df_sample_0 = df_class_0.sample(total_rows_1)
print('inicial: ', df_sample_0.state.count())
df_sample = pd.concat([df_sample_0, df_class_1], axis=0)
print('final: ', df_sample.state.count())
df_classify = df_sample.copy()

inicial:  3446
final:  6892


In [9]:
# veamos la distribución de la duración del proyecto tanto para proyectos exitos como fallidos
df_classify = df_classify.copy()
df_classify['launched_date'] = pd.to_datetime(df_classify.launched)
df_classify['deadline_date'] = pd.to_datetime(df_classify.deadline)
df_classify['duration'] = (df_classify.deadline_date - df_classify.launched_date).dt.days
df_classify

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,launched_date,deadline_date,duration
7830,1048012191,Maid in Dubai,Fiction,Publishing,GBP,2013-06-19,7000.0000,2013-05-20 20:18:37,508.0000,failed,13,GB,771.2600,784.6500,10812.1500,2013-05-20 20:18:37,2013-06-19,29
4571,1028428265,Women and Meds,Documentary,Film & Video,USD,2013-09-18,25000.0000,2013-08-19 15:52:21,3900.0000,failed,68,US,3900.0000,3900.0000,25000.0000,2013-08-19 15:52:21,2013-09-18,29
1777,1011224953,Journey to Dance Gallery Festival,Performances,Dance,USD,2014-09-05,2500.0000,2014-08-22 17:14:56,1256.0000,failed,12,US,1256.0000,1256.0000,2500.0000,2014-08-22 17:14:56,2014-09-05,13
2715,1016968400,Diva of Spiritual Voice/Hidden jewel from !NEW...,World Music,Music,USD,2012-07-18,80000.0000,2012-06-18 21:58:50,100.0000,failed,2,US,100.0000,100.0000,80000.0000,2012-06-18 21:58:50,2012-07-18,29
4036,1025057240,Our Single Lives .Com - The Webseries,Webseries,Film & Video,USD,2013-02-01,20000.0000,2012-12-14 22:34:26,3040.0000,failed,43,US,3040.0000,3040.0000,20000.0000,2012-12-14 22:34:26,2013-02-01,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9984,1060942332,"ALLEY ART: A Tour, A Showcase, & An Art Book!",Public Art,Art,USD,2014-05-26,9500.0000,2014-04-26 01:04:33,10051.3800,successful,47,US,10051.3800,10051.3800,9500.0000,2014-04-26 01:04:33,2014-05-26,29
9985,1060946644,"""Not Mine"" CD Release",World Music,Music,USD,2013-12-08,2500.0000,2013-11-08 01:40:24,2748.0000,successful,64,US,2748.0000,2748.0000,2500.0000,2013-11-08 01:40:24,2013-12-08,29
9991,1060968336,"LGBT Detroit: The Renovation of a Safe, Brave ...",Civic Design,Design,USD,2016-07-03,10000.0000,2016-06-06 20:29:07,11290.0000,successful,73,US,11290.0000,11290.0000,10000.0000,2016-06-06 20:29:07,2016-07-03,26
9994,1060982909,PrayerMate for Android,Technology,Technology,GBP,2013-11-27,2500.0000,2013-10-28 22:13:33,2765.0000,successful,92,GB,4470.6300,4519.3000,4086.1700,2013-10-28 22:13:33,2013-11-27,29


In [10]:
# ----------------------------------------------------------------------------------------------
# veamos la distribución de la duración del proyecto tanto para proyectos exitos como fallidos
# ----------------------------------------------------------------------------------------------
df_target['launched_date'] = pd.to_datetime(df_target.launched)
df_target['deadline_date'] = pd.to_datetime(df_target.deadline)
df_target['duration'] = (df_target.deadline_date - df_target.launched_date).dt.days
df_target

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real,launched_date,deadline_date,duration
0,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0000,2012-03-17 03:24:11,1.0000,1,US,1.0000,1.0000,5000.0000,2012-03-17 03:24:11,2012-04-16,29
1,100004721,Of Jesus and Madmen,Nonfiction,Publishing,CAD,2013-10-09,2500.0000,2013-09-09 18:19:37,0.0000,0,CA,0.0000,0.0000,2406.3900,2013-09-09 18:19:37,2013-10-09,29
2,1000071625,Boco Tea,Food,Food,USD,2012-06-02,5000.0000,2012-05-03 17:24:32,1781.0000,40,US,1781.0000,1781.0000,5000.0000,2012-05-03 17:24:32,2012-06-02,29
3,1000102741,Matt Cavenaugh & Jenny Powers make their 1st a...,Music,Music,USD,2011-01-06,10000.0000,2010-12-07 23:16:50,15827.0000,147,US,15827.0000,15827.0000,10000.0000,2010-12-07 23:16:50,2011-01-06,29
4,1000103948,Superhero Teddy Bear,DIY,Crafts,GBP,2016-01-05,12000.0000,2015-12-06 20:09:06,0.0000,0,GB,0.0000,0.0000,17489.6500,2015-12-06 20:09:06,2016-01-05,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1284819163,"""Grief Sleeps"" - Short Film Inspired by True E...",Narrative Film,Film & Video,USD,2014-07-09,14788.0000,2014-06-06 04:26:14,1445.0000,32,US,1445.0000,1445.0000,14788.0000,2014-06-06 04:26:14,2014-07-09,32
9996,1284849192,Two Gun Crowley,Festivals,Film & Video,USD,2016-01-06,1000000.0000,2015-12-24 22:44:42,0.0000,0,US,0.0000,0.0000,1000000.0000,2015-12-24 22:44:42,2016-01-06,12
9997,1284854192,"""Mercy Buckets"" launches at the SF Fringe",Theater,Theater,USD,2011-09-18,500.0000,2011-08-14 23:28:38,1480.0000,24,US,1480.0000,1480.0000,500.0000,2011-08-14 23:28:38,2011-09-18,34
9998,1284891103,"Cthulhu Grandmas Present, Cthulhu Dice Bags.",Tabletop Games,Games,USD,2017-08-10,20.0000,2017-07-11 20:51:32,2076.0000,75,US,238.0000,2076.0000,20.0000,2017-07-11 20:51:32,2017-08-10,29


In [11]:
# preparando datos para aplicar algoritmos ML
# seleccionamos las columnas a incluir en el modelo
df_variables = df_classify[['usd_goal_real','backers','main_category','duration','currency']]
df_variables.head()

Unnamed: 0,usd_goal_real,backers,main_category,duration,currency
7830,10812.15,13,Publishing,29,GBP
4571,25000.0,68,Film & Video,29,USD
1777,2500.0,12,Dance,13,USD
2715,80000.0,2,Music,29,USD
4036,20000.0,43,Film & Video,48,USD


In [12]:
# -----------------------------------------------------------------------------------------
df_t_variables = df_target[['usd_goal_real','backers','main_category','duration','currency']]
df_t_variables.head()

Unnamed: 0,usd_goal_real,backers,main_category,duration,currency
0,5000.0,1,Music,29,USD
1,2406.39,0,Publishing,29,CAD
2,5000.0,40,Food,29,USD
3,10000.0,147,Music,29,USD
4,17489.65,0,Crafts,29,GBP


In [13]:
# Convertimos las variables categoricas a variables ficticias o dummies
X = pd.get_dummies(data=df_variables, columns=['main_category','currency'], drop_first=True)
X.head()


Unnamed: 0,usd_goal_real,backers,duration,main_category_Comics,main_category_Crafts,main_category_Dance,main_category_Design,main_category_Fashion,main_category_Film & Video,main_category_Food,main_category_Games,main_category_Journalism,main_category_Music,main_category_Photography,main_category_Publishing,main_category_Technology,main_category_Theater,currency_CAD,currency_CHF,currency_DKK,currency_EUR,currency_GBP,currency_HKD,currency_MXN,currency_NOK,currency_NZD,currency_SEK,currency_SGD,currency_USD
7830,10812.15,13,29,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4571,25000.0,68,29,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1777,2500.0,12,13,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2715,80000.0,2,29,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4036,20000.0,43,48,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [14]:
# -----------------------------------------------------------------------------------------
# Convertimos las variables categoricas a variables ficticias o dummies
X_t = pd.get_dummies(data=df_t_variables, columns=['main_category','currency'], drop_first=True)
X_t.head()


Unnamed: 0,usd_goal_real,backers,duration,main_category_Comics,main_category_Crafts,main_category_Dance,main_category_Design,main_category_Fashion,main_category_Film & Video,main_category_Food,main_category_Games,main_category_Journalism,main_category_Music,main_category_Photography,main_category_Publishing,main_category_Technology,main_category_Theater,currency_CAD,currency_CHF,currency_DKK,currency_EUR,currency_GBP,currency_HKD,currency_MXN,currency_NOK,currency_NZD,currency_SEK,currency_SGD,currency_USD
0,5000.0,1,29,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,2406.39,0,29,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,5000.0,40,29,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,10000.0,147,29,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,17489.65,0,29,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [15]:
# genermos columnas dummies para el dato categorico de estado
y = pd.get_dummies(data = df_classify.state, drop_first=True)
y.head()

Unnamed: 0,successful
7830,0
4571,0
1777,0
2715,0
4036,0


In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [17]:
# Creamos el modelo
from sklearn.linear_model import LogisticRegression
ks_model = LogisticRegression().fit(X_train, y_train)

  return f(*args, **kwargs)


In [18]:
# Validamos el modelo 
# aplicando una matriz de confusión
from sklearn.metrics import confusion_matrix, classification_report
y_pred_test = ks_model.predict(X_test)
confusion_matrix(y_test, y_pred_test)

array([[617,  40],
       [ 89, 633]], dtype=int64)

In [19]:
# Revisamos el reporte de clasificación
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.87      0.94      0.91       657
           1       0.94      0.88      0.91       722

    accuracy                           0.91      1379
   macro avg       0.91      0.91      0.91      1379
weighted avg       0.91      0.91      0.91      1379



# Dataset para submission

In [20]:
# --------------------------------------------------
# Validamos el modelo aplicando una matriz de confusión
y_pred_test = ks_model.predict(X_t)
# Agregamos las etiquetas de predicción al df
df_target['state'] = y_pred_test
# creamos el df de submission
df_submission = df_target[['ID','state']]
df_submission.head(10)

Unnamed: 0,ID,state
0,1000007540,0
1,100004721,0
2,1000071625,1
3,1000102741,1
4,1000103948,0
5,1000170964,0
6,1000171141,1
7,1000183112,1
8,1000202062,0
9,1000227361,1


In [21]:
df_submission.state.value_counts()

0    6103
1    3897
Name: state, dtype: int64

In [22]:
# creamos el archivo csv para submission en el portal kaggle
df_submission.to_csv('./outputs/kickstarter_2.csv', index=False)