## Kickstarter Projects

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix

### 1. データの読み込み 

In [2]:
# Read the Kickstarter export and keep only the columns this notebook works with,
# in the order listed here.
selected_columns = [
    'name', 'state', 'currency', 'main_category', 'category',
    'deadline', 'goal', 'launched', 'country',
]
df_project = pd.read_csv("../ks-projects-201801.csv").loc[:, selected_columns]

# Preview the first rows to sanity-check the load.
display(df_project.head())

Unnamed: 0,name,state,currency,main_category,category,deadline,goal,launched,country
0,The Songs of Adelaide & Abullah,failed,GBP,Publishing,Poetry,2015-10-09,1000.0,2015-08-11 12:12:28,GB
1,Greeting From Earth: ZGAC Arts Capsule For ET,failed,USD,Film & Video,Narrative Film,2017-11-01,30000.0,2017-09-02 04:43:57,US
2,Where is Hank?,failed,USD,Film & Video,Narrative Film,2013-02-26,45000.0,2013-01-12 00:20:50,US
3,ToshiCapital Rekordz Needs Help to Complete Album,failed,USD,Music,Music,2012-04-16,5000.0,2012-03-17 03:24:11,US
4,Community Film Project: The Art of Neighborhoo...,canceled,USD,Film & Video,Film & Video,2015-08-29,19500.0,2015-07-04 08:35:03,US


### 2. 前処理

In [3]:
# Convert the campaign duration into a whole number of days.
# `.dt.days` keeps only the day component of the timedelta (partial days are dropped).
df_project['deadline'] = pd.to_datetime(df_project['deadline'])
df_project['launched'] = pd.to_datetime(df_project['launched'])
df_project['running_time'] = (df_project['deadline'] - df_project['launched']).dt.days

# Keep only projects whose outcome is "failed" or "successful"; every other state is excluded.
# `.copy()` detaches the filtered frame from the original so the column assignments
# below operate on an independent object and do not raise SettingWithCopyWarning.
df_project = df_project[df_project["state"].isin(["failed", "successful"])].copy()

# Encode the target as an integer: 1 = successful, 0 = failed.
df_project["state"] = df_project["state"].eq("successful").astype("int64")

# Add the project-name length (spaces excluded) as a feature.
# Missing names are replaced with an empty string BEFORE the string conversion so they
# count as length 0; converting first would turn NaN into the literal text "nan"
# (length 3), and a `!= NaN` comparison can never detect a missing value anyway.
df_project['name'] = df_project['name'].fillna("").astype(str)
df_project["name_length"] = df_project["name"].str.replace(" ", "").str.len()

# Extract the calendar month in which the project was launched.
df_project["launched_month"] = df_project["launched"].dt.month

# Drop the raw columns that have been replaced by the derived features above.
df_project = df_project.drop(columns=["name", "launched", "deadline"])

欠損値を確認する

In [4]:
# Count missing values per column.
# The result below shows zero for every column, so no missing-value handling is needed.
df_project.isnull().sum()

state             0
currency          0
main_category     0
category          0
goal              0
country           0
running_time      0
name_length       0
launched_month    0
dtype: int64

In [5]:
# Preview the first five rows of the preprocessed frame.
display(df_project.head(5))

Unnamed: 0,state,currency,main_category,category,goal,country,running_time,name_length,launched_month
0,0,GBP,Publishing,Poetry,1000.0,GB,58,26,8
1,0,USD,Film & Video,Narrative Film,30000.0,US,59,38,9
2,0,USD,Film & Video,Narrative Film,45000.0,US,44,12,1
3,0,USD,Music,Music,5000.0,US,29,43,3
5,1,USD,Food,Restaurants,50000.0,US,34,18,2


カテゴリー的な項目を前処理する

In [6]:
# Share of each currency, as a percentage of all rows.
df_project['currency'].value_counts(normalize=True).mul(100)

USD    78.845557
GBP     8.887013
EUR     4.334966
CAD     3.731062
AUD     1.996231
SEK     0.455265
MXN     0.425416
NZD     0.384111
DKK     0.280093
CHF     0.196578
NOK     0.176076
HKD     0.143815
SGD     0.136881
JPY     0.006934
Name: currency, dtype: float64

In [8]:
# Collapse every currency whose share is below 1% into a single 'others' value.
# `currency_others` is a boolean Series indexed by currency code (True = rare).
currency_others = df_project['currency'].value_counts(normalize=True).mul(100).lt(1)
is_rare_currency = df_project['currency'].isin(currency_others[currency_others].index)
df_project.loc[is_rare_currency, 'currency'] = 'others'

In [9]:
# Share of each country, as a percentage of all rows.
df_project['country'].value_counts(normalize=True).mul(100)

US      78.800030
GB       8.880380
CA       3.729555
AU       1.994724
DE       1.035954
FR       0.759780
NL       0.726916
IT       0.714253
ES       0.564709
SE       0.454963
MX       0.425416
NZ       0.384111
DK       0.279189
IE       0.205924
CH       0.196578
NO       0.175473
BE       0.157684
AT       0.146227
HK       0.143815
SG       0.136881
N,0"     0.063315
LU       0.017185
JP       0.006934
Name: country, dtype: float64

In [10]:
# Collapse every country whose share is below 1% into a single 'others' value.
# `country_other` is a boolean Series indexed by country code (True = rare).
country_other = df_project['country'].value_counts(normalize=True).mul(100).lt(1)
is_rare_country = df_project['country'].isin(country_other[country_other].index)
df_project.loc[is_rare_country, 'country'] = 'others'

In [12]:
# Share of each main_category, as a percentage of all rows.
df_project['main_category'].value_counts(normalize=True).mul(100)

Film & Video    17.042888
Music           13.853622
Publishing      10.686666
Games            8.599080
Technology       8.155574
Art              7.730761
Design           7.647245
Food             6.649280
Fashion          5.962162
Theater          3.087963
Comics           2.978217
Photography      2.921233
Crafts           2.357127
Journalism       1.250923
Dance            1.077259
Name: main_category, dtype: float64

In [13]:
# No main_category has a share below 1%, so main_category is left untouched.

# Share of each (sub-)category, as a percentage of all rows.
df_project['category'].value_counts(normalize=True).mul(100)

Product Design       5.632019
Documentary          4.378684
Music                3.808849
Tabletop Games       3.540816
Shorts               3.435291
Food                 3.140424
Video Games          2.812392
Film & Video         2.600136
Fiction              2.444562
Fashion              2.269993
Art                  2.257029
Nonfiction           2.235321
Theater              2.015226
Rock                 1.883169
Children's Books     1.868697
Apparel              1.859049
Technology           1.726389
Indie Rock           1.606693
Apps                 1.594030
Photography          1.581669
Webseries            1.552122
Publishing           1.523178
Narrative Film       1.392327
Comics               1.381171
Web                  1.299163
Country & Folk       1.270219
Crafts               1.251828
Design               1.069420
Hip-Hop              1.042888
Hardware             0.939172
                       ...   
Blues                0.070551
Animals              0.069043
Couture   

In [14]:
# Collapse every category whose share is below 1% into a single 'others' value.
# `category_other` is a boolean Series indexed by category name (True = rare).
category_other = df_project['category'].value_counts(normalize=True).mul(100).lt(1)
is_rare_category = df_project['category'].isin(category_other[category_other].index)
df_project.loc[is_rare_category, 'category'] = 'others'

One Hot Encodingでカテゴリー項目を数値化する

In [15]:

# Build one indicator frame per categorical column; each indicator column is
# named "<source column>_<value>".
category_dummy, main_category_dummy, currency_dummy, country_dummy = (
    pd.get_dummies(df_project[source_column], prefix=source_column)
    for source_column in ('category', 'main_category', 'currency', 'country')
)

# Attach the indicator frames to df_project one after another, aligning rows on the index.
# The original categorical columns are still present after this step.
for dummy_frame in (category_dummy, main_category_dummy, currency_dummy, country_dummy):
    df_project = df_project.merge(dummy_frame, left_index=True, right_index=True)

In [16]:
# Remove the original categorical columns now that their one-hot indicators exist,
# then preview the resulting frame.
encoded_source_columns = ["main_category", "currency", "category", "country"]
df_project = df_project.drop(columns=encoded_source_columns)
df_project.head()

Unnamed: 0,state,goal,running_time,name_length,launched_month,category_Apparel,category_Apps,category_Art,category_Children's Books,category_Comics,...,currency_EUR,currency_GBP,currency_USD,currency_others,country_AU,country_CA,country_DE,country_GB,country_US,country_others
0,0,1000.0,58,26,8,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,0,30000.0,59,38,9,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,0,45000.0,44,12,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,0,5000.0,29,43,3,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
5,1,50000.0,34,18,2,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
