## Kickstarter Projects

In [179]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix

### 1. データの読み込み 

In [180]:
# Read the Kickstarter export and keep only the columns used in this notebook.
# Selecting after the read (rather than via usecols) keeps the columns in the
# order listed here.
PROJECT_CSV = "../ks-projects-201801.csv"
PROJECT_COLUMNS = [
    'name', 'state', 'currency', 'main_category', 'category',
    'deadline', 'goal', 'launched', 'country',
]
df_project = pd.read_csv(PROJECT_CSV)[PROJECT_COLUMNS]
display(df_project.head())

Unnamed: 0,name,state,currency,main_category,category,deadline,goal,launched,country
0,The Songs of Adelaide & Abullah,failed,GBP,Publishing,Poetry,2015-10-09,1000.0,2015-08-11 12:12:28,GB
1,Greeting From Earth: ZGAC Arts Capsule For ET,failed,USD,Film & Video,Narrative Film,2017-11-01,30000.0,2017-09-02 04:43:57,US
2,Where is Hank?,failed,USD,Film & Video,Narrative Film,2013-02-26,45000.0,2013-01-12 00:20:50,US
3,ToshiCapital Rekordz Needs Help to Complete Album,failed,USD,Music,Music,2012-04-16,5000.0,2012-03-17 03:24:11,US
4,Community Film Project: The Art of Neighborhoo...,canceled,USD,Film & Video,Film & Video,2015-08-29,19500.0,2015-07-04 08:35:03,US


### 2.事前処理

In [181]:
# Convert the campaign duration (deadline - launch) into whole days.
df_project['deadline'] = pd.to_datetime(df_project['deadline'])
df_project['launched'] = pd.to_datetime(df_project['launched'])
df_project['running_time'] = (df_project['deadline'] - df_project['launched']).dt.days

# Keep only projects that ended as failed or successful.
# .copy() makes the result an independent frame, so the column assignments
# below do not write into a slice (avoids SettingWithCopyWarning).
df_project = df_project[df_project["state"].isin(["failed", "successful"])].copy()

# Encode the target as integers: 1 = successful, 0 = failed.
df_project["state"] = (df_project["state"] == "successful").astype("int64")

# Project-name length with spaces removed, as a new feature.
# Missing names are filled with "" BEFORE casting to str so they count as 0.
# The previous `x != np.NaN` check was always True, and astype(str) had already
# turned NaN into the literal "nan", so missing names were counted as length 3.
df_project['name'] = df_project['name'].fillna("").astype(str)
df_project["name_length"] = df_project["name"].str.replace(" ", "", regex=False).str.len()

# Month in which the project was launched.
df_project["launched_month"] = df_project["launched"].dt.month

# Drop the raw columns that have been replaced by derived features.
df_project = df_project.drop(["name","launched","deadline"], axis = 1)

欠損値を確認する

In [182]:
# Count missing values per column.
# The output below shows zero for every column, so no missing-value handling is needed.
df_project.isnull().sum()

state             0
currency          0
main_category     0
category          0
goal              0
country           0
running_time      0
name_length       0
launched_month    0
dtype: int64

In [183]:
# Preview the first rows of the frame after preprocessing.
display(df_project.head())

Unnamed: 0,state,currency,main_category,category,goal,country,running_time,name_length,launched_month
0,0,GBP,Publishing,Poetry,1000.0,GB,58,26,8
1,0,USD,Film & Video,Narrative Film,30000.0,US,59,38,9
2,0,USD,Film & Video,Narrative Film,45000.0,US,44,12,1
3,0,USD,Music,Music,5000.0,US,29,43,3
5,1,USD,Food,Restaurants,50000.0,US,34,18,2


カテゴリ変数（質的な項目）を前処理する

In [184]:
# Flag currencies that account for less than 1% of the projects (True = rare).
currency_share_pct = df_project['currency'].value_counts(normalize=True).mul(100)
currency_matter = currency_share_pct.lt(1)
print(currency_matter)

USD    False
GBP    False
EUR    False
CAD    False
AUD    False
SEK     True
MXN     True
NZD     True
DKK     True
CHF     True
NOK     True
HKD     True
SGD     True
JPY     True
Name: currency, dtype: bool


In [185]:
# Collapse the currencies flagged as rare (<1% share) into a single 'others' bucket.
rare_currencies = currency_matter[currency_matter].index
is_rare_currency = df_project['currency'].isin(rare_currencies)
df_project.loc[is_rare_currency, 'currency'] = 'others'

In [186]:
# Flag countries that account for less than 1% of the projects (True = rare).
country_share_pct = df_project['country'].value_counts(normalize=True).mul(100)
country_matter = country_share_pct.lt(1)
print(country_matter)

US      False
GB      False
CA      False
AU      False
DE      False
FR       True
NL       True
IT       True
ES       True
SE       True
MX       True
NZ       True
DK       True
IE       True
CH       True
NO       True
BE       True
AT       True
HK       True
SG       True
N,0"     True
LU       True
JP       True
Name: country, dtype: bool


In [187]:
# Collapse the countries flagged as rare (<1% share) into a single 'others' bucket.
rare_countries = country_matter[country_matter].index
is_rare_country = df_project['country'].isin(rare_countries)
df_project.loc[is_rare_country, 'country'] = 'others'

In [188]:
# Flag main categories that account for less than 1% of the projects (True = rare).
main_category_share_pct = df_project['main_category'].value_counts(normalize=True).mul(100)
main_category_matter = main_category_share_pct.lt(1)
print(main_category_matter)

Film & Video    False
Music           False
Publishing      False
Games           False
Technology      False
Art             False
Design          False
Food            False
Fashion         False
Theater         False
Comics          False
Photography     False
Crafts          False
Journalism      False
Dance           False
Name: main_category, dtype: bool


In [189]:
# Flag sub-categories that account for less than 1% of the projects (True = rare).
category_share_pct = df_project['category'].value_counts(normalize=True).mul(100)
category_matter = category_share_pct.lt(1)
print(category_matter)

Product Design       False
Documentary          False
Music                False
Tabletop Games       False
Shorts               False
Food                 False
Video Games          False
Film & Video         False
Fiction              False
Fashion              False
Art                  False
Nonfiction           False
Theater              False
Rock                 False
Children's Books     False
Apparel              False
Technology           False
Indie Rock           False
Apps                 False
Photography          False
Webseries            False
Publishing           False
Narrative Film       False
Comics               False
Web                  False
Country & Folk       False
Crafts               False
Design               False
Hip-Hop              False
Hardware              True
                     ...  
Blues                 True
Animals               True
Couture               True
Fabrication Tools     True
Makerspaces           True
Movie Theaters        True
P

In [190]:
# Collapse the sub-categories flagged as rare (<1% share) into a single 'others' bucket.
rare_categories = category_matter[category_matter].index
is_rare_category = df_project['category'].isin(rare_categories)
df_project.loc[is_rare_category, 'category'] = 'others'

One Hot Encodingでカテゴリー項目を数値化する

In [176]:

# Build one indicator (dummy) frame per categorical column, prefixed with the
# column name. The source columns are not modified by the merges below, so all
# four frames can be built up front.
category_dummy = pd.get_dummies(df_project['category'], prefix='category')
main_category_dummy = pd.get_dummies(df_project['main_category'], prefix='main_category')
currency_dummy = pd.get_dummies(df_project['currency'], prefix='currency')
country_dummy = pd.get_dummies(df_project['country'], prefix='country')

# Attach the indicator columns by row index, in the same order as before:
# category, main_category, currency, country.
for dummy_frame in (category_dummy, main_category_dummy, currency_dummy, country_dummy):
    df_project = df_project.merge(dummy_frame, left_index=True, right_index=True)

In [178]:
# Drop the four categorical source columns listed below, then preview the frame.
encoded_source_columns = ["main_category", "currency", "category", "country"]
df_project = df_project.drop(encoded_source_columns, axis="columns")
df_project.head()

Unnamed: 0,state,goal,running_time,name_length,launched_month,category_Apparel,category_Apps,category_Art,category_Children's Books,category_Comics,...,currency_EUR,currency_GBP,currency_USD,currency_others,country_AU,country_CA,country_DE,country_GB,country_US,country_others
0,0,1000.0,58,26,8,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,0,30000.0,59,38,9,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,0,45000.0,44,12,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,0,5000.0,29,43,3,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
5,1,50000.0,34,18,2,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
