## Kickstarter Projects

In [86]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix

### 1. データの読み込み 

In [87]:
df_project = pd.read_csv("../ks-projects-201801.csv")[['state','currency','main_category','category','deadline','goal','launched','country']]
display(df_project.head())

Unnamed: 0,state,currency,main_category,category,deadline,goal,launched,country
0,failed,GBP,Publishing,Poetry,2015-10-09,1000.0,2015-08-11 12:12:28,GB
1,failed,USD,Film & Video,Narrative Film,2017-11-01,30000.0,2017-09-02 04:43:57,US
2,failed,USD,Film & Video,Narrative Film,2013-02-26,45000.0,2013-01-12 00:20:50,US
3,failed,USD,Music,Music,2012-04-16,5000.0,2012-03-17 03:24:11,US
4,canceled,USD,Film & Video,Film & Video,2015-08-29,19500.0,2015-07-04 08:35:03,US


### 2.事前処理

プロジェクトの期間を日数に変換する

In [88]:
df_project['deadline'] = pd.to_datetime(df_project['deadline'])
df_project['launched'] = pd.to_datetime(df_project['launched'])
df_project['running_time'] = (df_project['deadline'] - df_project['launched']).dt.days

まだ終わらないプロジェクトを対象外にする

In [89]:
df_project_new = df_project[df_project['state'] != 'live']
df_project = df_project_new

state列からsuccessful列を作成する（１：True、0：False）

In [90]:
df_project['successful'] = pd.get_dummies(df_project['state'])['successful']

種類的な列を数字化する

In [91]:
category_index, category_labels = pd.factorize(df_project['category'])
df_project['category'] = category_index
main_category_index, main_category_labels = pd.factorize(df_project['main_category'])
df_project['main_category'] = main_category_index
country_index, country_labels = pd.factorize(df_project['country'])
df_project['country'] = country_index
currency_index, currency_labels = pd.factorize(df_project['currency'])
df_project['currency'] = currency_index

要らない列を廃棄する

In [92]:
df_project.drop(['state','deadline','launched'],inplace=True, axis=1)
display(df_project.head())
df_project.describe()

Unnamed: 0,currency,main_category,category,goal,country,running_time,successful
0,0,0,0,1000.0,0,58,0
1,1,1,1,30000.0,1,59,0
2,1,1,1,45000.0,1,44,0
3,1,2,2,5000.0,1,29,0
4,1,1,3,19500.0,1,55,0


Unnamed: 0,currency,main_category,category,goal,country,running_time,successful
count,375862.0,375862.0,375862.0,375862.0,375862.0,375862.0,375862.0
mean,1.330738,4.960778,38.722111,48874.65,1.719647,33.441473,0.356397
std,1.422147,4.104895,34.47773,1176595.0,2.796445,66.142888,0.478935
min,0.0,0.0,0.0,0.01,0.0,0.0,0.0
25%,1.0,1.0,9.0,2000.0,1.0,29.0,0.0
50%,1.0,4.0,30.0,5100.0,1.0,29.0,0.0
75%,1.0,8.0,55.0,16000.0,1.0,36.0,1.0
max,13.0,14.0,158.0,100000000.0,22.0,16738.0,1.0


### 3. [演習] Scikit-learnを用いてロジスティック回帰を実装する

In [94]:
y = df_project["successful"].values
X = df_project.drop('successful', axis=1).values
clf = SGDClassifier(loss='log', penalty='none', max_iter=10000, fit_intercept=True, random_state=1234, tol=1e-3)
clf.fit(X, y)

# 重みを取得して表示
w0 = clf.intercept_[0]
w1 = clf.coef_[0, 0]
w2 = clf.coef_[0, 1]
w3 = clf.coef_[0, 2]
w4 = clf.coef_[0, 3]
w5 = clf.coef_[0, 4]
w6 = clf.coef_[0, 5]
print('w0 = {:.3f}, w1 = {:.3f}, w2 = {:.3f}, w3 = {:.3f}, w4 = {:.3f}, w5 = {:.3f}, w6 = {:.3f}'.format(w0, w1, w2, w3, w4, w5, w6))

w0 = 3892.042, w1 = 4078.862, w2 = 10628.476, w3 = -12556.622, w4 = -143.112, w5 = 1826.531, w6 = 46589.163


In [95]:
# ラベルを予測
y_est = clf.predict(X)

# 対数尤度を表示
print('対数尤度 = {:.3f}'.format(- log_loss(y, y_est)))

# 正答率を表示
print('正答率 = {:.3f}%'.format(100 * accuracy_score(y, y_est)))

対数尤度 = -15.058
正答率 = 56.403%


In [96]:
# 予測値と正解のクロス集計
conf_mat = pd.DataFrame(confusion_matrix(y, y_est), 
                        index=['正解 = リノベーションなし', '正解 = リノベーション済み'], 
                        columns=['予測 = リノベーションなし', '予測 = リノベーション済み'])
conf_mat

Unnamed: 0,予測 = リノベーションなし,予測 = リノベーション済み
正解 = リノベーションなし,131538,110368
正解 = リノベーション済み,53497,80459
