In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this presumably also hides pandas' SettingWithCopyWarning raised by
# the column assignments in later cells -- confirm that is intended.
warnings.simplefilter('ignore')


In [2]:
# Load the source data
df_org = pd.read_csv('./data/ks-projects-201801.csv')


In [35]:
# Keep only the rows whose state is 'successful' or 'failed'.
# .copy() makes df an independent frame, so the column assignments in the
# following cells never write into a slice of df_org (the chained-assignment
# pattern pandas warns about).
df = df_org[df_org['state'].isin(['successful', 'failed'])].copy()

## プロジェクトの状態（state）

In [36]:
# Encode the target as an integer: 1 for successful, 0 for failed
state_mapping = dict(successful=1, failed=0)
df['state'] = df['state'].map(state_mapping)

## 日時

### 期間を新しく作成する
deadline - launched

In [37]:
# Parse the date columns from strings into Timestamps
for date_col in ('deadline', 'launched'):
    df[date_col] = pd.to_datetime(df[date_col])

# Campaign length: deadline minus launch, kept as a whole number of days
campaign_span = df['deadline'] - df['launched']
df['period'] = campaign_span.dt.days

### launched
周期性を考慮してクォーターで分ける
米国の会計年度は10月から9月。

In [38]:
# Split the launch timestamp into its calendar components
launch_dt = df['launched'].dt
for part in ('year', 'month', 'day'):
    df['launched_' + part] = getattr(launch_dt, part)

### 不要な列を削除する

In [39]:
# launched と deadline を削除
# df = df.drop(columns=['deadline', 'launched'])

## カテゴリ

In [40]:
# Join main_category and category into one combined label (class),
# using '**' as the separator
df['class'] = df['main_category'] + '**' + df['category']
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,period,launched_year,launched_month,launched_day,class
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,0,0,GB,0.0,0.0,1533.95,58,2015,8,11,Publishing**Poetry
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,0,15,US,100.0,2421.0,30000.0,59,2017,9,2,Film & Video**Narrative Film
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,0,3,US,220.0,220.0,45000.0,44,2013,1,12,Film & Video**Narrative Film
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,0,1,US,1.0,1.0,5000.0,29,2012,3,17,Music**Music
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,1,224,US,52375.0,52375.0,50000.0,34,2016,2,26,Food**Restaurants


In [41]:
# Count the occurrences of each class and turn the result into a DataFrame
# https://www.web-dev-qa-db-ja.com/ja/python/python-pandas%EF%BC%9A-valuecounts%E5%87%BA%E5%8A%9B%E3%82%92%E3%83%87%E3%83%BC%E3%82%BF%E3%83%95%E3%83%AC%E3%83%BC%E3%83%A0%E3%81%AB%E5%A4%89%E6%8F%9B%E3%81%97%E3%81%BE%E3%81%99/835447736/
df_counts = df['class'].value_counts().rename_axis('class').reset_index(name='counts')
# Number of most frequent classes that keep their own label
TOP_N_CLASSES = 120
df_largest = df_counts.nlargest(TOP_N_CLASSES, columns='counts')
# Every class outside the top TOP_N_CLASSES is treated as rare
df_others = df_counts.drop(df_largest.index)
# Names of the rare classes
others_list = df_others['class'].to_list()
# Relabel the rare classes as 'Others'. Only the class column is touched:
# a frame-wide df.replace() would also rewrite any other column (e.g. name)
# that happened to contain one of these strings.
df['class'] = df['class'].where(~df['class'].isin(others_list), 'Others')

In [42]:
# Preview the first rows after the rare classes were relabelled
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,period,launched_year,launched_month,launched_day,class
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,0,0,GB,0.0,0.0,1533.95,58,2015,8,11,Publishing**Poetry
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,0,15,US,100.0,2421.0,30000.0,59,2017,9,2,Film & Video**Narrative Film
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,0,3,US,220.0,220.0,45000.0,44,2013,1,12,Film & Video**Narrative Film
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,0,1,US,1.0,1.0,5000.0,29,2012,3,17,Music**Music
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,1,224,US,52375.0,52375.0,50000.0,34,2016,2,26,Food**Restaurants


## プロジェクト名
nameから特徴量（母音の比率・文字数・単語数）を作成する

In [43]:
# We generate a new feature based on the ratio between vowels and all alphabetic characters
def countVowelstoLettersRatio(s):
    '''Return the share of vowels among the alphabetic characters of ``s``.

    ``s`` is converted with ``str()`` first, so non-string values (for example
    a missing name stored as NaN) are scored on their string representation.

    Vowels are the letters a, e, i, o, u in either case. The denominator is
    the number of alphabetic characters plus one; the extra one keeps the
    division defined when ``s`` contains no letters, in which case the result
    is 0.0.

    Returns a float in the range [0, 1).
    '''
    vowel_set = frozenset('aeiou')
    letters = [ch for ch in str(s) if ch.isalpha()]
    # Lower-case each letter before the lookup so that 'A' counts like 'a'
    vowels = sum(1 for ch in letters if ch.lower() in vowel_set)
    return (vowels * 1.0) / (len(letters) + 1)

In [44]:
# Vowel ratio of every project name
df['name_vowel_ratio'] = df['name'].map(countVowelstoLettersRatio)


In [79]:
# Character count of every project name
df['name_len'] = df['name'].str.len()

### 単語数を抽出

In [45]:
# # 単語数をカウントする関数
# def count_words(sentence):
#     words_list = str(sentence).split(' ') 
#     return len(words_list)
# # 単語数をカウントする関数を name の各データに適用
# df['n_words'] = df['name'].apply(count_words) 
# # name を削除
# # df = df.drop(columns=['name'])

In [80]:
# Word count of every project name: one more than the number of single spaces
# in its string form (same result as splitting on ' ' and counting the pieces)
df['name_number_of_word'] = df['name'].map(lambda title: str(title).count(' ') + 1)


## 目標金額
usd_goal_realを対数変換する

In [82]:
# Look at the first goal amounts before transforming them
df['usd_goal_real'].head()

0     1533.95
1    30000.00
2    45000.00
3     5000.00
5    50000.00
Name: usd_goal_real, dtype: float64

In [47]:
# df['usd_goal_real'].plot(bins=20, kind='hist')

In [48]:
# Natural-log transform of usd_goal_real
df['log_goal'] = np.log(df['usd_goal_real'])
# df = df.drop(columns='usd_goal_real')

In [49]:
# Bucket the USD goal by floor-dividing it with several bucket widths
for bucket_width in (1000, 500, 10):
    df['Goal_' + str(bucket_width)] = df['usd_goal_real'] // bucket_width


## 過去のプロジェクト数

In [62]:
# Build one DataFrame per main category, each ordered by launch time.
# The range helpers defined further down locate window boundaries with
# Series.searchsorted on 'launched', which only returns meaningful positions
# on sorted data, and df is not sorted by 'launched' anywhere above.
categories = set(df.main_category)
frames = {}
for ct in categories:
    frames[ct] = df[df['main_category'] == ct].sort_values('launched')


In [63]:
# from datetime import datetime
# import time

# def get_timestamp(dt):
#     '''Add timestamp as a value'''
#     return time.mktime(dt.timetuple())


In [64]:
# We will use Progressbar to track progress as it istime consuming operation
# import pyprind

def getElementsInRange(cat,end,week):
    '''Count the projects of category ``cat`` launched in the ``week`` weeks before ``end``.

    cat  -- key into the module-level ``frames`` dict
    end  -- timestamp closing the window
    week -- window length in weeks

    The count is the difference between the insertion positions of ``end`` and
    of ``end - week weeks`` in ``frames[cat]['launched']``. That difference is
    only a real count when the column is sorted in ascending order.
    '''
    launched = frames[cat]['launched']
    window_start = end - pd.DateOffset(weeks = week)
    # searchsorted is called with scalars here (an earlier version of this
    # code indexed its result with [0])
    upper = launched.searchsorted(end)
    lower = launched.searchsorted(window_start)
    return upper - lower

In [66]:
# Same-category projects launched during the week before each launch
df['Last_Week'] = [
    getElementsInRange(category, launched_at, 1)
    for category, launched_at in zip(df['main_category'], df['launched'])
]

In [67]:
# Same-category projects launched during the 4 weeks before each launch
df['Last_Month'] = [
    getElementsInRange(category, launched_at, 4)
    for category, launched_at in zip(df['main_category'], df['launched'])
]


In [68]:
# Same-category projects launched during the 26 weeks before each launch
df['Last_6_Month'] = [
    getElementsInRange(category, launched_at, 26)
    for category, launched_at in zip(df['main_category'], df['launched'])
]


In [69]:
# Same-category projects launched during the 13 weeks before each launch
df['Last_3_Month'] = [
    getElementsInRange(category, launched_at, 13)
    for category, launched_at in zip(df['main_category'], df['launched'])
]


In [70]:
# Same-category projects launched during the 52 weeks before each launch
df['Last_Year'] = [
    getElementsInRange(category, launched_at, 52)
    for category, launched_at in zip(df['main_category'], df['launched'])
]


## ある期間における目標金額

In [57]:
def getRangeMean(cat,end,week):
    '''Mean ``goal`` of the category ``cat`` projects launched in the ``week`` weeks before ``end``.

    cat  -- key into the module-level ``frames`` dict
    end  -- timestamp closing the window
    week -- window length in weeks

    The window rows are selected by position, between the insertion points of
    the window start and of ``end`` in ``frames[cat]['launched']``; this picks
    the intended rows only when that column is sorted in ascending order.
    An empty window gives NaN.

    NOTE(review): the statistic is taken over 'goal', not 'usd_goal_real' --
    confirm that mixing currencies is intended.
    '''
    group = frames[cat]
    launch_times = group['launched']
    first = launch_times.searchsorted(end - pd.DateOffset(weeks = week))
    last = launch_times.searchsorted(end)
    return group['goal'].iloc[first:last].mean()


In [58]:
def getRangeMedian(cat,end,week):
    '''Median ``goal`` of the category ``cat`` projects launched in the ``week`` weeks before ``end``.

    cat  -- key into the module-level ``frames`` dict
    end  -- timestamp closing the window
    week -- window length in weeks

    The window rows are selected by position, between the insertion points of
    the window start and of ``end`` in ``frames[cat]['launched']``; this picks
    the intended rows only when that column is sorted in ascending order.
    An empty window gives NaN.

    NOTE(review): the statistic is taken over 'goal', not 'usd_goal_real' --
    confirm that mixing currencies is intended.
    '''
    group = frames[cat]
    launch_times = group['launched']
    first = launch_times.searchsorted(end - pd.DateOffset(weeks = week))
    last = launch_times.searchsorted(end)
    return group['goal'].iloc[first:last].median()


In [71]:
# Mean goal in the same category over the 4 weeks before each launch
df['mean_goal_in_category_last_month'] = [
    getRangeMean(category, launched_at, 4)
    for category, launched_at in zip(df['main_category'], df['launched'])
]

In [72]:
# Median goal in the same category over the 4 weeks before each launch
df['median_goal_in_category_last_month'] = [
    getRangeMedian(category, launched_at, 4)
    for category, launched_at in zip(df['main_category'], df['launched'])
]

In [73]:
# Mean goal in the same category over the 52 weeks (last year) before each launch
df['mean_goal_in_category_last_year'] = [
    getRangeMean(category, launched_at, 52)
    for category, launched_at in zip(df['main_category'], df['launched'])
]


In [74]:
# Median goal in the same category over the 26 weeks before each launch
df['median_goal_Last_6_Month'] = [
    getRangeMedian(category, launched_at, 26)
    for category, launched_at in zip(df['main_category'], df['launched'])
]

In [75]:
# Mean goal in the same category over the 26 weeks before each launch
df['mean_goal_Last_6_Month'] = [
    getRangeMean(category, launched_at, 26)
    for category, launched_at in zip(df['main_category'], df['launched'])
]


In [76]:
# Mean goal in the same category over the week before each launch
df['mean_goal_Last_Week'] = [
    getRangeMean(category, launched_at, 1)
    for category, launched_at in zip(df['main_category'], df['launched'])
]


In [77]:
# Median goal in the same category over the week before each launch
df['median_goal_Last_Week'] = [
    getRangeMedian(category, launched_at, 1)
    for category, launched_at in zip(df['main_category'], df['launched'])
]


# 加工データの保存

In [81]:
# Earlier snapshots, disabled; the notes under this cell list what each file contains
# df.to_csv('./data/input_01.csv')
# df.to_csv('./data/input_02.csv')
# df.to_csv('./data/input_03.csv')
# df.to_csv('./data/input_04.csv')
# Write the current feature set (the DataFrame index is written as well)
df.to_csv('./data/input_05.csv')

### input_01.csv
- 初期

### input_02.csv
- input_01
- main_category x category -> class

### input_03.csv
- input_01
- main_category x category -> class
- launched -> year/month/day

### input_04.csv
- input_03
- how successful is ...の特徴量を参考に追加

### input_05.csv
- input_04
- how successful is ...の特徴量を参考にさらに追加
