# KickstarterProject
Using kNN, linear regression model to predict success rate of crowdfunding.

In [84]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline

In [93]:
df = pd.read_csv('./data/data.csv')
df.head(1)

Unnamed: 0.1,Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,created_at,currency,currency_symbol,currency_trailing_code,...,static_usd_rate,usd_pledged,usd_type,preparation_duration,preparation_duration_r,launch_duration,launch_duration_r,created_at_readable,deadline_readable,launched_at_readable
0,1,21,2006 was almost 7 years ago.... Can you believ...,Rock,802,US,1387659690,USD,$,True,...,1.0,802.0,international,351356,4d 1H 35M 56S,3888000,45d 0H 0M 0S,2013-12-21 16:01:30,2014-02-08 17:37:26,2013-12-25 17:37:26


In [94]:
df.shape

(3779, 41)

In [95]:
df.state.value_counts()

successful    2224
failed        1276
canceled       149
live           120
suspended       10
Name: state, dtype: int64

In [96]:
# drop status rows labeled as live, canceled, suspended.
df = df[~df['state'].isin(['live', 'canceled', 'suspended'])]
df.shape

(3500, 41)

In [97]:
# drop irrelevant columns.
df.drop(['Unnamed: 0', 'blurb', 'created_at', 'currency_symbol', 'currency_trailing_code', 'current_currency',
       'deadline', 'disable_communication', 'friends', 'id',
       'is_backing', 'is_starred', 'launched_at', 'state_changed_at',
       'name', 'permissions', 'profile', 'source_url', 'staff_pick', 
       'preparation_duration_r', 'launch_duration_r',
       'created_at_readable', 'deadline_readable', 'launched_at_readable'], axis = 1, inplace = True)
df.head()

Unnamed: 0,backers_count,category,converted_pledged_amount,country,currency,fx_rate,goal,is_starrable,location,pledged,spotlight,state,static_usd_rate,usd_pledged,usd_type,preparation_duration,launch_duration
0,21,Rock,802,US,USD,1.0,200.0,False,Chicago,802.0,True,successful,1.0,802.0,international,351356,3888000
1,97,Mixed Media,2259,US,USD,1.0,400.0,False,Sacramento,2259.0,True,successful,1.0,2259.0,international,413843,1728000
2,88,Photobooks,29638,US,USD,1.0,27224.0,False,Columbus,29638.0,True,successful,1.0,29638.0,international,769946,2595600
3,193,Footwear,49158,IT,EUR,1.128433,40000.0,False,Venice,43180.0,True,successful,1.136525,49075.152523,international,314662,3625358
4,20,Software,549,US,USD,1.0,1000.0,False,Redmond,549.0,False,failed,1.0,549.0,domestic,212500,2592000


In [98]:
df['state'] = df.state.str.contains('successful').astype(int)

In [99]:
# add column representing continent
def classifier(row):
    if row.country in ['US', 'CA', 'GT', 'MX', 'PR', 'NI', 'SV', 'PA', 'BO', 'GU']:
        return 'America'
    elif row.country in ['NG', 'GH', 'ZA', 'KE', 'ET', 'CD', 'MA', 'TZ', 'ZM', 'LR', 'RW', 'ML', 'CM', 'NA', 'ZW', 'TN', 'SL', 'CF', 'UG', 'SD', 'SN', 'MW', 'MZ', 'LY', 'GN', 'SZ']:
        return 'Africa'
    elif row.country in ['GB', 'NO', 'DE', 'SE', 'BA', 'IS', 'HU', 'IT', 'NL','FR', 'UK', 'AT',
       'TR','FI', 'CZ','AM', 'PT','DK','CH', 'SJ', 'RU', 'UA', 'BG','ES','PL', 'GE','IE','GR','RS','SI','BE','GL','RO','LT', 'FM','EE','CY', 'MK','KG',]:
        return 'Europe'
    elif row.country in ['JM', 'HT','BS','DO','LC', 'DO', 'TT']:
        return 'Carribean'
    elif row.country in ['CN', 'TW', 'HK', 'NP', 'ID', 'SG', 'IN', 'JP', 'LB', 'KZ', 'KR', 'PH', 'KH', 'TH','MY','BT','LK','BM','VN','BD', 'LA','GU']:
        return 'Asia'
    elif row.country in ['IL','QA', 'AF','KZ','AE','PS','SY','SA', 'IQ','IR','TJ',]:
        return 'Arab'
    else:
        return "Oceania"   
df["continent"] = df.apply(classifier, axis=1)

In [100]:
df.head()

Unnamed: 0,backers_count,category,converted_pledged_amount,country,currency,fx_rate,goal,is_starrable,location,pledged,spotlight,state,static_usd_rate,usd_pledged,usd_type,preparation_duration,launch_duration,continent
0,21,Rock,802,US,USD,1.0,200.0,False,Chicago,802.0,True,1,1.0,802.0,international,351356,3888000,America
1,97,Mixed Media,2259,US,USD,1.0,400.0,False,Sacramento,2259.0,True,1,1.0,2259.0,international,413843,1728000,America
2,88,Photobooks,29638,US,USD,1.0,27224.0,False,Columbus,29638.0,True,1,1.0,29638.0,international,769946,2595600,America
3,193,Footwear,49158,IT,EUR,1.128433,40000.0,False,Venice,43180.0,True,1,1.136525,49075.152523,international,314662,3625358,Europe
4,20,Software,549,US,USD,1.0,1000.0,False,Redmond,549.0,False,0,1.0,549.0,domestic,212500,2592000,America


In [127]:
from sklearn import preprocessing
def encode_features(df):
    features = ['category', 'country', 'currency', 'is_starrable', 'continent']
    df_combined = pd.concat([df])
    
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        df[feature] = le.transform(df[feature])
    return df
    
data = encode_features(df)
data.head()

TypeError: '<' not supported between instances of 'float' and 'str'