In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import re
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import preprocessing
from textblob import TextBlob

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries found in the ../input directory
print(os.listdir("../input"))

In [None]:
# Load ../input/train.csv into the DataFrame that the rest of the notebook works on
train_data = pd.read_csv("../input/train.csv")

In [None]:
# Overview: preview the first rows of the training data
train_data.head()

In [None]:
# Column names, dtypes and non-null counts of the training data
train_data.info()

In [None]:
# Report, for every column, how many of its entries are null
null_counts = train_data.isnull().sum()
for column_name, n_missing in null_counts.items():
    print(column_name, n_missing)

In [None]:
# Fill in missing text fields with a single-space placeholder.
# Series.fillna returns a NEW Series and leaves the original untouched, so the
# result has to be assigned back to the column; a bare `.fillna(" ")` call only
# displays the filled copy and keeps the NaNs in train_data.
for text_column in ['name', 'desc']:
    train_data[text_column] = train_data[text_column].fillna(" ")

In [None]:
# Convert UNIX epoch seconds to "YYYY-MM-DD HH:MM:SS" strings.
#
# pd.to_datetime(..., unit='s') interprets the values as seconds since the epoch
# in UTC, so the result is the same on every machine. datetime.fromtimestamp()
# without a tz argument converts to the *local* timezone of whichever machine
# runs the notebook, which makes the derived dates non-reproducible.
# The conversion is also vectorised instead of a per-row Python lambda.
date_column = ['deadline', 'state_changed_at', 'created_at', 'launched_at']
for col in date_column:
    # Only convert columns that still hold numeric timestamps, so re-running
    # this cell after a successful conversion is a no-op instead of an error.
    if pd.api.types.is_numeric_dtype(train_data[col]):
        train_data[col] = (
            pd.to_datetime(train_data[col], unit='s')
            .dt.strftime("%Y-%m-%d %H:%M:%S")
        )

**Preliminary exploration**
- How many projects got funded successfully?
- How much is the goal? Can we/should we decompose the goal into bins or use numerical value?
- Are there any correlations between the currency and the status of funded projects? (ie. Is USD more favorable?)
- Are there any correlations between the country and the status of funded projects? (ie. Is a project in the US more likely to be funded?)
- Distribution of funded/not funded projects over years/months?


In [None]:
# Distribution of funded projects: number of rows per final_status value
sns.countplot(x='final_status',data=train_data)
plt.show()

Most goals are below 20,000,000, but a few extreme outliers distort the distribution of the goal feature. These outliers might need to be removed. After removing outliers, a few observations:
1. Most projects have a very small goal
2. Most funded projects have a goal of less than 20,000

In [None]:
# Distribution of goals over the full, untrimmed training data, using 5 bins
# NOTE(review): sns.distplot is reported as deprecated in newer seaborn releases —
# confirm against the installed seaborn version before upgrading the environment.
sns.distplot(train_data['goal'], bins=5)
plt.show()

In [None]:
# Summary statistics of the goal column
train_data['goal'].describe()

In [None]:
# Remove the extreme upper tail of 'goal' so the histograms can be re-plotted.
# P holds the [0th, 95th] percentiles, i.e. [minimum, 95th-percentile cutoff].
P = np.percentile(train_data['goal'], [0, 95])
# The lower bound is inclusive: the 0th percentile IS the minimum, so a strict
# '>' would silently drop every project whose goal equals the minimum even
# though those rows are not outliers. The upper bound stays exclusive, which
# discards everything at or above the 95th percentile.
new_goal = train_data[(train_data['goal'] >= P[0]) & (train_data['goal'] < P[1])]

In [None]:
# Goal distribution again, this time on the percentile-trimmed subset new_goal
sns.distplot(new_goal['goal'], bins=5)
plt.show()

In [None]:
# One 40-bin goal histogram per final_status value, laid out as columns
g = sns.FacetGrid(new_goal, col='final_status')
g.map(plt.hist, 'goal', bins = 40)

It does look like most of the projects that got fully funded are the ones asking for less than 20,000. Next, we will see whether the goal feature has any multivariate correlation with country or currency.

In [None]:
# Grid of 40-bin goal histograms: one row per country, one column per final_status value
g = sns.FacetGrid(new_goal, col="final_status",  row="country")
g = g.map(plt.hist, "goal", bins = 40)

In [None]:
# Same country-by-status grid of goal histograms, restricted to rows whose country is not 'US'
non_us = new_goal.loc[new_goal['country'].ne('US')]
g = sns.FacetGrid(data=non_us, row="country", col="final_status")
g = g.map(plt.hist, "goal", bins=40)

**disable_communication** does not look like a good feature to include, if it should be included at all. Most of the projects in this dataset are communication-disabled. However, a chi-square test needs to be done before dropping it: it might be because of disabled communication that a project is not funded.

In [None]:
# Effect of disable_communication, shown on two panels that share the y axis:
#   left  - row counts per disable_communication value, split by final_status
#   right - overall row counts per final_status value, for reference
figure, axes = plt.subplots(nrows=1, ncols=2, sharey=True)
sns.countplot(data=train_data, x='disable_communication', hue='final_status', ax=axes[0])
sns.countplot(data=train_data, x='final_status', ax=axes[1])
plt.show()

In [None]:
# Summary statistics of the disable_communication column
train_data['disable_communication'].describe()

It does look like the country and currency of a project impact whether or not it gets funded. Since they are nominal data, we will need to convert them into a one-hot encoding.

In [None]:
# Two stacked panels: row counts per country (top) and per currency (bottom),
# each split by final_status
figure, axes = plt.subplots(2)
for panel, feature in zip(axes, ['country', 'currency']):
    sns.countplot(data=train_data, x=feature, hue='final_status', ax=panel)
plt.show()

In [None]:
# Two stacked panels: plain row counts per country (top) and per currency (bottom)
figure, axes = plt.subplots(2)
for panel, feature in zip(axes, ['country', 'currency']):
    sns.countplot(data=train_data, x=feature, ax=panel)
plt.show()

From these plots and from the above plots of goal conditioned on currency, we can safely remove SEK, NOK, SKK because they don't add to the prediction. Removing these data points also makes one-hot encoding cleaner. 

Next, we investigate the **number of backers** and its relationship to project status. 

In [None]:
# Understand the distribution of backers using box-plot (full, untrimmed training data)
ax = sns.boxplot(x=train_data["backers_count"])


In [None]:
# Remove the extreme upper tail of 'backers_count' so the plots can be redrawn.
# P_backer holds the [0th, 95th] percentiles, i.e. [minimum, 95th-percentile cutoff].
P_backer = np.percentile(train_data['backers_count'], [0, 95])
# The lower bound is inclusive: the 0th percentile IS the minimum, so a strict
# '>' would silently discard every project sitting at the minimum backer count
# (presumably the zero-backer projects — verify against the data), which are
# not outliers and matter for the funded / not-funded comparison.
# The upper bound stays exclusive to cut everything at or above the 95th percentile.
new_backers = train_data[(train_data['backers_count'] >= P_backer[0]) & (train_data['backers_count'] < P_backer[1])]

In [None]:
# Box-plot of backers_count again, on the percentile-trimmed subset new_backers
ax = sns.boxplot(x=new_backers["backers_count"])

In [None]:
# (rows, columns) of the trimmed subset new_backers
new_backers.shape

Next, we investigate whether there is any correlation between the number of backers and the goals, countries, and finally the project status

In [None]:
# Number of projects at each backers_count value in new_backers, split by final_status
sns.countplot(x='backers_count',data=new_backers, hue='final_status')
# Pass empty tick positions and labels to hide the x-axis ticks
# (presumably because one tick per distinct backer count would be unreadable)
plt.xticks([],[])
plt.show()