In [9]:
# modules we'll use
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Seed NumPy's global random generator up front so that the random row
# preview at the end of this cell is repeatable from run to run.
np.random.seed(0)

# Load the ks-projects CSV into a DataFrame
projects_csv = "../data/ks-projects-201801.csv"
ks_projects_orig = pd.read_csv(projects_csv)

# Preview five randomly chosen rows
ks_projects_orig.sample(n=5)


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
338862,796196901,10G Christmas Tree,Art,Art,USD,2010-12-26,10526.0,2010-12-08 08:44:04,0.0,failed,0,US,0.0,0.0,10526.0
277871,483825010,Gliff,Gaming Hardware,Games,USD,2016-03-28,10000.0,2016-01-28 04:56:18,51.0,failed,5,US,51.0,51.0,10000.0
47000,123916947,STUFFED Food Truck,Food Trucks,Food,USD,2015-01-06,60000.0,2014-11-07 02:24:36,25.0,failed,1,US,25.0,25.0,60000.0
111338,1565733636,NeoExodus Adventure: Origin of Man for Pathfin...,Tabletop Games,Games,USD,2012-05-01,500.0,2012-03-15 01:16:10,585.0,successful,17,US,585.0,585.0,500.0
53743,1273544891,NAPOLEON IN NEW YORK! an original TV Series,Comedy,Film & Video,USD,2016-07-26,25000.0,2016-05-27 00:07:25,25.0,failed,1,US,25.0,25.0,25000.0


In [10]:
# Summarise the freshly loaded frame: for every column, print its non-null
# count and dtype, followed by the overall dtype tally and memory usage.
# Useful here to spot columns with missing values and columns still stored
# as generic `object`.
ks_projects_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                378661 non-null  int64  
 1   name              378657 non-null  object 
 2   category          378661 non-null  object 
 3   main_category     378661 non-null  object 
 4   currency          378661 non-null  object 
 5   deadline          378661 non-null  object 
 6   goal              378661 non-null  float64
 7   launched          378661 non-null  object 
 8   pledged           378661 non-null  float64
 9   state             378661 non-null  object 
 10  backers           378661 non-null  int64  
 11  country           378661 non-null  object 
 12  usd pledged       374864 non-null  float64
 13  usd_pledged_real  378661 non-null  float64
 14  usd_goal_real     378661 non-null  float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


In [None]:
# Turn the two date-like columns into proper datetime columns.
# Each column is overwritten in place on ks_projects_orig.
for date_col in ('deadline', 'launched'):
    ks_projects_orig[date_col] = pd.to_datetime(ks_projects_orig[date_col])

# Column summary after the conversion (dtypes should now show datetimes)
ks_projects_orig.info()

# Preview five randomly chosen rows
ks_projects_orig.sample(n=5)

In [None]:
# Convert the text (categorical) columns to integer codes.
#
# A separate LabelEncoder is fitted and kept for every column, so each
# column's code -> label mapping remains available afterwards, e.g.
#   label_encoders['state'].classes_
#   label_encoders['state'].inverse_transform([0, 1])
# Re-fitting a single shared encoder for all columns would keep only the
# mapping of the last column and make the other codes impossible to decode.
label_encoders = {}
for column in ['category', 'main_category', 'state', 'country']:
    le = LabelEncoder()
    ks_projects_orig[column] = le.fit_transform(ks_projects_orig[column])
    label_encoders[column] = le
# `le` stays bound to the encoder fitted on 'country', the same state the
# name had after the original sequence of fit_transform calls.

# get info about DataFrame columns
ks_projects_orig.info()

# look at a few rows of the ks_projects file
ks_projects_orig.sample(5)

# Parsing Dates

# Show the leading values of the 'deadline' column before parsing.
# print() is required because this is not the last expression of the cell.
deadline_values = ks_projects_orig['deadline']
print(deadline_values.head())

# Store the parsed dates (year-month-day) in a new column, deadline_parsed
ks_projects_orig['deadline_parsed'] = pd.to_datetime(
    deadline_values,
    format="%Y-%m-%d",
)

# Show the leading values of the new column
ks_projects_orig['deadline_parsed'].head()

In [None]:
# Create a new column, launched_parsed, with the parsed launch timestamps.
#
# 'launched' values carry a time of day (e.g. "2010-12-08 08:44:04"), so the
# format has to include hours, minutes and seconds. A date-only format
# ("%Y-%m-%d") does not match the raw strings and only goes unnoticed when
# the column has already been converted to datetime by an earlier cell.
ks_projects_orig['launched_parsed'] = pd.to_datetime(
    ks_projects_orig['launched'],
    format="%Y-%m-%d %H:%M:%S",
)

# print the first few rows
ks_projects_orig['launched_parsed'].head()

In [None]:
# Create a new column, duration: the number of whole days between the
# launch timestamp and the deadline.
#
# The subtraction yields a timedelta column; `.dt.days` extracts its day
# component as an integer. (Casting with .astype('timedelta64[D]') is
# rejected by pandas >= 2.0, which only supports s/ms/us/ns resolutions.)
campaign_length = ks_projects_orig['deadline_parsed'] - ks_projects_orig['launched_parsed']
ks_projects_orig['duration'] = campaign_length.dt.days

# print the first few rows
ks_projects_orig['duration'].head()

In [None]:
# Columns left out of the reduced frame: identifiers and free text, the
# non-"real" money columns, and the date columns that 'duration' was
# derived from.
redundant_columns = [
    'ID', 'name', 'currency', 'goal', 'pledged', 'usd pledged',
    'deadline', 'deadline_parsed', 'launched', 'launched_parsed',
]

# Build the reduced frame; ks_projects_orig itself is left unchanged
ks_projects_reduced = ks_projects_orig.drop(columns=redundant_columns)

# Column summary of the reduced frame
ks_projects_reduced.info()

# Preview five randomly chosen rows
ks_projects_reduced.sample(n=5)

In [None]:
# Convert the reduced DataFrame to a 2-D NumPy array (rows x columns)
ks_projects_numpy = ks_projects_reduced.to_numpy()

# Show the first 5 rows. The slice end is exclusive, so [:5] covers rows 0-4;
# the earlier [0:4] returned only four rows.
ks_projects_numpy[:5, :]

