<a href="https://colab.research.google.com/github/photostachio/Unit2_Build_Startups/blob/master/Startups_Build_Week_Unit_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%%capture
import sys

if 'google.colab' in sys.modules:
    # Install packages in Colab
    !pip install category_encoders==2.*
    !pip install eli5
    !pip install pandas-profiling==2.*
    !pip install pdpbox
    !pip install shap

In [0]:
%matplotlib inline
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
# Display every column of a DataFrame. The full option name is used because
# partial option names can become ambiguous when pandas adds new options.
pd.set_option('display.max_columns', None)

# Location of the input CSV.
# NOTE(review): nothing visible in this notebook mounts Google Drive; presumably
# it is mounted manually before this cell runs — confirm.
DATA_PATH = '/content/drive/My Drive/investments_VC.csv'

df = pd.read_csv(DATA_PATH, encoding='unicode_escape')
df.head()

In [0]:
# Distinct values found in the 'status' column of the freshly loaded frame.
df['status'].unique()

In [0]:
# Discard every row that has a missing value in any column.
# Note that `df` is rebound, so the unfiltered frame is only recoverable by re-reading the CSV.
df = df.dropna(axis=0, how='any')

In [0]:
# Distinct 'status' values again, after rows with missing data were dropped.
df['status'].unique()

In [0]:
# Encode 'status' as a binary target: 1 = acquired or operating, 0 = closed.
# Series.map turns any label that is not a key of the mapping into NaN, so
# running this cell a second time (on the already-encoded 0/1 values) would
# blank the whole column — re-run from the data-loading cell instead.
status_to_target = {'acquired': 1, 'operating': 1, 'closed': 0}
df['status'] = df['status'].map(status_to_target)
df['status']

In [0]:
# Relative frequency of each value in the 'status' column.
df['status'].value_counts(normalize=True)

In [0]:
# Mean of the 'status' column; for a 0/1 column this equals the fraction of rows labelled 1.
df['status'].mean()

In [0]:
# 'initial_funding' is the row-wise sum of the six funding-source columns below,
# accumulated left to right in the listed order.
funding_sources = ['seed', 'venture', 'equity_crowdfunding',
                   'private_equity', 'angel', 'product_crowdfunding']

funding_total = df[funding_sources[0]]
for source in funding_sources[1:]:
    funding_total = funding_total + df[source]

df['initial_funding'] = funding_total

In [0]:
# Bar chart of how many rows fall into each 'status' value.
sns.countplot(
    data=df,
    x='status',
    palette='RdBu_r',
)

In [0]:
# Absolute number of rows for each value in the 'status' column.
df['status'].value_counts()

In [0]:
# Summary statistics of the frame as produced by DataFrame.describe().
df.describe()


In [0]:
# describe() summary of the 'category_list' column.
df['category_list'].describe()

In [0]:
# describe() summary of the market column. The column name used here literally
# contains a leading and a trailing space (' market ').
df[' market '].describe()

In [0]:
# describe() summary of the 'founded_year' column.
df['founded_year'].describe()

In [0]:
def year_group(row):
    """Return the founding-era label for one row.

    The label is derived from ``row['founded_year']`` only:

    * 1900 <= year <= 1925  -> ``'less_than_1925'``
    * 1925 <  year <= 1950  -> ``'1925_1950'``
    * 1950 <  year <= 1975  -> ``'1950_1975'``
    * 1975 <  year <= 2000  -> ``'1975_2000'``
    * year > 2000           -> ``'2000_2014'``
    * anything else         -> ``''``

    "Anything else" covers years before 1900 and NaN (every comparison with
    NaN is false), so despite its name ``'less_than_1925'`` does not include
    years earlier than 1900.

    Args:
        row: Any mapping-like object supporting ``row['founded_year']``,
            typically the Series that ``DataFrame.apply(..., axis=1)`` passes in.

    Returns:
        str: One of the labels listed above.

    The row is only read, never written: pandas does not support functions
    that mutate the object handed to ``apply``.
    """
    year = row['founded_year']

    if 1900 <= year <= 1925:
        return 'less_than_1925'
    if 1925 < year <= 1950:
        return '1925_1950'
    if 1950 < year <= 1975:
        return '1950_1975'
    if 1975 < year <= 2000:
        return '1975_2000'
    if year > 2000:
        return '2000_2014'
    return ''

# Label every row with its founding-era bucket; axis='columns' hands year_group one row at a time.
df['founded_year_group'] = df.apply(year_group, axis='columns')

In [0]:
# Number of companies per founding-era bucket, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=df.dropna(), x='founded_year_group', ax=ax)
plt.show()

In [0]:
# Keep only companies founded in 2000 or later. The .copy() makes df_2000 an
# independent frame, so the column assignment below writes to df_2000 itself
# rather than to a slice of df (chained assignment).
df_2000 = df[df['founded_year'] >= 2000].copy()
# Cast the year to int for the plot below.
df_2000['founded_year'] = df_2000['founded_year'].astype(int)

# Number of companies founded per year.
plt.figure(figsize = (16,7))
sns.countplot(x = 'founded_year', data = df_2000)
plt.show()

In [0]:
# Distinct country codes among the rows kept in df_2000.
df_2000['country_code'].unique()

In [0]:
# Restrict the post-2000 frame to companies whose country code is 'USA'.
df_2000_USA = df_2000[(df_2000['country_code'] =='USA')]

# Bar order: the (up to) 53 most frequent state codes *of the plotted frame*.
# Taking the order from the unfiltered `df` would rank states by counts that
# include other countries and founding years, and could reserve bars for
# codes that do not occur in the US subset at all.
state_order = df_2000_USA['state_code'].value_counts().iloc[:53].index

plt.figure(figsize=(16,7))
g = sns.countplot(x ='state_code', data = df_2000_USA, order=state_order)
plt.xticks(rotation=30)
plt.show()

In [0]:
# Company counts for the 50 most frequent markets in df_2000, most frequent first.
top_markets = df_2000[' market '].value_counts().iloc[:50].index

fig, ax = plt.subplots(figsize=(20, 7))
sns.countplot(data=df_2000, x=' market ', order=top_markets, ax=ax)
plt.xticks(rotation=75)
plt.show()

In [0]:
# Column dtypes of the US subset.
df_2000_USA.dtypes

In [0]:
# Columns carried forward into modelling: the target ('status'), the location
# and market categoricals, and the funding figures.
columns = [
    ' market ',
    'initial_funding',
    'status',
    'country_code',
    'state_code',
    'city',
    'funding_rounds',
    'seed',
    'venture',
    'equity_crowdfunding',
    'private_equity',
    'angel',
    'product_crowdfunding',
]

df_2000_USA = df_2000_USA.loc[:, columns]

In [0]:
# Display the 'state_code' column of the US subset.
df_2000_USA['state_code']

In [0]:
# Column dtypes of the US subset at this point in the notebook.
df_2000_USA.dtypes

In [0]:
# One-hot encode each categorical column and append the indicator columns to
# the frame. The first level of every column is dropped (drop_first=True) and
# the source column name is used as the prefix. The original categorical
# columns are left in place here.
for categorical_column in (' market ', 'country_code', 'state_code', 'city'):
    indicator_columns = pd.get_dummies(
        df_2000_USA[categorical_column],
        prefix=categorical_column,
        drop_first=True,
    )
    df_2000_USA = pd.concat([df_2000_USA, indicator_columns], axis=1)


In [0]:
# Remove the raw categorical columns now that their indicator columns exist.
encoded_sources = [' market ', 'country_code', 'state_code', 'city']
df_2000_USA = df_2000_USA.drop(columns=encoded_sources)

In [0]:
# Split into train and test sets: 30% of the rows are held out, with a fixed seed.
from sklearn.model_selection import train_test_split

features = df_2000_USA.drop(columns='status')
target = df_2000_USA['status']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.30,
    random_state=101,
)

In [0]:
# Relative frequency of each target value within the training split.
y_train.value_counts(normalize=True)

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Train the model.
# Despite its name, `logmodel` is a random forest; the name is kept because
# later cells refer to it.
logmodel = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    max_depth=15,
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was removed
    # in scikit-learn 1.3 and now raises an error.
    max_features='sqrt',
    max_leaf_nodes=None,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0,
    class_weight='balanced',
    n_jobs=1,
    oob_score=False,
    random_state=42,
    verbose=0,
    warm_start=False,
)
logmodel.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
predictions = logmodel.predict(X_test)

In [0]:
#Model Evaluation
from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(y_test,predictions))

# Split the test rows into the four confusion-matrix buckets.
predicted_positive = predictions == 1
predicted_negative = predictions == 0
is_correct = y_test == predictions

truePos = X_test[predicted_positive & is_correct]
falsePos = X_test[predicted_positive & ~is_correct]
trueNeg = X_test[predicted_negative & is_correct]
falseNeg = X_test[predicted_negative & ~is_correct]

TP = truePos.shape[0]
FP = falsePos.shape[0]
TN = trueNeg.shape[0]
FN = falseNeg.shape[0]

accuracy = float(TP + TN)/float(TP + TN + FP + FN)
print('Accuracy: '+str(accuracy))

# ROC AUC must be computed from a continuous score. Feeding it the hard 0/1
# predictions collapses the ROC curve to a single operating point, so the
# predicted probability of class 1 is used instead.
positive_column = list(logmodel.classes_).index(1)
positive_scores = logmodel.predict_proba(X_test)[:, positive_column]
print('ROC AUC SCORE',roc_auc_score(y_test, positive_scores))

In [0]:
import pickle

# Serialise the fitted model to 'logmodel.pkl' in the working directory,
# using pickle protocol 2.
with open('logmodel.pkl', mode='wb') as model_file:
    pickle.dump(logmodel, model_file, protocol=2)

In [0]:
# Map every feature column name (everything except the target) to its
# zero-based position in the feature matrix.
cat = df_2000_USA.drop(columns='status')
index_dict = {name: position for position, name in enumerate(cat.columns)}

# Write the name -> position mapping to the file 'cat' (pickle protocol 2).
with open('cat', mode='wb') as index_file:
    pickle.dump(index_dict, index_file, protocol=2)