In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [79]:
# Reading in
df = pd.read_csv("raw_data.csv")

In [80]:
# Cleaning

# Dropping columns
cols_to_drop = ['#', 'Score', 'Start Date (UTC)', 'Submit Date (UTC)', 
                'Network ID', "Ok, lets start with the basics. What's your first name?", 
               "What's your email address?"]
df.drop(cols_to_drop, axis=1, inplace=True)

In [81]:
# Cleaning NaNs
clean = df.fillna({"Which state are you based in?": "Unknown",
                  "Which industry is your startup in?": "Unknown",
                  "Where are you based?": "Unknown"}).fillna(0)

In [87]:
clean['Score_Clean'] = clean['SCORE'].map({'MVP': 3, 'Proto': 2, 'Napkin': 1})
clean.drop(['SCORE'], axis=1)

Unnamed: 0,Where are you based?,Which state are you based in?,Which industry is your startup in?,How clear and well articulated is *the problem* that you're solving with your offering?,How much does your solution easily demonstrate a clear customer value proposition? (zero means 'they don't get it' and 5 means 'they repeatedly come back for more'),There a large and still growing market for your solution (zero means 'not at all' and 5 means 'huge and still growing rapidly'),"How much does your current business model/pricing strategy encourage repeat purchases/subscription vs. one-time transactional revenue generation? (zero means , 'not at all' and 5 means 'I have a full subscription models)",How unique is your offering to what exists in the market? (zero means 'not at all' and 5 means 'no direct competitors'),How soon do you envision revenue being generated? (zero means 'revenue? what's that?' and 5 means 'already generating and growing'),How easy is it for a customer to purchase your offering (zero means 'long sales cycle & lots of steps to get to payment' and 5 means 'buy now & buy often'),...,"How detailed is your 3 year, bottom up financial forecast? (zero means 'oh, we need to do a 3 year forecast?' and 5 means 'what revenue stream or cost category would you like to discuss right now?')","How long will your revenue or capital raised last given your monthly burn rate? (zero means 'OMG, we need a sale or an investor NOW' and 5 means 'we should be good for at least 15-18 months')","How much traction have you had with prospective investors ? (zero means 'we've got lots of irons in the fire but no investment in our bank' and 5 means 'we've raised from some key, influential investors who have helped us secure others')","How comfortable are prospective investors with the current valuation? (zero means 'no one's invested as they don't get how big we're going to be' and 5 means 'we did a crawl, walk, run raise where we started low, built some traction, achieved some milestones and then raised the next round')",How ready are you with your due diligence documents? (zero means 'due who?' and 5 means 'just log in to our online deal room to access whatever you need to see'),How well do you know potential acquirers and their history of acquiring companies in your space? (zero means 'not at all' and 5 means 'we're tracking all M&A activity in our space'),"How prepared are you to grow the business organically if you can't get acquired or IPO? (zero means 'we better get acquired or I'm not sure investors will make any money' and 5 means 'heck, we're ready to become the acquirer and consolidate the industry ourselves')",How well connected are you in your industry? (zero means 'I'm new to the space but a bit of an introvert' and 5 means 'I'm a natural connector and my Rolodex is overflowing'),How well do you communicate with visuals vs text? (zero means 'I'm a terrible artist!' and 5 means 'I'm a strong believer that a picture says a thousand words'),Score_Clean
0,Israel,Unknown,Marketing & Advertising,5.0,5.0,5,5.0,4,5,1,...,0.0,3.0,0.0,1.0,1.0,0.0,5.0,5.0,4.0,3
1,India,Unknown,Retail,3.0,2.0,4,3.0,3,1,3,...,3.0,1.0,2.0,1.0,4.0,3.0,1.0,2.0,2.0,2
2,United States of America,North Carolina,Hospitality,5.0,3.0,4,3.0,4,2,4,...,1.0,0.0,2.0,0.0,2.0,0.0,4.0,3.0,5.0,2
3,Singapore,Unknown,"Health, Wellness & Fitness",4.0,4.0,5,5.0,3,3,5,...,4.0,1.0,0.0,2.0,3.0,0.0,5.0,4.0,5.0,2
4,India,Unknown,Financial Services,5.0,5.0,5,4.0,3,5,4,...,5.0,2.0,0.0,4.0,3.0,3.0,5.0,4.0,4.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,United States of America,Pennsylvania,Internet,5.0,4.0,5,0.0,4,3,4,...,0.0,0.0,0.0,0.0,1.0,0.0,4.0,3.0,4.0,1
230,United States of America,California,Hospital & Health Care,5.0,5.0,5,5.0,4,5,3,...,5.0,3.0,0.0,1.0,3.0,3.0,5.0,5.0,5.0,2
231,United States of America,California,Education Management,5.0,5.0,5,3.0,4,4,4,...,5.0,1.0,2.0,2.0,3.0,2.0,4.0,4.0,4.0,3
232,Singapore,Unknown,Staffing & Recruiting,5.0,4.0,3,2.0,2,5,1,...,3.0,2.0,2.0,2.0,1.0,2.0,3.0,4.0,1.0,2


In [117]:
# Categorical Variables
cleaned = pd.get_dummies(clean)
cleaned

Unnamed: 0,How clear and well articulated is *the problem* that you're solving with your offering?,How much does your solution easily demonstrate a clear customer value proposition? (zero means 'they don't get it' and 5 means 'they repeatedly come back for more'),There a large and still growing market for your solution (zero means 'not at all' and 5 means 'huge and still growing rapidly'),"How much does your current business model/pricing strategy encourage repeat purchases/subscription vs. one-time transactional revenue generation? (zero means , 'not at all' and 5 means 'I have a full subscription models)",How unique is your offering to what exists in the market? (zero means 'not at all' and 5 means 'no direct competitors'),How soon do you envision revenue being generated? (zero means 'revenue? what's that?' and 5 means 'already generating and growing'),How easy is it for a customer to purchase your offering (zero means 'long sales cycle & lots of steps to get to payment' and 5 means 'buy now & buy often'),How easy is it to identify and to reach your paying customers? (zero means 'i think I know who they are and all I have to do is go knocking on their doors' and 5 means 'highly leveraged means to get to them'),"How much traction have you demonstrated to date? (zero means 'we haven't gotten out of the gate yet' and 5 means 'we have lots of engaged users, strong user growth, some (& growing) monthly recurring revenue, formalized partnerships)","How much have you done with respect to intellectual property? (zero means 'we don't need no stinkin' IP' and 5 means 'we have filed trademarks, copyrights &/or patents and are ready to defend them')",...,Which industry is your startup in?_Renewables & Environment,Which industry is your startup in?_Restaurants,Which industry is your startup in?_Retail,Which industry is your startup in?_Sporting Goods,Which industry is your startup in?_Sports,Which industry is your startup in?_Staffing & Recruiting,Which industry is your startup in?_Unknown,Which industry is your startup in?_Veterinary,Which industry is your startup in?_Wholesale,Which industry is your startup in?_Wine & Spirits
0,5.0,5.0,5,5.0,4,5,1,4.0,5.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,3.0,2.0,4,3.0,3,1,3,3.0,3.0,3.0,...,0,0,1,0,0,0,0,0,0,0
2,5.0,3.0,4,3.0,4,2,4,3.0,2.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,4.0,4.0,5,5.0,3,3,5,4.0,4.0,2.0,...,0,0,0,0,0,0,0,0,0,0
4,5.0,5.0,5,4.0,3,5,4,0.0,3.0,3.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,5.0,4.0,5,0.0,4,3,4,3.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
230,5.0,5.0,5,5.0,4,5,3,2.0,2.0,5.0,...,0,0,0,0,0,0,0,0,0,0
231,5.0,5.0,5,3.0,4,4,4,5.0,5.0,0.0,...,0,0,0,0,0,0,0,0,0,0
232,5.0,4.0,3,2.0,2,5,1,1.0,2.0,1.0,...,0,0,0,0,0,1,0,0,0,0


In [118]:
# ML Stuffs
X = cleaned.drop(['Score_Clean'], axis=1)
y = cleaned["Score_Clean"]

In [119]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LogisticRegression(max_iter = 1000)
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [120]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 1.00


In [121]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[28  0  0]
 [ 0 41  0]
 [ 0  0  2]]


In [122]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        28
           2       1.00      1.00      1.00        41
           3       1.00      1.00      1.00         2

    accuracy                           1.00        71
   macro avg       1.00      1.00      1.00        71
weighted avg       1.00      1.00      1.00        71

