# Exploration

## Setup, EDA, Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#import seaborn as sns
#from datetime import datetime

#from sklearn.model_selection import train_test_split, GridSearchCV, \
#cross_val_score
#from sklearn.impute import SimpleImputer
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
#BaggingClassifier, ExtraTreesClassifier
#from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import accuracy_score, f1_score, recall_score, \
#precision_score, confusion_matrix, classification_report, roc_curve, auc, \
#average_precision_score
#from sklearn.preprocessing import StandardScaler, LabelEncoder, \
#MinMaxScaler, OneHotEncoder
#from sklearn.pipeline import Pipeline
#from xgboost import XGBClassifier
#from imblearn.over_sampling import SMOTE

#import warnings
#warnings.filterwarnings('ignore')

### Load in Data

In [None]:
# Read the CSV at ../data/investments_VC_cleaned.csv into `df`.
df = pd.read_csv('../data/investments_VC_cleaned.csv', encoding="unicode_escape")
# The cells that follow operate on `data`; bind it here so they do not raise
# NameError when the notebook is executed from top to bottom.
data = df

### Preprocessing & Feature Engineering

In [None]:
# Show the column labels of the frame.
data.columns

Index(['name', 'market', 'funding_total_usd', 'status', 'country_code',
       'state_code', 'region', 'city', 'funding_rounds', 'founded_at',
       'founded_month', 'founded_year', 'first_funding_at', 'last_funding_at',
       'seed', 'venture', 'equity_crowdfunding', 'undisclosed',
       'convertible_note', 'debt_financing', 'angel', 'grant',
       'private_equity', 'product_crowdfunding', 'round_A', 'round_B',
       'round_C', 'round_D', 'round_E', 'round_F', 'round_G', 'round_H'],
      dtype='object')

#### Missing Values

* I am removing rows with missing status or company name. 
* I am also dropping rows with a missing founded_year: this feature has high importance in many of the models, so I would prefer not to impute it at the risk of skewing the model. I tried both dropping these rows and keeping them with the median year imputed. The proportion of companies in each status category and the model results are fairly similar either way, so I am comfortable dropping these rows.

In [None]:
# Discard every row lacking a status, a company name or a founding year,
# then tally the missing values that remain in each column.
data = data.dropna(
    subset=[
        'status',
        'name',
        'founded_year',
    ]
)
data.isna().sum()

name                        0
market                   1801
funding_total_usd           0
status                      0
country_code             2936
state_code              13332
region                   2936
city                     3357
funding_rounds              0
founded_at                  0
founded_month               0
founded_year                0
first_funding_at            0
last_funding_at             0
seed                        0
venture                     0
equity_crowdfunding         0
undisclosed                 0
convertible_note            0
debt_financing              0
angel                       0
grant                       0
private_equity              0
product_crowdfunding        0
round_A                     0
round_B                     0
round_C                     0
round_D                     0
round_E                     0
round_F                     0
round_G                     0
round_H                     0
dtype: int64

In [None]:
# Number of rows currently in the frame.
data.shape[0]

37563

In [None]:
# Categorical columns: replace missing entries with the placeholder 'other'.
data = data.fillna(
    value=dict.fromkeys(
        ['market', 'country_code', 'region', 'city', 'state_code'],
        'other',
    )
)

### Exploratory Analysis - Full Dataset

In [None]:
# Point `data` at the frame loaded into `df`. This is an alias, not a copy.
# NOTE(review): this rebinding drops whatever `data` referred to before (e.g.
# the dropna/fillna results from the preprocessing cells) - confirm intended.
data = df

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would add a stray "None" line.
data.info()
# Preview the first rows.
data.head()

In [None]:
# Descriptive statistics for `df` (the same object `data` was bound to above).
df.describe()

In [None]:
# Per-column count of missing values.
data.isna().sum()

In [None]:
# Peek at the first rows of the founded_at column.
data['founded_at'].head()

In [None]:
# Frequency of each region value.
data['region'].value_counts()

In [None]:
# Frequency of each city value.
data['city'].value_counts()

In [None]:
# Frequency of each state_code value.
data['state_code'].value_counts()

In [None]:
# Frequency of each country_code value.
data['country_code'].value_counts()

In [None]:
# Inspecting the market feature - lots of categories.
# dropna=False keeps missing values as one category of their own.
print("Number of unique markets: ", data['market'].nunique(dropna=False))
# The threshold is inclusive (>= 200), so the label says "at least".
print("Markets with at least 200 companies: ",
      (data['market'].value_counts() >= 200).sum())

In [None]:
# Horizontal bar chart of the first 30 entries of the market value counts,
# drawn on a 10x10 figure.
fig, ax = plt.subplots(figsize=(10, 10))
data['market'].value_counts().head(30).plot.barh(ax=ax)

In [None]:
# Same inspection for the region feature.
# dropna=False keeps missing values as one category of their own.
print("Number of unique regions: ", data['region'].nunique(dropna=False))
# The threshold is inclusive (>= 200), so the label says "at least".
print("Regions with at least 200 companies: ",
      (data['region'].value_counts() >= 200).sum())

In [None]:
# Horizontal bar chart of the first 30 entries of the region value counts,
# drawn on a 10x10 figure.
fig, ax = plt.subplots(figsize=(10, 10))
data['region'].value_counts().head(30).plot.barh(ax=ax)

In [None]:
# There are some very skewed columns in the funding area; equity_crowdfunding
# is shown here as a 40-bin histogram.
data.hist(column='equity_crowdfunding', bins=40)