# IMPORT DATA

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from bpideep.getdata import getfulldata
import pandas as pd
import ast

In [3]:
# Load the company dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../bpideep/rawdata/complete_df.csv')
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,about,achievements,alexa_rank_chart,angellist_url,app_12_months_growth_percentile,app_12_months_growth_relative,app_12_months_growth_unique,app_3_months_growth_percentile,...,traffic,traffic_summary,twitter_favorites_chart,twitter_followers_chart,twitter_tweets_chart,twitter_url,url,website_url,deep_or_not,target
0,0,0,,[],[],,,,,,...,"{'visitors': '47', 'annual_growth': -70.987654...",0.1K,[],[],[],https://twitter.com/health_cardio,https://app.dealroom.co/companies/healthcardio...,http://healthcardionexion.com,deeptech,1
1,1,1,,[],[],,,,,,...,"{'visitors': '659', 'annual_growth': None, 'so...",0.7K,[],[],[],https://twitter.com/4p_pharma,https://app.dealroom.co/companies/4p_pharma,http://4p-pharma.com,deeptech,1
2,2,2,,[],[],,,,,,...,"{'visitors': '359', 'annual_growth': None, 'so...",0.4K,[],[],[],https://twitter.com/afyrenbiotech,https://app.dealroom.co/companies/afyren,http://afyren.com/,deeptech,1
3,3,3,,[],"[{'date': '2017-02-08', 'value': 25266187}, {'...",,,,,,...,"{'visitors': '1668', 'annual_growth': None, 's...",2K,[],[],[],https://twitter.com/abbelight,https://app.dealroom.co/companies/abbelight,http://www.abbelight.com,deeptech,1
4,4,4,,[],[],,,,,,...,"{'visitors': '371', 'annual_growth': 6.9164265...",0.4K,[],[],[],https://twitter.com/ablacare,https://app.dealroom.co/companies/ablacare,https://ablacare.com,deeptech,1


In [9]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(455, 93)

# DATA CLEANING

## BACKGROUND_TEAM

In [None]:
def background(x):
    """Collect the background names of every team member.

    Parameters
    ----------
    x : str
        String form of a dict with an ``'items'`` list; each item carries a
        ``'backgrounds'`` list of dicts that have a ``'name'`` key.

    Returns
    -------
    list
        All background names, in team order then background order
        (duplicates are kept).
    """
    members = ast.literal_eval(x)['items']
    return [
        entry['name']
        for member in members
        for entry in member['backgrounds']
    ]
# One list of background names per company, parsed from the 'team' column.
data['background_team'] = data['team'].map(background)

## DEGREE_TEAM

In [84]:
def degree(x):
    """Collect the degree name of each team member's first listed university.

    Parameters
    ----------
    x : str
        String form of a dict with an ``'items'`` list; each item has
        ``['universities']['items']``, a list of dicts with a ``'degree'``
        entry that is either ``None`` or a dict with a ``'name'`` key.

    Returns
    -------
    list
        One degree name per member whose first university has a degree.
        Members with no university, or whose first university has no degree,
        contribute nothing. Only the first university is looked at.
    """
    first_degree_names = []
    for member in ast.literal_eval(x)['items']:
        schools = member['universities']['items']
        if not schools:
            continue
        first_degree = schools[0]['degree']
        if first_degree is None:
            continue
        first_degree_names.append(first_degree['name'])
    return first_degree_names
# One list of degree names per company, parsed from the 'team' column.
data['degree_team'] = data['team'].map(degree)

## FUNDING_EMPLOYEES_RATIO

In [22]:
def funding_amounts_employees(data):
    """Return total funding divided by the latest employee count.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``total_funding_source`` and ``employees_latest``.

    Returns
    -------
    pandas.Series
        Element-wise ``total_funding_source / employees_latest``.
    """
    return data['total_funding_source'] / data['employees_latest']
# Store the funding-per-employee ratio as a new column on the loaded frame.
data['funding_employees_ratio'] = funding_amounts_employees(data)

## STRONG AND SUPER FOUNDER

In [None]:
# Encode both founder flags as 1/0. Each flag is mapped from its OWN column:
# the previous version built 'has_super_founder' from 'has_strong_founder',
# which made the two columns identical.
# Values other than True/False (e.g. missing) become NaN.
for founder_flag in ('has_strong_founder', 'has_super_founder'):
    data[founder_flag] = data[founder_flag].map({True: 1, False: 0})


## GROWTH_STAGE_NUM

In [42]:
# List the distinct growth_stage labels (including NaN) before encoding them.
data['growth_stage'].unique()

array(['late growth', 'early growth', 'seed', nan, 'mature'], dtype=object)

In [51]:
def growth_stage_num(data):
    """Translate the ``growth_stage`` labels into numeric ranks.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``growth_stage`` column.

    Returns
    -------
    pandas.Series
        5 for 'mature', 4 for 'late growth', 3 for 'early growth' and 1 for
        'seed' (no label maps to 2). Any other value, including missing ones,
        becomes NaN.
    """
    stage_rank = {
        'seed': 1,
        'early growth': 3,
        'late growth': 4,
        'mature': 5,
    }
    return data['growth_stage'].map(stage_rank)
# Store the numeric growth stage as a new column on the loaded frame.
data['growth_stage_num'] = growth_stage_num(data)

## FACEBOOK/TWITTER/ALEXA

In [53]:
def facebook_like_company_status(data, stage_column='growth_stage_num'):
    """Return, per company, the last Facebook like count divided by its stage number.

    Original author's note (translated from French): not enough Facebook data
    to apply this yet; check whether non-deeptech companies have more
    Facebook data.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``facebook_likes_chart`` and ``stage_column``. A chart
        cell may be a list of dicts with a ``'value'`` key, or the string
        form of such a list (presumably what the CSV load yields - TODO
        confirm).
    stage_column : str, default 'growth_stage_num'
        Name of the numeric column used as the denominator.

    Returns
    -------
    pandas.Series
        ``last chart value / stage number`` for each row; NaN where the chart
        is empty or missing.
    """
    def last_chart_value(chart):
        # Parse string cells; anything that is not a non-empty list has no value.
        if isinstance(chart, str):
            chart = ast.literal_eval(chart)
        if not isinstance(chart, (list, tuple)) or len(chart) == 0:
            return float('nan')
        # Last entry of the chart (assumed to be the most recent - TODO confirm ordering).
        return chart[-1]['value']

    # Work row by row: indexing the Series itself with [-1] is a label lookup,
    # not "the last point of each chart".
    last_likes = data['facebook_likes_chart'].map(last_chart_value)
    return last_likes / data[stage_column]
# Store the Facebook ratio on the loaded frame. The previous version called
# an undefined name (facebook_like_company_stage) and assigned into an
# undefined frame (df).
data['facebook_like_company_stage_ratio'] = facebook_like_company_status(data)
# Preview the three raw social/traffic chart columns.
data[['facebook_likes_chart','twitter_followers_chart','alexa_rank_chart']]

In [None]:
def twitter_followers_company_status(data, stage_column='growth_stage_num'):
    """Return, per company, the last Twitter follower count divided by its stage number.

    Original author's note (translated from French): not enough Twitter data
    to apply this yet.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``twitter_followers_chart`` and ``stage_column``. A chart
        cell may be a list of dicts with a ``'value'`` key, or the string
        form of such a list (presumably what the CSV load yields - TODO
        confirm).
    stage_column : str, default 'growth_stage_num'
        Name of the numeric column used as the denominator. The previous
        version read a ``company_status_num`` column from a global ``df``;
        pass ``stage_column='company_status_num'`` if that column exists.

    Returns
    -------
    pandas.Series
        ``last chart value / stage number`` for each row; NaN where the chart
        is empty or missing.
    """
    def last_chart_value(chart):
        # Parse string cells; anything that is not a non-empty list has no value.
        if isinstance(chart, str):
            chart = ast.literal_eval(chart)
        if not isinstance(chart, (list, tuple)) or len(chart) == 0:
            return float('nan')
        # Last entry of the chart (assumed to be the most recent - TODO confirm ordering).
        return chart[-1]['value']

    # Read everything from the `data` argument, row by row.
    last_followers = data['twitter_followers_chart'].map(last_chart_value)
    return last_followers / data[stage_column]
# Store the Twitter ratio on the loaded frame (`data`); `df` is never defined in this notebook.
data['twitter_followers_company_status_ratio'] = twitter_followers_company_status(data)

In [None]:
def alexa_rank_company_status(data, stage_column='growth_stage_num'):
    """Return, per company, the last Alexa rank divided by its stage number.

    Original author's note (translated from French): not enough Alexa rank
    data to apply this yet.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``alexa_rank_chart`` and ``stage_column``. A chart cell
        may be a list of dicts with a ``'value'`` key, or the string form of
        such a list (presumably what the CSV load yields - TODO confirm).
    stage_column : str, default 'growth_stage_num'
        Name of the numeric column used as the denominator. The previous
        version tried to derive it from ``company_status`` with placeholder
        labels passed to ``Series.apply``, which does not perform a value
        mapping.

    Returns
    -------
    pandas.Series
        ``last chart value / stage number`` for each row; NaN where the chart
        is empty or missing.
    """
    def last_chart_value(chart):
        # Parse string cells; anything that is not a non-empty list has no value.
        if isinstance(chart, str):
            chart = ast.literal_eval(chart)
        if not isinstance(chart, (list, tuple)) or len(chart) == 0:
            return float('nan')
        # Last entry of the chart (assumed to be the most recent - TODO confirm ordering).
        return chart[-1]['value']

    last_rank = data['alexa_rank_chart'].map(last_chart_value)
    return last_rank / data[stage_column]
# Store the Alexa ratio on the loaded frame (`data`); `df` is never defined in this notebook.
data['alexa_rank_company_status_ratio'] = alexa_rank_company_status(data)

## TRAFFIC VISITORS

In [None]:
def traffic_visitors(x):
    """Extract the ``'visitors'`` entry from one traffic record.

    Parameters
    ----------
    x : str
        String form of a dict that has a ``'visitors'`` key.

    Returns
    -------
    The value stored under ``'visitors'``, unchanged (no numeric conversion
    is applied).
    """
    return ast.literal_eval(x)['visitors']
# Apply traffic_visitors to every row. The previous lambda called `traffic`,
# a name that is not defined at notebook level (NameError).
data['traffic_visitors'] = data['traffic'].map(traffic_visitors)

## INDUSTRIES

In [113]:
def industries(x):
    """Return the name of the first industry in a serialized industry list.

    Parameters
    ----------
    x : str
        String form of a list of dicts that each have a ``'name'`` key.

    Returns
    -------
    The first entry's ``'name'``, or ``None`` when the list is empty.
    """
    parsed = ast.literal_eval(x)
    return parsed[0]['name'] if parsed else None
# First industry name per company (None when the company lists no industry).
data['industrie_name'] = data['industries'].map(industries)

In [156]:
# Count distinct 'about' texts per industry name.
# NOTE: this result is not shown, because only the last expression of a cell is displayed.
data[['industrie_name','about']].groupby('industrie_name')['about'].nunique()
# Number of rows with no industry name.
data['industrie_name'].isnull().sum()

60

## ENCODED TAGS (NEW_DF)

### LIST OF ALL TAGS

In [4]:
def collect_unique_tags(data):
    """Return every distinct tag found in ``data['tags']``, in first-seen order.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``tags`` column whose cells are string forms of lists
        of tags.

    Returns
    -------
    list
        Each tag once, ordered by first appearance (rows top to bottom, then
        position within the row).
    """
    # A dict keeps insertion order and gives constant-time membership checks,
    # unlike repeatedly scanning a growing list.
    unique_tags = {}
    for raw_tags in data['tags']:
        unique_tags.update(dict.fromkeys(ast.literal_eval(raw_tags)))
    return list(unique_tags)


# `list_of_tags` stays the name of the resulting list. The helper has its own
# name so that re-running this cell does not try to call the list.
list_of_tags = collect_unique_tags(data)

### ENCODING 

In [5]:
def encoder(data, tags=None):
    """One-hot encode the tags of every company.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``tags`` column whose cells are string forms of lists
        of tags.
    tags : list, optional
        Tags to encode, one output column each, in this order. Defaults to
        the notebook-level ``list_of_tags``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``data`` (index 0..n-1, in row order) and one
        column per tag, holding 1 when the company has the tag and 0
        otherwise.
    """
    if tags is None:
        # Fall back to the notebook-level list of all tags.
        tags = list_of_tags
    rows = []
    for raw_tags in data['tags']:
        # Parse each row once instead of once per tag.
        company_tags = set(ast.literal_eval(raw_tags))
        rows.append([1 if tag in company_tags else 0 for tag in tags])
    # Build the frame in one go rather than growing it row by row.
    return pd.DataFrame(rows, columns=list(tags))
# Build the one-hot encoded tag frame for the loaded data.
encoded_df = encoder(data)

In [6]:
# Display the encoded tag frame.
encoded_df

Unnamed: 0,investing,risk management,biotechnology,sustainable development goals,core sustainable impact,biomass,agritech,alternative protein,7 - affordable and clean energy,13 - climate action,...,office space,breeding,cloud data services,survey,adherent french tech one,fish,server,wildlife,prototype,herbal medicine
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# List the name of every encoded tag column.
encoded_df.columns.tolist()

['investing',
 'risk management',
 'biotechnology',
 'sustainable development goals',
 'core sustainable impact',
 'biomass',
 'agritech',
 'alternative protein',
 '7 - affordable and clean energy',
 '13 - climate action',
 'imaging technology',
 'eic',
 'nanoscopy',
 'microscopy',
 '3 - good health and well-being',
 'non-invasive',
 'health care',
 'medical device',
 'femtech',
 'vivatech2019',
 'medical / healthcare',
 'touchscreen',
 'human computer interaction',
 'gesture recognition',
 'platform',
 'arts & culture',
 'search engine',
 'printing',
 '6 - clean water and sanitation',
 'testing',
 'polymer',
 'repair',
 'medical',
 'chemistry',
 'health diagnostics',
 'life science',
 'neuroscience',
 'power',
 'radio',
 'network',
 'lte',
 '4g',
 'banking',
 'streaming',
 'communication',
 'monitoring',
 'sharing economy',
 'community',
 'mapping',
 'navigation',
 'video',
 'drones',
 'packaging and containers',
 'system engineering',
 'telecommunications',
 'space tech',
 'electroni