# **Data science internship assignement - Dealroom.co**
*Done by Oc√©ane Salmeron, December 2020*

## 1. Import libraries

In [268]:
import pandas as pd
import numpy as np

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

## 2. Import Data

In [269]:
file = 'Data/Data_Science_Internship_Assignment.xlsx'
data = pd.ExcelFile(file)
print(data.sheet_names)
df = data.parse('Data')

['Instructions', 'Data', 'Count', 'Scraping results']


Let's take a look at our raw data, and get more information about it with .shape and .info()

In [270]:
df.head(2)

Unnamed: 0,NAME,WEBSITE,TAGLINE,HQ REGION,HQ COUNTRY,HQ CITY,TAGS,LAUNCH DATE,GROWTH STAGE,LINKEDIN,TYPE
0,63336,http://63336.com,Ai-enabled q&a service that answers to various...,Europe,United Kingdom,London,mobile,"2002, September",late growth stage,,
1,@Futsal,http://futsaluk.net,Educational courses through the medium of spor...,Europe,United Kingdom,Birmingham,,2008,early growth stage,https://www.linkedin.com/company/-futsal-group...,


In [271]:
df.shape

(11582, 11)

In [272]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11582 entries, 0 to 11581
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   NAME          11582 non-null  object 
 1   WEBSITE       11582 non-null  object 
 2   TAGLINE       11453 non-null  object 
 3   HQ REGION     11582 non-null  object 
 4   HQ COUNTRY    11582 non-null  object 
 5   HQ CITY       10908 non-null  object 
 6   TAGS          9593 non-null   object 
 7   LAUNCH DATE   11582 non-null  object 
 8   GROWTH STAGE  8584 non-null   object 
 9   LINKEDIN      8596 non-null   object 
 10  TYPE          0 non-null      float64
dtypes: float64(1), object(10)
memory usage: 995.5+ KB


Our data has 11582 entries and 11 columns.

In [273]:
df.isna().sum()

NAME                0
WEBSITE             0
TAGLINE           129
HQ REGION           0
HQ COUNTRY          0
HQ CITY           674
TAGS             1989
LAUNCH DATE         0
GROWTH STAGE     2998
LINKEDIN         2986
TYPE            11582
dtype: int64

## 2. Cleaning the Data

First let's drop the column that are not relevant to our task.

In [274]:
df = df.drop(['WEBSITE', 'HQ REGION', 'HQ COUNTRY', 'HQ CITY', 'GROWTH STAGE', 'LINKEDIN'], axis=1)

In [275]:
df['LAUNCH DATE'] = pd.to_datetime(df['LAUNCH DATE'].astype(str), errors="coerce").dt.year

In [276]:
df.head(2)

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,TYPE
0,63336,Ai-enabled q&a service that answers to various...,mobile,2002.0,
1,@Futsal,Educational courses through the medium of spor...,,2008.0,


In [277]:
# split each of the strings into a list
df['TAGS'] = df['TAGS'].str.split(pat=';')

# collect all unique tags from those lists
tags = set(df['TAGS'].explode().values)

df.loc[df['TAGS'].isnull(), 'TAGS'] = df.loc[df['TAGS'].isnull(), 'TAGS'].apply(lambda x: [])

df.head()

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,TYPE
0,63336,Ai-enabled q&a service that answers to various...,[mobile],2002.0,
1,@Futsal,Educational courses through the medium of spor...,[],2008.0,
2,#5 Magazine,Multi-platform digital lifestyle magazines abo...,"[publishing, branding, media, platform, entert...",2007.0,
3,03Numbers,Planet Numbers are the leading provider of 03 ...,[],2008.0,
4,077football News & Media,"077Football - the deep-rooted, hyperlocal foot...","[sport, advertising, football, network, game d...",2009.0,


In [278]:
#Stopwords list from https://github.com/Yoast/YoastSEO.js/blob/develop/src/config/stopwords.js
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]


df['TAGLINE'] = df['TAGLINE'].astype(str)

# Delete all kind of punctuation
df['TAGLINE'] = df['TAGLINE'].str.replace(r'[^\w\s]','')
    
# Change all words to lower caps
df['TAGLINE'] = df['TAGLINE'].str.lower()

df['TAGLINE']= df['TAGLINE'].apply(lambda x: x.strip() if(np.all(pd.notnull(x))) else x)

df['TAGLINE']= df['TAGLINE'].apply(lambda x: [item for item in x.split() if item not in stopwords] if(np.all(pd.notnull(x))) else x)

#df['TAGLINE']= df['TAGLINE'].apply(lambda x: x.strip())

#Remove stopwords
#df['TAGLINE']= df['TAGLINE'].apply(lambda x: [item for item in x.split() if item not in stopwords])
df.head()

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,TYPE
0,63336,"[aienabled, qa, service, answers, various, que...",[mobile],2002.0,
1,@Futsal,"[educational, courses, medium, sports, childre...",[],2008.0,
2,#5 Magazine,"[multiplatform, digital, lifestyle, magazines,...","[publishing, branding, media, platform, entert...",2007.0,
3,03Numbers,"[planet, numbers, leading, provider, 03, numbe...",[],2008.0,
4,077football News & Media,"[077football, deeprooted, hyperlocal, football...","[sport, advertising, football, network, game d...",2009.0,


In [279]:
df.loc[df['TAGLINE'].isnull(), 'TAGLINE'] = df.loc[df['TAGLINE'].isnull(), 'TAGLINE'].apply(lambda x: [])
df.head()

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,TYPE
0,63336,"[aienabled, qa, service, answers, various, que...",[mobile],2002.0,
1,@Futsal,"[educational, courses, medium, sports, childre...",[],2008.0,
2,#5 Magazine,"[multiplatform, digital, lifestyle, magazines,...","[publishing, branding, media, platform, entert...",2007.0,
3,03Numbers,"[planet, numbers, leading, provider, 03, numbe...",[],2008.0,
4,077football News & Media,"[077football, deeprooted, hyperlocal, football...","[sport, advertising, football, network, game d...",2009.0,


In [280]:
df.isna().sum()

NAME               0
TAGLINE            0
TAGS               0
LAUNCH DATE        3
TYPE           11582
dtype: int64

In [281]:
df['ALL']=df['TAGS']+df['TAGLINE']
df.head()

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,TYPE,ALL
0,63336,"[aienabled, qa, service, answers, various, que...",[mobile],2002.0,,"[mobile, aienabled, qa, service, answers, vari..."
1,@Futsal,"[educational, courses, medium, sports, childre...",[],2008.0,,"[educational, courses, medium, sports, childre..."
2,#5 Magazine,"[multiplatform, digital, lifestyle, magazines,...","[publishing, branding, media, platform, entert...",2007.0,,"[publishing, branding, media, platform, entert..."
3,03Numbers,"[planet, numbers, leading, provider, 03, numbe...",[],2008.0,,"[planet, numbers, leading, provider, 03, numbe..."
4,077football News & Media,"[077football, deeprooted, hyperlocal, football...","[sport, advertising, football, network, game d...",2009.0,,"[sport, advertising, football, network, game d..."


# 3. Classification

In [345]:
from collections import Counter
tags = list(df['TAGS'].explode().values)
tags= Counter(tags)
tags.most_common()

[(nan, 1989),
 ('software', 1861),
 ('media', 1087),
 ('mobile', 1059),
 ('design', 1009),
 ('data', 995),
 ('subscription', 877),
 ('platform', 854),
 ('finance', 776),
 ('deep tech', 740),
 ('social', 706),
 ('advertising', 616),
 ('search engine', 597),
 ('cloud technology', 529),
 ('saas', 491),
 ('video', 477),
 ('adtech', 468),
 ('app', 462),
 ('retail', 460),
 ('branding', 445),
 ('delivery', 437),
 ('cleantech', 437),
 ('community', 424),
 ('e-commerce', 394),
 ('local', 390),
 ('fintech', 380),
 ('commission', 365),
 ('navigation', 364),
 ('regtech compliance', 360),
 ('consulting services', 349),
 ('hardware', 340),
 ('online', 340),
 ('risk management', 336),
 ('risk', 332),
 ('service', 331),
 ('investing', 309),
 ('education', 304),
 ('payment', 302),
 ('communication', 301),
 ('content', 297),
 ('compliance', 294),
 ('ecommerce / trading', 287),
 ('monitoring', 284),
 ('banking', 280),
 ('management', 274),
 ('biotechnology', 266),
 ('automated technology', 261),
 ('manuf

In [346]:
tech=['software', 'mobile', 'design', 'data', 'deep tech', 'search engine', 'cloud technology', 'saas', 'video',
     'adtech', 'app', 'cleantech', 'e-commerce', 'fintech', 'regtech compliance', 'consulting services', 'hardware',
     'online', 'monitoring', 'social media', 'analytics', 'game', 'technology', 'enterprise software', 'big data',
     'tech', 'it', 'wireless technology', 'developer tools', 'seo', 'data analytics', 'imaging technology']
education=['21st century skills', 'research','educational','student', 'university', 'school']
government=['charity','medical / healthcare']
mature=['multinational']

In [347]:
def classify(x, tech, mature, education, government):
    if (x['LAUNCH DATE']>=1990) and set(x['ALL'])&set(tech):
        return 'Startup'
    elif set(x['ALL'])&set(education):
        return 'Universities/Schools'
    elif set(x['ALL'])&set(government):
        return 'Government/Non-profit'
    elif (x['LAUNCH DATE']<1990) and set(x['ALL'])&set(mature):
        return 'Mature companies'
    else:
        return 'Unclassified'

In [348]:
df['TYPE']=df.apply(classify, args=(tech,mature, education, government), axis=1)

In [349]:
df.head()

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,TYPE,ALL
0,63336,"[aienabled, qa, service, answers, various, que...",[mobile],2002.0,Startup,"[mobile, aienabled, qa, service, answers, vari..."
1,@Futsal,"[educational, courses, medium, sports, childre...",[],2008.0,Universities/Schools,"[educational, courses, medium, sports, childre..."
2,#5 Magazine,"[multiplatform, digital, lifestyle, magazines,...","[publishing, branding, media, platform, entert...",2007.0,Unclassified,"[publishing, branding, media, platform, entert..."
3,03Numbers,"[planet, numbers, leading, provider, 03, numbe...",[],2008.0,Unclassified,"[planet, numbers, leading, provider, 03, numbe..."
4,077football News & Media,"[077football, deeprooted, hyperlocal, football...","[sport, advertising, football, network, game d...",2009.0,Unclassified,"[sport, advertising, football, network, game d..."


In [353]:
df['TYPE'].value_counts()

Startup                  6929
Unclassified             4393
Universities/Schools      186
Government/Non-profit      74
Name: TYPE, dtype: int64