# **Data science internship assignement - Dealroom.co**
*Done by Océane Salmeron, December 2020*

## 1. Import libraries

In [401]:
import pandas as pd
import numpy as np
from collections import Counter

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

## 2. Import Data

In [402]:
file = 'Data/Data_Science_Internship_Assignment.xlsx'
data = pd.ExcelFile(file)
print(data.sheet_names)
df = data.parse('Data')

['Instructions', 'Data', 'Count', 'Scraping results']


Let's take a look at our raw data, and get more information about it with .shape and .info()

In [403]:
df.head(2)

Unnamed: 0,NAME,WEBSITE,TAGLINE,HQ REGION,HQ COUNTRY,HQ CITY,TAGS,LAUNCH DATE,GROWTH STAGE,LINKEDIN,TYPE
0,63336,http://63336.com,Ai-enabled q&a service that answers to various...,Europe,United Kingdom,London,mobile,"2002, September",late growth stage,,
1,@Futsal,http://futsaluk.net,Educational courses through the medium of spor...,Europe,United Kingdom,Birmingham,,2008,early growth stage,https://www.linkedin.com/company/-futsal-group...,


In [404]:
df.shape

(11582, 11)

In [405]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11582 entries, 0 to 11581
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   NAME          11582 non-null  object 
 1   WEBSITE       11582 non-null  object 
 2   TAGLINE       11453 non-null  object 
 3   HQ REGION     11582 non-null  object 
 4   HQ COUNTRY    11582 non-null  object 
 5   HQ CITY       10908 non-null  object 
 6   TAGS          9593 non-null   object 
 7   LAUNCH DATE   11582 non-null  object 
 8   GROWTH STAGE  8584 non-null   object 
 9   LINKEDIN      8596 non-null   object 
 10  TYPE          0 non-null      float64
dtypes: float64(1), object(10)
memory usage: 995.5+ KB


Our data has 11582 entries and 11 columns.

In [406]:
df.isna().sum()

NAME                0
WEBSITE             0
TAGLINE           129
HQ REGION           0
HQ COUNTRY          0
HQ CITY           674
TAGS             1989
LAUNCH DATE         0
GROWTH STAGE     2998
LINKEDIN         2986
TYPE            11582
dtype: int64

## 2. Cleaning the Data

First let's drop the column that are not relevant to our task.

In [407]:
df = df.drop(['WEBSITE', 'HQ REGION', 'HQ COUNTRY', 'HQ CITY', 'GROWTH STAGE', 'LINKEDIN'], axis=1)

In [408]:
df['LAUNCH DATE'] = pd.to_datetime(df['LAUNCH DATE'].astype(str), errors="coerce").dt.year

In [409]:
df.head(3)

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,TYPE
0,63336,Ai-enabled q&a service that answers to various...,mobile,2002.0,
1,@Futsal,Educational courses through the medium of spor...,,2008.0,
2,#5 Magazine,Multi-platform digital lifestyle magazines abo...,publishing;branding;media;platform;entertainment,2007.0,


In [410]:
def clean_text(x):
    x = x.astype(str)
    x = x.str.replace(r'[^\w\s]','')
    x = x.str.replace('\d+', '')
    x = x.str.lower() 
    return x

In [411]:
df['TAGS'] = df['TAGS'].str.split(pat=';')
df.loc[df['TAGS'].isnull(), 'TAGS'] = df.loc[df['TAGS'].isnull(), 'TAGS'].apply(lambda x: [])

df.head()

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,TYPE
0,63336,Ai-enabled q&a service that answers to various...,[mobile],2002.0,
1,@Futsal,Educational courses through the medium of spor...,[],2008.0,
2,#5 Magazine,Multi-platform digital lifestyle magazines abo...,"[publishing, branding, media, platform, entert...",2007.0,
3,03Numbers,Planet Numbers are the leading provider of 03 ...,[],2008.0,
4,077football News & Media,"077Football - the deep-rooted, hyperlocal foot...","[sport, advertising, football, network, game d...",2009.0,


In [412]:
#Stopwords list from https://github.com/Yoast/YoastSEO.js/blob/develop/src/config/stopwords.js
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [413]:
df['TAGLINE']= clean_text(df['TAGLINE'])
df['TAGLINE'] = df['TAGLINE'].str.strip()
df['TAGLINE']= df['TAGLINE'].apply(lambda x: [item for item in x.split() if item not in stopwords] if(np.all(pd.notnull(x))) else x)
df.loc[df['TAGLINE'].isnull(), 'TAGLINE'] = df.loc[df['TAGLINE'].isnull(), 'TAGLINE'].apply(lambda x: [])

df.head()

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,TYPE
0,63336,"[aienabled, qa, service, answers, various, que...",[mobile],2002.0,
1,@Futsal,"[educational, courses, medium, sports, childre...",[],2008.0,
2,#5 Magazine,"[multiplatform, digital, lifestyle, magazines,...","[publishing, branding, media, platform, entert...",2007.0,
3,03Numbers,"[planet, numbers, leading, provider, numbers, ...",[],2008.0,
4,077football News & Media,"[football, deeprooted, hyperlocal, football, n...","[sport, advertising, football, network, game d...",2009.0,


In [414]:
df.isna().sum()

NAME               0
TAGLINE            0
TAGS               0
LAUNCH DATE        3
TYPE           11582
dtype: int64

In [415]:
df['ALL']=df['TAGS']+df['TAGLINE']
df.head()

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,TYPE,ALL
0,63336,"[aienabled, qa, service, answers, various, que...",[mobile],2002.0,,"[mobile, aienabled, qa, service, answers, vari..."
1,@Futsal,"[educational, courses, medium, sports, childre...",[],2008.0,,"[educational, courses, medium, sports, childre..."
2,#5 Magazine,"[multiplatform, digital, lifestyle, magazines,...","[publishing, branding, media, platform, entert...",2007.0,,"[publishing, branding, media, platform, entert..."
3,03Numbers,"[planet, numbers, leading, provider, numbers, ...",[],2008.0,,"[planet, numbers, leading, provider, numbers, ..."
4,077football News & Media,"[football, deeprooted, hyperlocal, football, n...","[sport, advertising, football, network, game d...",2009.0,,"[sport, advertising, football, network, game d..."


# 3. Classification

In [416]:
tags = list(df['TAGLINE'].explode().values)
tags= Counter(tags)
tags.most_common()

[('services', 1090),
 ('company', 1047),
 ('solutions', 1040),
 ('software', 851),
 ('online', 810),
 ('uk', 683),
 ('business', 661),
 ('management', 660),
 ('technology', 603),
 ('digital', 586),
 ('provider', 548),
 ('marketing', 524),
 ('leading', 496),
 ('development', 451),
 ('design', 440),
 ('mobile', 431),
 ('service', 416),
 ('products', 399),
 ('based', 387),
 ('data', 365),
 ('platform', 356),
 ('agency', 355),
 ('media', 354),
 ('web', 352),
 ('home', 342),
 ('global', 334),
 ('systems', 329),
 ('businesses', 289),
 ('social', 289),
 ('london', 277),
 ('industry', 269),
 ('website', 252),
 ('providing', 250),
 ('group', 240),
 ('people', 235),
 ('support', 231),
 ('specialist', 229),
 ('companies', 216),
 ('content', 216),
 ('market', 213),
 ('independent', 211),
 ('ltd', 211),
 ('one', 203),
 ('cloud', 194),
 ('range', 192),
 ('network', 191),
 ('new', 191),
 ('information', 190),
 ('world', 188),
 ('security', 188),
 ('innovative', 187),
 ('provide', 184),
 ('quality', 1

In [417]:
tech=['software', 'mobile', 'design', 'data', 'deep tech', 'search engine', 'cloud technology', 'saas', 'video',
     'adtech', 'app', 'cleantech', 'e-commerce', 'fintech', 'regtech compliance', 'consulting services', 'hardware',
     'online', 'monitoring', 'social media', 'analytics', 'game', 'technology', 'enterprise software', 'big data',
     'tech', 'it', 'wireless technology', 'developer tools', 'seo', 'data analytics', 'imaging technology']
education=['21st century skills', 'research','educational','student', 'university', 'school', 'certification', 
           'e-learnin', 'study', 'studies', 'tutorials', 'academic','assesment','academic','academics', 'learning',
          'skills', 'teach', 'teacher']
government=['charity','medical / healthcare']

In [418]:
def classify(x, tech, education, government):
    
    dic = {'Startup' : len(set(x['ALL'])&set(tech)),
            'Universities/Schools' : len(set(x['ALL'])&set(education)),
            'Government/Non-profit' : len(set(x['ALL'])&set(government))
           }
    
    maximum = max(dic, key=dic.get)
    entity = maximum
    
    if(dic[maximum] == 0):
        if (x['LAUNCH DATE']<1990):
            entity = 'Mature'
        else :
            entity = 'Unclassified'
    
    elif (maximum == 'Startup'):
        if (x['LAUNCH DATE']<1990):
            entity = 'Mature'
            
    return entity
    

In [419]:
df['TYPE']=df.apply(classify, args=(tech, education, government), axis=1)

In [420]:
df.head()

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,TYPE,ALL
0,63336,"[aienabled, qa, service, answers, various, que...",[mobile],2002.0,Startup,"[mobile, aienabled, qa, service, answers, vari..."
1,@Futsal,"[educational, courses, medium, sports, childre...",[],2008.0,Universities/Schools,"[educational, courses, medium, sports, childre..."
2,#5 Magazine,"[multiplatform, digital, lifestyle, magazines,...","[publishing, branding, media, platform, entert...",2007.0,Unclassified,"[publishing, branding, media, platform, entert..."
3,03Numbers,"[planet, numbers, leading, provider, numbers, ...",[],2008.0,Unclassified,"[planet, numbers, leading, provider, numbers, ..."
4,077football News & Media,"[football, deeprooted, hyperlocal, football, n...","[sport, advertising, football, network, game d...",2009.0,Unclassified,"[sport, advertising, football, network, game d..."


In [421]:
df['TYPE'].value_counts()

Startup                  6913
Unclassified             4138
Universities/Schools      286
Mature                    173
Government/Non-profit      72
Name: TYPE, dtype: int64