# **Data science internship assignment - Dealroom.co**
*Done by Océane Salmeron, December 2020*

## 1. Import libraries

In [1]:
import pandas as pd
import numpy as np

# Compact array display: three decimals, and no scientific notation for small floats.
np.set_printoptions(suppress=True, precision=3)

## 2. Import Data

In [2]:
# Open the assignment workbook once; `data` is kept open so other sheets can be read from it.
file = 'Data/Data_Science_Internship_Assignment.xlsx'
data = pd.ExcelFile(file)

# Load the 'Data' sheet into the working frame, then list every sheet in the workbook.
df = data.parse(sheet_name='Data')
print(data.sheet_names)

['Instructions', 'Data', 'Count', 'Scraping results']


Let's take a look at our raw data, and get more information about it with .shape and .info()

In [3]:
# Preview the first two rows of the raw frame.
df.head(n=2)

Unnamed: 0,NAME,WEBSITE,TAGLINE,HQ REGION,HQ COUNTRY,HQ CITY,TAGS,LAUNCH DATE,GROWTH STAGE,LINKEDIN,TYPE
0,63336,http://63336.com,Ai-enabled q&a service that answers to various...,Europe,United Kingdom,London,mobile,"2002, September",late growth stage,,
1,@Futsal,http://futsaluk.net,Educational courses through the medium of spor...,Europe,United Kingdom,Birmingham,,2008,early growth stage,https://www.linkedin.com/company/-futsal-group...,


In [4]:
# (number of rows, number of columns) of the raw frame.
df.shape

(11582, 11)

In [5]:
# Per-column dtype and non-null count, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11582 entries, 0 to 11581
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   NAME          11582 non-null  object 
 1   WEBSITE       11582 non-null  object 
 2   TAGLINE       11453 non-null  object 
 3   HQ REGION     11582 non-null  object 
 4   HQ COUNTRY    11582 non-null  object 
 5   HQ CITY       10908 non-null  object 
 6   TAGS          9593 non-null   object 
 7   LAUNCH DATE   11582 non-null  object 
 8   GROWTH STAGE  8584 non-null   object 
 9   LINKEDIN      8596 non-null   object 
 10  TYPE          0 non-null      float64
dtypes: float64(1), object(10)
memory usage: 995.5+ KB


Our data has 11582 entries and 11 columns.

In [6]:
# Number of missing values in each column.
df.isna().sum(axis=0)

NAME                0
WEBSITE             0
TAGLINE           129
HQ REGION           0
HQ COUNTRY          0
HQ CITY           674
TAGS             1989
LAUNCH DATE         0
GROWTH STAGE     2998
LINKEDIN         2986
TYPE            11582
dtype: int64

## 2. Cleaning the Data

First, let's drop the columns that are not relevant to our task.

In [7]:
# Drop the website, location and LinkedIn columns (not needed for this task).
irrelevant_columns = ['WEBSITE', 'HQ REGION', 'HQ COUNTRY', 'HQ CITY', 'LINKEDIN']
df = df.drop(columns=irrelevant_columns)

In [11]:
# Reduce LAUNCH DATE to the launch year. Raw values mix formats, e.g. "2002, September" and 2008.
# The first 4-digit group is extracted directly instead of going through pd.to_datetime:
# pandas >= 2.0 infers a single format from the first element and, with errors="coerce",
# turns every value that does not match that format into NaT, silently losing years.
# Values without a 4-digit group become NaN, so the column stays float as before.
launch_year = df['LAUNCH DATE'].astype(str).str.extract(r'(\d{4})', expand=False)
df['LAUNCH DATE'] = pd.to_numeric(launch_year, errors='coerce').astype(float)

In [12]:
# Check the first two rows after the LAUNCH DATE conversion.
df.head(n=2)

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,GROWTH STAGE,TYPE
0,63336,Ai-enabled q&a service that answers to various...,mobile,2002.0,late growth stage,
1,@Futsal,Educational courses through the medium of spor...,,2008.0,early growth stage,


In [13]:
# Replace missing values in the text columns with an empty string,
# so the string operations applied afterwards never encounter NaN.
text_columns = ['TAGS', 'GROWTH STAGE', 'TAGLINE']
df[text_columns] = df[text_columns].fillna('')

In [14]:
# Split the ';'-separated tag string of every row into a list of tags.
df['TAGS'] = df['TAGS'].str.split(pat=';')

# Collect the unique tags across all rows.
tags = set(df['TAGS'].explode())
# Rows whose TAGS was the empty string split into [''], which would put '' into the set;
# discard it so the empty string is not treated as a tag.
tags.discard('')

df.head()

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,GROWTH STAGE,TYPE
0,63336,Ai-enabled q&a service that answers to various...,[mobile],2002.0,late growth stage,
1,@Futsal,Educational courses through the medium of spor...,[],2008.0,early growth stage,
2,#5 Magazine,Multi-platform digital lifestyle magazines abo...,"[publishing, branding, media, platform, entert...",2007.0,,
3,03Numbers,Planet Numbers are the leading provider of 03 ...,[],2008.0,early growth stage,
4,077football News & Media,"077Football - the deep-rooted, hyperlocal foot...","[sport, advertising, football, network, game d...",2009.0,late growth stage,


In [15]:
# English stopwords, taken from the Yoast SEO list:
# https://github.com/Yoast/YoastSEO.js/blob/develop/src/config/stopwords.js
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "could",
    "did", "do", "does", "doing", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself",
    "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
    "let's",
    "me", "more", "most", "my", "myself",
    "nor",
    "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own",
    "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
    "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where",
    "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
]

# Turn every tagline into a list of lower-case tokens with punctuation and stopwords removed.
# NOTE(review): punctuation is stripped before the stopword filter, so stopwords that contain
# an apostrophe (e.g. "he'd", "we'll") can never match a token; confirm this is acceptable.
stopword_set = set(stopwords)  # set gives constant-time membership tests

df['TAGLINE'] = (
    df['TAGLINE']
    .astype('string')
    # Delete every character that is neither a word character nor whitespace.
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string
    # by default, which would leave all punctuation in place.
    .str.replace(r'[^\w\s]', '', regex=True)
    # Lower-case and trim surrounding whitespace.
    .str.lower()
    .str.strip()
    # Tokenise on whitespace and remove stopwords.
    .apply(lambda text: [word for word in text.split() if word not in stopword_set])
)
df.head()

Unnamed: 0,NAME,TAGLINE,TAGS,LAUNCH DATE,GROWTH STAGE,TYPE
0,63336,"[aienabled, qa, service, answers, various, que...",[mobile],2002.0,late growth stage,
1,@Futsal,"[educational, courses, medium, sports, childre...",[],2008.0,early growth stage,
2,#5 Magazine,"[multiplatform, digital, lifestyle, magazines,...","[publishing, branding, media, platform, entert...",2007.0,,
3,03Numbers,"[planet, numbers, leading, provider, 03, numbe...",[],2008.0,early growth stage,
4,077football News & Media,"[077football, deeprooted, hyperlocal, football...","[sport, advertising, football, network, game d...",2009.0,late growth stage,
