# Data gathering

The goal of this notebook is to gather, clean and manipulate all the data that we will be using during the project.

In [1]:
import pandas as pd
import numpy as np

# Scrapping packages
from bs4 import BeautifulSoup as bs

# Google Trends API connection
# documentation -> https://pypi.org/project/pytrends/#api-methods
from pytrends.request import TrendReq
pytrends = TrendReq()

# 1. Data from canva.com

In [2]:
# Read text file with Canva's HTML "https://www.canva.com/create-a-design"
f = open('../project-4/canva.txt', 'r')
content = f.read()

soup = bs(content, 'html.parser')

In [3]:
# List with different categories
cats_id = {'Video': 'genre-video',
           'Social Media': 'genre-social-media',
           'Documents': 'genre-documents', 
           'Personal': 'genre-personal',
           'Education': 'genre-education',
           'Marketing': 'genre-marketing',
           'Events': 'genre-events',
           'Ads': 'genre-ads'}

# Scrape from canva.com the subcategories
for key in cats_id:
    cats_id[key] = soup.find('div', {'id':f'{cats_id[key]}'}).find_all('div', {'class':'t24rzA'})
    cats_id[key] = [subcategory.text for subcategory in cats_id[key]]
    
# Create the DataFrame where we will have Category + Sub-category
cats = pd.DataFrame(columns=['Category'])

# Populate the DataFrame
for key in cats_id:
    cats_temp = pd.DataFrame(cats_id[key], columns=['Sub-category'])
    cats_temp['Category'] = key
    cats = pd.concat([cats, cats_temp])

cats = cats.reset_index(drop=True)
cats.head()

Unnamed: 0,Category,Sub-category
0,Video,Facebook Video
1,Video,Social Feed Video Ad
2,Video,Pinterest Video Pin
3,Video,In-Stream Video Ad
4,Video,Youtube Intro


In [4]:
# Drop 'Facebook Video' from Social Media category to have unique Sub-categories
cats.drop(8, axis=0, inplace=True)
cats = cats.reset_index(drop=True)

In [5]:
cats.head()

Unnamed: 0,Category,Sub-category
0,Video,Facebook Video
1,Video,Social Feed Video Ad
2,Video,Pinterest Video Pin
3,Video,In-Stream Video Ad
4,Video,Youtube Intro


There are 8 different categories that contain a total of 71 sub-categories 

# 2. Google trends

In [6]:
# API call to Google Trends
def get_trend(keyword:str):
    
        # Build the payload
        kw_list = [keyword]
        pytrends.build_payload(kw_list, cat=0, timeframe=f'today 5-y', geo='', gprop='') # 5 years | globally
        df = pytrends.interest_over_time().reset_index()
        df = df.rename(columns={'date': 'Date', keyword: 'Trend'})
        return df

In [7]:
# Dictionary with keyword that is being evaluated per each subcategory 'sub'
sub_dic = {}

for sub in cats['Sub-category']:
    sub_dic.update({sub: {'Keyword': sub + " template"}})

# Create a list with the keys
clean_list = sub_dic.keys()

for category in sub_dic:
    sub_dic[category].update({category: get_trend(sub_dic[category]['Keyword'])})

In [8]:
# We will need to use those keys
print(clean_list)

dict_keys(['Facebook Video', 'Social Feed Video Ad', 'Pinterest Video Pin', 'In-Stream Video Ad', 'Youtube Intro', 'Video Collage', 'Slideshow Video', 'Animated Social Media', 'Instagram Post', 'Instagram Story', 'Facebook Post', 'Facebook Story', 'Facebook Cover', 'Facebook App Ad', 'Social Media', 'Your Story', 'Whatsapp Story', 'Presentation', 'A4 Document', 'Letter', 'Letterhead', 'Resume', 'Presentation (4:3)', 'Report', 'Daily Report', 'Memo', 'Invoice', 'Card', 'Birthday Card', 'Photo Collage', 'Photo Book', 'Calendar', 'Planner', 'Storyboard', 'Graphic Organiser', 'Scrapbook', 'Desktop Wallpaper', 'Postcard', 'Yearbook', 'Class Schedule', 'Worksheet', 'Lesson Plan', 'Report Card', 'Bookmark', 'Seating Chart', 'Table of Contents', 'Mind Map', 'Logo', 'Poster', 'Flyer', 'Business Card', 'Infographic', 'Brochure', 'Product Label', 'Gift Certificate', 'Gift Tag', 'Ticket', 'Newsletter', 'Invitation', 'Birthday Invitation', 'Wedding Invitation (Portrait)', 'Announcement', 'Programme

In [9]:
# Check the output
sub_dic['Facebook Video']['Facebook Video'].head()

Unnamed: 0,Date,Trend,isPartial
0,2015-07-12,92,False
1,2015-07-19,27,False
2,2015-07-26,26,False
3,2015-08-02,65,False
4,2015-08-09,52,False


# 3. Key word planner (Search Vol.) + Canva.com (Number of templates)

Importing the weekly search range

In [10]:
# Prepare DataFrame for search vol addition
cats_clean = pd.DataFrame(columns=['Category', 'Sub-category', 'Keyword'])

for sub in clean_list:
    cats_temp = cats[cats["Sub-category"] == sub].assign(Keyword=sub_dic[sub]["Keyword"])
    cats_clean = pd.concat([cats_clean, cats_temp])
    
cats_clean = cats_clean.reset_index(drop=True)

In [11]:
cats_clean

Unnamed: 0,Category,Sub-category,Keyword
0,Video,Facebook Video,Facebook Video template
1,Video,Social Feed Video Ad,Social Feed Video Ad template
2,Video,Pinterest Video Pin,Pinterest Video Pin template
3,Video,In-Stream Video Ad,In-Stream Video Ad template
4,Video,Youtube Intro,Youtube Intro template
...,...,...,...
66,Ads,Instagram Ad,Instagram Ad template
67,Ads,Large Rectangle Ad,Large Rectangle Ad template
68,Ads,Leaderboard Ad,Leaderboard Ad template
69,Ads,Wide Skyscraper Ad,Wide Skyscraper Ad template


In [12]:
# Export to csv to add Search Vol
cats_clean.to_csv('../project-4/data/temp-data/categories-pre.csv')

Added ['Search Vol (min)', 'Search Vol (max)', 'Number of templates'] in excel due to the short timeframe for this project

**['Search Vol']** source: average monthly search volume from 'https://ads.google.com/aw/keywordplanner/ideas/new?ocid=288160316&euid=220210981&__u=7630327869&uscid=288160316&__c=9136488284&authuser=0&__e=2685651001&sf=barebones&subid=de-de-et-g-aw-a-tools-kwp_bb-awhp_xin1%21o2'

**['Number of templates']** source: canva.com

In [13]:
# Open excel file
categories = pd.read_excel('../project-4/data/clean-data/categories-post.xlsx')
categories.drop('Unnamed: 0', axis=1, inplace=True)

# Explore data
categories.head(10)

Unnamed: 0,Category,Sub-category,Keyword,Search Vol (min),Search Vol (max),Number of templates
0,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145
1,Video,Social Feed Video Ad,Social Feed Video Ad template,2.307692,23.076923,159
2,Video,Pinterest Video Pin,Pinterest Video Pin template,2.307692,23.076923,21
3,Video,In-Stream Video Ad,In-Stream Video Ad template,2.307692,23.076923,149
4,Video,Youtube Intro,Youtube Intro template,230.769231,2307.692308,126
5,Video,Video Collage,Video Collage template,2.307692,23.076923,340
6,Video,Slideshow Video,Slideshow Video template,23.076923,230.769231,100
7,Social Media,Animated Social Media,Animated Social Media template,2.307692,23.076923,207
8,Social Media,Instagram Post,Instagram Post template,2307.692308,23076.923077,3100
9,Social Media,Instagram Story,Instagram Story template,2307.692308,23076.923077,1746


## 4. Check Data types and merge files

Our output is composed of: 

    (i) 'categories' containing information of each sub-category
    (ii) 'sub-dic' containing as many dataframes as sub-categories with search trend + info in categories grouped by week

In [14]:
categories.dtypes

Category                object
Sub-category            object
Keyword                 object
Search Vol (min)       float64
Search Vol (max)       float64
Number of templates      int64
dtype: object

In [15]:
sub_dic['Facebook Video']['Facebook Video'].head()

Unnamed: 0,Date,Trend,isPartial
0,2015-07-12,92,False
1,2015-07-19,27,False
2,2015-07-26,26,False
3,2015-08-02,65,False
4,2015-08-09,52,False


In [16]:
for cat, sub in zip(categories['Category'], categories['Sub-category']):
    sub_dic[sub][sub] = (sub_dic[sub][sub].assign(Category=cat)
                                          .assign(Sub=sub))

In [17]:
sub_dic['Pinterest Video Pin']['Pinterest Video Pin'].head()

Unnamed: 0,index,Category,Sub


In [18]:
for cat, sub in zip(categories['Category'], categories['Sub-category']):
    sub_dic[sub][sub] = sub_dic[sub][sub].rename(columns={'Sub': 'Sub-category'})
    sub_dic[sub][sub] = sub_dic[sub][sub].merge(categories, how="left", on=['Category', 'Sub-category'])

In [19]:
sub_dic['Facebook Video']['Facebook Video'].head()

Unnamed: 0,Date,Trend,isPartial,Category,Sub-category,Keyword,Search Vol (min),Search Vol (max),Number of templates
0,2015-07-12,92,False,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145
1,2015-07-19,27,False,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145
2,2015-07-26,26,False,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145
3,2015-08-02,65,False,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145
4,2015-08-09,52,False,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145


In [20]:
# Add column with Search Vol mean & a column with Search Vol
for cat, sub in zip(categories['Category'], categories['Sub-category']):
    try:
        sub_dic[sub][sub]['Search Vol (avge)'] = (sub_dic[sub][sub]['Search Vol (max)'] - sub_dic[sub][sub]['Search Vol (min)']) / 2 + sub_dic[sub][sub]['Search Vol (min)']
    
        # Change 'Trend' to float
        sub_dic[sub][sub]['Search Vol (avge)'] = sub_dic[sub][sub]['Search Vol (avge)'].astype(int) 
    
        sub_dic[sub][sub]['Search Vol'] = (sub_dic[sub][sub]['Trend'] * sub_dic[sub][sub]['Search Vol (avge)']) / 100
    
    except:
        pass

In [21]:
sub_dic['Instagram Post']['Instagram Post'].head()

Unnamed: 0,Date,Trend,isPartial,Category,Sub-category,Keyword,Search Vol (min),Search Vol (max),Number of templates,Search Vol (avge),Search Vol
0,2015-07-12,10,False,Social Media,Instagram Post,Instagram Post template,2307.692308,23076.923077,3100,12692,1269.2
1,2015-07-19,9,False,Social Media,Instagram Post,Instagram Post template,2307.692308,23076.923077,3100,12692,1142.28
2,2015-07-26,0,False,Social Media,Instagram Post,Instagram Post template,2307.692308,23076.923077,3100,12692,0.0
3,2015-08-02,7,False,Social Media,Instagram Post,Instagram Post template,2307.692308,23076.923077,3100,12692,888.44
4,2015-08-09,0,False,Social Media,Instagram Post,Instagram Post template,2307.692308,23076.923077,3100,12692,0.0


## Export clean data as .csv

In [22]:
# Export as .csv
categories.to_csv('../project-4/data/clean-data/categories-post.csv')

In [23]:
# Export files with trend info as .csv
csv_names = []

# Replace ' ' for '-'
for sub in clean_list:
    csv_names.append(sub.replace(' ', '-').lower())

# Export as .csv
for sub, name in zip(sub_dic, csv_names):
    sub_dic[sub][sub].to_csv(f'../project-4/data/clean-data/{name}.csv')

## Merge trend dataframes into one

In [24]:
sub_dic['Instagram Ad']['Instagram Ad']

Unnamed: 0,Date,Trend,isPartial,Category,Sub-category,Keyword,Search Vol (min),Search Vol (max),Number of templates,Search Vol (avge),Search Vol
0,2015-07-12,0,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,0.00
1,2015-07-19,0,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,0.00
2,2015-07-26,0,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,0.00
3,2015-08-02,0,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,0.00
4,2015-08-09,0,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,0.00
...,...,...,...,...,...,...,...,...,...,...,...
256,2020-06-07,44,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,558.36
257,2020-06-14,22,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,279.18
258,2020-06-21,46,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,583.74
259,2020-06-28,30,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,380.70


In [25]:
trends_all = pd.DataFrame(columns=['Date', 'Trend', 'Category', 'Sub-category', 'Keyword', 'Search Vol (min)', 'Search Vol (max)', 'Number of templates', 'Search Vol (avge)', 'Search Vol'])

for sub in clean_list:
    trends_all = pd.concat([trends_all, sub_dic[sub][sub]])

In [27]:
trends_all.reset_index(drop=True)

Unnamed: 0,Date,Trend,Category,Sub-category,Keyword,Search Vol (min),Search Vol (max),Number of templates,Search Vol (avge),Search Vol,isPartial,index
0,2015-07-12,92,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145,126,115.92,False,
1,2015-07-19,27,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145,126,34.02,False,
2,2015-07-26,26,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145,126,32.76,False,
3,2015-08-02,65,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145,126,81.90,False,
4,2015-08-09,52,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145,126,65.52,False,
...,...,...,...,...,...,...,...,...,...,...,...,...
15394,2020-06-07,44,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,558.36,False,
15395,2020-06-14,22,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,279.18,False,
15396,2020-06-21,46,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,583.74,False,
15397,2020-06-28,30,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,380.70,False,


In [28]:
# Export as .csv
trends_all.to_csv('../project-4/data/clean-data/trends-all.csv')