# Data gathering

The goal of this notebook is to gather, clean and manipulate all the data that we will be using during the project.

In [1]:
import pandas as pd
import numpy as np

# Scrapping packages
from bs4 import BeautifulSoup as bs

# Google Trends API connection
# documentation -> https://pypi.org/project/pytrends/#api-methods
from pytrends.request import TrendReq
pytrends = TrendReq()

# 1. Data from canva.com

In [2]:
# Read text file with Canva's HTML "https://www.canva.com/create-a-design"
f = open('../project-4/canva.txt', 'r')
content = f.read()

soup = bs(content, 'html.parser')

In [3]:
# List with different categories
cats_id = {'Video': 'genre-video',
           'Social Media': 'genre-social-media',
           'Documents': 'genre-documents', 
           'Personal': 'genre-personal',
           'Education': 'genre-education',
           'Marketing': 'genre-marketing',
           'Events': 'genre-events',
           'Ads': 'genre-ads'}

# Scrape from canva.com the subcategories
for key in cats_id:
    cats_id[key] = soup.find('div', {'id':f'{cats_id[key]}'}).find_all('div', {'class':'t24rzA'})
    cats_id[key] = [subcategory.text for subcategory in cats_id[key]]
    
# Create the DataFrame where we will have Category + Sub-category
cats = pd.DataFrame(columns=['Category'])

# Populate the DataFrame
for key in cats_id:
    cats_temp = pd.DataFrame(cats_id[key], columns=['Sub-category'])
    cats_temp['Category'] = key
    cats = pd.concat([cats, cats_temp])

In [4]:
# Explore the results
cats

Unnamed: 0,Category,Sub-category
0,Video,Facebook Video
1,Video,Social Feed Video Ad
2,Video,Pinterest Video Pin
3,Video,In-Stream Video Ad
4,Video,Youtube Intro
...,...,...
1,Ads,Instagram Ad
2,Ads,Large Rectangle Ad
3,Ads,Leaderboard Ad
4,Ads,Wide Skyscraper Ad


There are 8 different categories that contain a total of 72 sub-categories 

# 2. Google trends

In [5]:
# API call to Google Trends
def get_trend(keyword:str):
    
        # Build the payload
        kw_list = [keyword]
        pytrends.build_payload(kw_list, cat=0, timeframe=f'today 5-y', geo='', gprop='') # 5 years | globally
        df = pytrends.interest_over_time().reset_index()
        df = df.rename(columns={'date': 'Date', keyword: 'Trend'})
        return df
    
# .drop('isPartial', axis=1, inplace=True)

In [6]:
# Dictionary with keyword that is being evaluated per each subcategory 'sub'
sub_dic = {}

for sub in cats['Sub-category']:
    sub_dic.update({sub: {'Keyword': sub + " template"}})

# Create a list with the keys
clean_list = sub_dic.keys()

for category in sub_dic:
    sub_dic[category].update({category: get_trend(sub_dic[category]['Keyword'])})
#   sub_dic[category] = get_trend(sub_dic[category])

In [7]:
# We will need to use those keys
print(clean_list)

dict_keys(['Facebook Video', 'Social Feed Video Ad', 'Pinterest Video Pin', 'In-Stream Video Ad', 'Youtube Intro', 'Video Collage', 'Slideshow Video', 'Animated Social Media', 'Instagram Post', 'Instagram Story', 'Facebook Post', 'Facebook Story', 'Facebook Cover', 'Facebook App Ad', 'Social Media', 'Your Story', 'Whatsapp Story', 'Presentation', 'A4 Document', 'Letter', 'Letterhead', 'Resume', 'Presentation (4:3)', 'Report', 'Daily Report', 'Memo', 'Invoice', 'Card', 'Birthday Card', 'Photo Collage', 'Photo Book', 'Calendar', 'Planner', 'Storyboard', 'Graphic Organiser', 'Scrapbook', 'Desktop Wallpaper', 'Postcard', 'Yearbook', 'Class Schedule', 'Worksheet', 'Lesson Plan', 'Report Card', 'Bookmark', 'Seating Chart', 'Table of Contents', 'Mind Map', 'Logo', 'Poster', 'Flyer', 'Business Card', 'Infographic', 'Brochure', 'Product Label', 'Gift Certificate', 'Gift Tag', 'Ticket', 'Newsletter', 'Invitation', 'Birthday Invitation', 'Wedding Invitation (Portrait)', 'Announcement', 'Programme

In [8]:
# Check the output
sub_dic['Facebook Video']['Facebook Video'].head()

Unnamed: 0,Date,Trend,isPartial
0,2015-07-12,43,False
1,2015-07-19,0,False
2,2015-07-26,21,False
3,2015-08-02,21,False
4,2015-08-09,42,False


# 3. Key word planner (Search Vol.) + Canva.com (Number of templates)

Importing the weekly search range

In [9]:
# Prepare DataFrame for search vol addition
cats_clean = pd.DataFrame(columns=['Category', 'Sub-category', 'Keyword'])

for sub in clean_list:
    cats_temp = cats[cats["Sub-category"] == sub].assign(Keyword=sub_dic[sub]["Keyword"])
    cats_clean = pd.concat([cats_clean, cats_temp])
    
cats_clean = cats_clean.reset_index(drop=True)

In [10]:
cats_clean

Unnamed: 0,Category,Sub-category,Keyword
0,Video,Facebook Video,Facebook Video template
1,Social Media,Facebook Video,Facebook Video template
2,Video,Social Feed Video Ad,Social Feed Video Ad template
3,Video,Pinterest Video Pin,Pinterest Video Pin template
4,Video,In-Stream Video Ad,In-Stream Video Ad template
...,...,...,...
67,Ads,Instagram Ad,Instagram Ad template
68,Ads,Large Rectangle Ad,Large Rectangle Ad template
69,Ads,Leaderboard Ad,Leaderboard Ad template
70,Ads,Wide Skyscraper Ad,Wide Skyscraper Ad template


In [11]:
# Export to csv to add Search Vol
cats_clean.to_csv('../project-4/data/temp-data/categories-pre.csv')

Added ['Search Vol (min)', 'Search Vol (max)', 'Number of templates'] in excel due to the short timeframe for this project

**['Search Vol']** source: average monthly search volume from 'https://ads.google.com/aw/keywordplanner/ideas/new?ocid=288160316&euid=220210981&__u=7630327869&uscid=288160316&__c=9136488284&authuser=0&__e=2685651001&sf=barebones&subid=de-de-et-g-aw-a-tools-kwp_bb-awhp_xin1%21o2'

**['Number of templates']** source: canva.com

In [12]:
# Open excel file
categories = pd.read_excel('../project-4/data/clean-data/categories-post.xlsx')
categories.drop('Unnamed: 0', axis=1, inplace=True)

# Explore data
categories.head()

Unnamed: 0,Category,Sub-category,Keyword,Search Vol (min),Search Vol (max),Number of templates
0,Video,Facebook Video,Facebook Video template,100,1000,145
1,Social Media,Facebook Video,Facebook Video template,10,100,145
2,Video,Social Feed Video Ad,Social Feed Video Ad template,10,100,159
3,Video,Pinterest Video Pin,Pinterest Video Pin template,10,100,21
4,Video,In-Stream Video Ad,In-Stream Video Ad template,10,100,149


## 4. Check Data types

Our output is composed of: 

    (i) 'categories' containing information of each sub-category
    (ii) 'sub-dic' containing as many dataframes as sub-categories with search trend grouped by week

In [13]:
categories.dtypes

Category               object
Sub-category           object
Keyword                object
Search Vol (min)        int64
Search Vol (max)        int64
Number of templates     int64
dtype: object

In [14]:
sub_dic['Facebook Video']['Facebook Video'].head()

Unnamed: 0,Date,Trend,isPartial
0,2015-07-12,43,False
1,2015-07-19,0,False
2,2015-07-26,21,False
3,2015-08-02,21,False
4,2015-08-09,42,False


## Export clean data as .csv

In [15]:
# Export as .csv
categories.to_csv('../project-4/data/clean-data/categories-post.csv')

In [16]:
# Export files with trend info as .csv
csv_names = []

# Replace ' ' for '-'
for sub in clean_list:
    csv_names.append(sub.replace(' ', '-').lower())

# Export as .csv
for sub, name in zip(sub_dic, csv_names):
    sub_dic[sub][sub].to_csv(f'../project-4/data/clean-data/{name}.csv')

## [BONUS] Calculate min, avg etx

In [17]:
# Populate the DataFrames with seach volumes
for sub in sub_dic:
    
    # Create new columns
    sub_dic[sub] = (sub_dic[sub]
           .assign(Min=round((search_vol[sub]['min']*12) / 52))
           .assign(Max=round((search_vol[sub]['max']*12) / 52))
           .assign(Average=round((((search_vol[sub]['min'] * 12) / 52) + ((search_vol[sub]['max']*12)/52)) / 2))
               )
    
    # Generate new column with random per week
    sub_dic[sub]['Random'] = [np.random.randint(sub_dic[sub]['Min'].min(), sub_dic[sub]['Max'].max()) for row in range(len(sub_dic[sub]))]
    
    # Create a colum trend * mean
    sub_dic[sub]['Search Vol'] = round((sub_dic[sub]['Trend'] / 100) * sub_dic[sub]['Average'], 0)

AttributeError: 'dict' object has no attribute 'assign'

In [None]:
# Check the output
sub_dic['a_4'].head()