In [None]:
import os

import numpy as np
import pandas as pd
import requests as req

# Scraping packages
from bs4 import BeautifulSoup as bs

# Google Trends API connection
# documentation -> https://pypi.org/project/pytrends/#api-methods
from pytrends.request import TrendReq
pytrends = TrendReq()

# Statistics + plotting
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm


##### CLEAN IT

# 1. Data from canva.com

In [None]:
# Read the text file holding Canva's HTML ("https://www.canva.com/create-a-design").
# The context manager closes the file handle as soon as the text has been read.
with open('../project-4/canva.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# Parse the page so the category <div>s can be queried in the next cells
soup = bs(content, 'html.parser')

In [3]:
# Category name -> id of the <div> that wraps that category on the page
cats_id = {'Video': 'genre-video',
           'Social Media': 'genre-social-media',
           'Documents': 'genre-documents',
           'Personal': 'genre-personal',
           'Education': 'genre-education',
           'Marketing': 'genre-marketing',
           'Events': 'genre-events',
           'Ads': 'genre-ads'}

# Scrape the sub-categories of every category into flat (Category, Sub-category) records.
# `cats_id` is left untouched so the cell can be re-read without its values changing type.
cat_records = []
for category, div_id in cats_id.items():
    section = soup.find('div', {'id': div_id})
    if section is None:
        # Fail with a readable message instead of an AttributeError on None
        raise ValueError(f"No <div id='{div_id}'> found for category '{category}'")
    for subcategory in section.find_all('div', {'class': 't24rzA'}):
        cat_records.append({'Category': category, 'Sub-category': subcategory.text})

# Build the DataFrame once (no concat-in-a-loop) with a clean 0..n-1 index
cats = pd.DataFrame(cat_records, columns=['Category', 'Sub-category'])

In [4]:
# Explore the results: show the Category / Sub-category table
cats

Unnamed: 0,Category,Sub-category
0,Video,Facebook Video
1,Video,Social Feed Video Ad
2,Video,Pinterest Video Pin
3,Video,In-Stream Video Ad
4,Video,Youtube Intro
...,...,...
1,Ads,Instagram Ad
2,Ads,Large Rectangle Ad
3,Ads,Leaderboard Ad
4,Ads,Wide Skyscraper Ad


There are 8 different categories that contain a total of 72 sub-categories 

# 2. Google trends

In [6]:
# API call to Google Trends
def get_trend(keyword: str, timeframe: str = 'today 5-y', geo: str = ''):
    """Fetch the Google Trends interest-over-time series for a single keyword.

    Parameters
    ----------
    keyword : str
        Search term sent to Google Trends.
    timeframe : str, optional
        pytrends timeframe string. Defaults to the last 5 years.
    geo : str, optional
        Region to query. The default empty string queries globally.

    Returns
    -------
    pandas.DataFrame
        The pytrends result with its index moved into a ``Date`` column and the
        keyword column renamed to ``Trend``. When Google Trends returns no data,
        an empty frame with the columns ``Date``, ``Trend`` and ``isPartial`` is
        returned so callers can rely on those columns being present.
    """
    # Build the payload for this one keyword
    pytrends.build_payload([keyword], cat=0, timeframe=timeframe, geo=geo, gprop='')
    df = pytrends.interest_over_time()

    # Low-volume keywords can come back empty; keep the expected schema anyway
    if df.empty:
        return pd.DataFrame(columns=['Date', 'Trend', 'isPartial'])

    return df.reset_index().rename(columns={'date': 'Date', keyword: 'Trend'})
    
# .drop('isPartial', axis=1, inplace=True)

In [7]:
# Start with sub-category -> keyword ("<sub-category> template") for every scraped row
sub_dic = {sub: sub + " template" for sub in cats['Sub-category']}

# Handle on the sub-category names (a live view over the dictionary keys)
clean_list = sub_dic.keys()

# Replace each keyword with the Google Trends DataFrame fetched for it
for category, keyword in sub_dic.items():
    sub_dic[category] = get_trend(keyword)

In [10]:
# We will need to use those keys: print the sub-category names held in `clean_list`
print(clean_list)

dict_keys(['Facebook Video', 'Social Feed Video Ad', 'Pinterest Video Pin', 'In-Stream Video Ad', 'Youtube Intro', 'Video Collage', 'Slideshow Video', 'Animated Social Media', 'Instagram Post', 'Instagram Story', 'Facebook Post', 'Facebook Story', 'Facebook Cover', 'Facebook App Ad', 'Social Media', 'Your Story', 'Whatsapp Story', 'Presentation', 'A4 Document', 'Letter', 'Letterhead', 'Resume', 'Presentation (4:3)', 'Report', 'Daily Report', 'Memo', 'Invoice', 'Card', 'Birthday Card', 'Photo Collage', 'Photo Book', 'Calendar', 'Planner', 'Storyboard', 'Graphic Organiser', 'Scrapbook', 'Desktop Wallpaper', 'Postcard', 'Yearbook', 'Class Schedule', 'Worksheet', 'Lesson Plan', 'Report Card', 'Bookmark', 'Seating Chart', 'Table of Contents', 'Mind Map', 'Logo', 'Poster', 'Flyer', 'Business Card', 'Infographic', 'Brochure', 'Product Label', 'Gift Certificate', 'Gift Tag', 'Ticket', 'Newsletter', 'Invitation', 'Birthday Invitation', 'Wedding Invitation (Portrait)', 'Announcement', 'Programme

In [11]:
# Check the output: first rows of the trend DataFrame stored for one sub-category
sub_dic['Facebook Video'].head()

Unnamed: 0,Date,Trend,isPartial
0,2015-07-12,21,False
1,2015-07-19,42,False
2,2015-07-26,21,False
3,2015-08-02,21,False
4,2015-08-09,62,False


# 3. Keyword Planner

Importing the weekly search range

In [13]:
# Sample keywords, each with the match type to estimate
keywords = [
    {'text': 'mars cruise', 'matchType': 'BROAD'},
    {'text': 'cheap cruise', 'matchType': 'PHRASE'},
    {'text': 'cruise', 'matchType': 'EXACT'},
]

# Wrap every keyword in the nested structure used for a keyword estimate request
keyword_estimate_requests = [
    {
        'keyword': {
            'xsi_type': 'Keyword',
            'matchType': entry['matchType'],
            'text': entry['text'],
        }
    }
    for entry in keywords
]

# Create ad group estimate requests: one ad group holding all keyword requests
max_cpc = {'xsi_type': 'Money', 'microAmount': '1000000'}
adgroup_estimate_requests = [
    {'keywordEstimateRequests': keyword_estimate_requests, 'maxCpc': max_cpc}
]

In [None]:
# Average monthly search volume (min/max of the reported range), copied by hand from
# Google Keyword Planner: https://ads.google.com/aw/keywordplanner/ideas/new
search_vol = {'presentation': {'min': 10000, 'max': 100000},
              'a_4': {'min': 10, 'max': 100},
              'letter': {'min': 10000, 'max': 100000},
              'letterhead': {'min': 10000, 'max': 100000},
              'resume': {'min': 100000, 'max': 1000000},
              'presentation_4_3': {'min': 10, 'max': 100},
              'report': {'min': 10000, 'max': 100000},
              'daily_report': {'min': 1000, 'max': 100000},
              'memo': {'min': 10000, 'max': 100000},
              'invoice': {'min': 100000, 'max': 1000000}}

# Seeded generator so the 'Random' column is reproducible between runs
rng = np.random.default_rng(42)

# Populate the DataFrames with search volumes
skipped = []
for sub, frame in sub_dic.items():

    # NOTE(review): `sub_dic` may hold sub-categories that have no hand-entered
    # volume (keys differ from the scraped names); skip them instead of raising
    # a KeyError half-way through the loop.
    if sub not in search_vol:
        skipped.append(sub)
        continue

    # Convert the monthly range to a weekly one (x 12 months / 52 weeks)
    monthly = search_vol[sub]
    weekly_min = round((monthly['min'] * 12) / 52)
    weekly_max = round((monthly['max'] * 12) / 52)
    weekly_avg = round((((monthly['min'] * 12) / 52) + ((monthly['max'] * 12) / 52)) / 2)

    frame = frame.assign(
        Min=weekly_min,
        Max=weekly_max,
        Average=weekly_avg,
        # One random weekly volume per row, drawn from [Min, Max)
        Random=rng.integers(weekly_min, weekly_max, size=len(frame)),
    )

    # Scale the weekly average by the trend value (Trend / 100)
    frame['Search Vol'] = round((frame['Trend'] / 100) * frame['Average'], 0)
    sub_dic[sub] = frame

if skipped:
    print(f"No search volume available for {len(skipped)} sub-categories (left unchanged): {skipped}")

In [None]:
# Check the output: first rows of the 'a_4' frame after the search-volume columns were added
sub_dic['a_4'].head()

In [None]:
## BONUS - Integrate with the API

# 4. Templates that Canva is offering

Count the number of templates that Canva offers on its website for each sub-category.

In [None]:
# Number of templates per sub-category (hand-entered values)
templates_vol = {'presentation': 1000,
                 'a_4': 3500,
                 'letter': 886,
                 'letterhead': 886,
                 'resume': 1483,
                 'presentation_4_3': 1146,
                 'report': 1273,
                 'daily_report': 98,
                 'memo': 426,
                 'invoice': 167}

# Assign the count to each DataFrame as a constant column
no_template_count = []
for sub, frame in sub_dic.items():

    # NOTE(review): skip sub-categories without a hand-entered count instead of
    # stopping the loop with a KeyError.
    if sub not in templates_vol:
        no_template_count.append(sub)
        continue

    frame['Number of templates'] = templates_vol[sub]

if no_template_count:
    print(f"No template count available for {len(no_template_count)} sub-categories: {no_template_count}")

In [None]:
# Check the output: the 'a_4' frame should now include 'Number of templates'
sub_dic['a_4'].head()

## Check Data types + final cleaning

In [None]:
# Inspect the column data types of one representative frame
sub_dic['a_4'].dtypes

# Basic plotting

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Volume of searches per sub-category.
# Line colour for each sub-category, keyed by sub-category name.
sub_colors = {'presentation': 'red',
              'a_4': 'orange',
              'letter': 'blue',
              'letterhead': 'grey',
              'resume': 'black',
              'presentation_4_3': 'yellow',
              'report': 'purple',
              'daily_report': 'brown',
              'memo': 'green',
              'invoice': 'magenta'}

plt.figure(figsize=(15,5))
ax = plt.gca()

# Look each frame up by name so a colour always belongs to its own sub-category
# (zipping `sub_dic` with the colour values paired them by position only).
for sub, colr in sub_colors.items():
    frame = sub_dic.get(sub)
    if frame is None or 'Search Vol' not in frame.columns:
        continue

    # One line per sub-category with the volume of searches
    sns.lineplot(data=frame,
                 x='Date',
                 y='Search Vol',
                 color=colr,
                 label=sub,
                 ax=ax)

ax.set_title('Search volume per sub-category')
if ax.lines:
    ax.legend(title='Sub-category');

The Resume and Invoice sub-categories have far more search volume than the rest. For the sake of the analysis, it could be interesting to add together (aggregate) the sub-categories with fewer than 10,000 monthly searches.

In [None]:
# Group the sub-categories into bands by their average *monthly* searches
# (weekly mean * 52 / 12):
#   'Under 10k'  -> fewer than 10,000
#   'Under 100k' -> from 10,000 up to (not including) 100,000
#   everything else keeps its own DataFrame under its sub-category name
band_frames = {'Under 10k': [], 'Under 100k': []}
individual = {}

for sub, frame in sub_dic.items():

    # Only frames that already carry a search volume can be classified
    if 'Search Vol' not in frame.columns:
        continue

    monthly_avg = (frame['Search Vol'].mean() * 52) / 12

    if monthly_avg < 10000:
        band_frames['Under 10k'].append(frame)
    elif monthly_avg < 100000:
        # Includes exactly 10,000, which previously fell through to the last branch
        band_frames['Under 100k'].append(frame)
    else:
        # The rest will be analysed individually
        individual[sub] = frame

# Concatenate each band once, then sum per date. The empty seed frame guarantees
# the 'Date' and 'Search Vol' columns exist even when a band has no members.
band_seed = pd.DataFrame(columns=['Date', 'Search Vol'])
sub_dic_gp = {
    band: pd.concat([band_seed, *frames]).groupby('Date', as_index=False).agg('sum')
    for band, frames in band_frames.items()
}
sub_dic_gp.update(individual)

In [None]:
# Checking the results: first rows of the aggregated 'Under 100k' band
sub_dic_gp['Under 100k'].head()

Now we have a dictionary with a DataFrame that aggregates the subcategories with fewer than 10k monthly searches as 'Under 10k', a DataFrame that aggregates the subcategories with fewer than 100k monthly searches as 'Under 100k', and individual DataFrames for the subcategories with more than 100k monthly searches.

In [None]:
# Plot the search volume of every group on shared axes
plt.figure(figsize=(15,5))
ax = plt.gca()

# Colours are taken in order from `sub_colors`; they only distinguish the lines
for sub, colr in zip(sub_dic_gp, sub_colors.values()):

    # One labelled line per group so the legend can identify it
    sns.lineplot(data=sub_dic_gp[sub],
                 x='Date',
                 y='Search Vol',
                 color=colr,
                 label=sub,
                 ax=ax)

ax.set_title('Search volume per group')
if ax.lines:
    ax.legend(title='Group');

It would be useful to list the subcategories in each category.

In [None]:
# [BONUS] - List a summary with the categories in each bin. [Under 10k] | [Under 100k] | [Over 100k]

### Find 'Search Vol' trend

In [None]:
# Combine all searches and find the trend. 
# We will be able to compare each sub_category with the overall trend
# Statistical analysis

In [None]:
# Stack every sub-category frame in a single concat (instead of growing the frame
# inside a loop) and sum per date. The empty seed frame keeps 'Date' and
# 'Search Vol' as the leading columns, as before.
total_seed = pd.DataFrame(columns=['Date', 'Search Vol'])

sub_dic_gp['Total'] = (
    pd.concat([total_seed, *sub_dic.values()])
      .groupby('Date', as_index=False)
      .agg('sum')
)

#### Calculate the fitted line

In [None]:
# Add the intercept column ("const") and a running time index t = 1..n
sub_dic_gp['Total'] = (
    sm.add_constant(sub_dic_gp['Total'])
      .assign(t=lambda frame: range(1, len(frame) + 1))
)
# (A squared time term `t2` was sketched here but is not part of the model.)

# Response and design matrix for the linear time trend
y = sub_dic_gp['Total']["Search Vol"]
X = sub_dic_gp['Total'][["const", "t"]]

# Fit ordinary least squares and show the regression table
lin_reg = sm.OLS(endog=y, exog=X)
results_total = lin_reg.fit()

results_total.summary()

In [None]:
# Keep the fitted values next to the observations
sub_dic_gp['Total'] = sub_dic_gp['Total'].assign(y_hat=results_total.predict(X))

plt.figure(figsize=(15,5))

# Observed totals as points ...
sns.scatterplot(x='t', y='Search Vol', data=sub_dic_gp['Total']);

# ... and the fitted line on top, in green
sns.lineplot(x='t', y="y_hat", color="green", data=sub_dic_gp['Total']);

In [None]:
# Run OLS
# NOTE(review): this model uses "Date" as the dependent variable and
# 'Search Vol' as the regressor — confirm whether that orientation is intended.
lin_reg = sm.OLS(sub_dic_gp['Total']["Date"],
                 sub_dic_gp['Total'][['const', 'Search Vol']])

# NOTE(review): this `result` is overwritten a few lines below before it is read.
result = lin_reg.fit()

# NOTE(review): `corr` is first created in a later cell of this notebook, so on a
# fresh kernel run from top to bottom this line has no `corr` to work on.
corr = sm.add_constant(corr)
corr.head()

# Run OLS
# Same regression as the later "'Search Vol' vs. templates" cell:
# "Search Vol (avge)" on a constant and "Number of templates".
lin_reg = sm.OLS(corr["Search Vol (avge)"],
                 corr[["const", "Number of templates"]])

result = lin_reg.fit()

# Last expression of the cell: displays the regression table
result.summary()
# result.summary()

In [None]:
plt.figure(figsize=(15,5))

# Total search volume over time, drawn as a line
ax = sns.lineplot(x='Date', y='Search Vol', data=sub_dic_gp['Total'])

# Seaborn's scatter + regression fit of the same two columns
# NOTE(review): confirm regplot accepts the 'Date' column as x in this environment
ax = sns.regplot(x='Date', y='Search Vol', data=sub_dic_gp['Total'])

In [None]:
# Find if there is a correlation between 'Templates' that Canva is offering and 'Search Vol' 
# We will be able to do a linear regression
# Statistical analysis

### 'Search Vol' vs. # 'Template '

Is there a correlation between the Search Vol for the Keywords assigned to the category and the number of templates that Canva is offering? Is it statistically significant?

In [None]:
# One row per sub-category: its mean search volume and its template count,
# both truncated to integers
corr = pd.DataFrame({
    'Sub-Category': list(sub_dic),
    'Search Vol (avge)': [int(frame['Search Vol'].mean()) for frame in sub_dic.values()],
    'Number of templates': [int(frame['Number of templates'].mean()) for frame in sub_dic.values()],
})

corr

In [None]:
# Scatter plot used to judge whether fitting a line makes sense:
# template count on x, average search volume on y, one colour per sub-category
plt.figure(figsize=(15,5))

sns.scatterplot(x='Number of templates',
                y='Search Vol (avge)',
                hue='Sub-Category',
                data=corr);

In [None]:
# See if there is a correlation between the two variables.
# First add the intercept column ("const") to the summary table
corr = sm.add_constant(corr)

# Run OLS: average search volume on a constant and the number of templates
lin_reg = sm.OLS(endog=corr["Search Vol (avge)"],
                 exog=corr[["const", "Number of templates"]])
result = lin_reg.fit()

result.summary()

In [None]:
# Add the fitted values of the regression above as a new 'y_hat' column
corr = corr.assign(y_hat=result.predict(corr[["const", "Number of templates"]]))

In [None]:
plt.figure(figsize=(15,5))

# Observed points, one colour per sub-category
sns.scatterplot(x='Number of templates',
                y='Search Vol (avge)',
                hue='Sub-Category',
                data=corr);

# Fitted regression line in green
sns.lineplot(x='Number of templates',
             y="y_hat",
             color="g",
             data=corr);

---------------------------------------------------

## Bonus

1. Try to get the data with requests. Now it is hardcoded
2. Segment the subcategories by categories


In [None]:
# Empty frame for the sub-category -> category mapping (column name typo "Categoty" fixed)
pd.DataFrame(columns=["Subcategory", "Category"])

In [None]:
# Browser-like request headers for https://www.canva.com/create-a-design.
#
# The HTTP/2 pseudo-headers copied from the browser (authority, method, path,
# scheme) are not regular request headers and are left out. 'br' is not
# advertised because requests only decodes Brotli when an extra package is
# installed.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
}

# Session credentials must never be committed with the notebook: read them from
# the environment (CANVA_COOKIE / CANVA_TOKEN) and only send them when present.
canva_cookie = os.environ.get('CANVA_COOKIE')
if canva_cookie:
    headers['cookie'] = canva_cookie

canva_token = os.environ.get('CANVA_TOKEN')
if canva_token:
    headers['token'] = canva_token


In [None]:
# `select` takes a CSS selector; the attribute dict belongs to `find`/`find_all`.
# Passed to `select`, it was read as a namespace map, so every <div> matched.
soup.select('div#genre-video')

# soup2 = bs(subset, 'html.parser')

# soup2.find_all('p', {'class':'MpjE4w bSncsg n9zSJA ZTpOuQ u9XXAg HnhAQA l1bE7Q mNfh2Q'})

#for cat in subCat_list:
#    sub_cat.append(cat.text)

#        for card in soup.select("div.result-card__contents"):
 #           title = card.findChild("h3", recursive=False)
  #          company = card.findChild("h4", recursive=False)
   #         location = card.findChild("span", attrs={"class": "job-result-card__location"}, recursive=True)
    #        titles.append(title.string)
     #       companies.append(company.string)
      #      locations.append(location.string)

In [None]:
#video = soup.find_all('div', {'class': 'K1bdNw ElK36A eGNjaw RTlscw'})

#[category.find('p', {'class': 'MpjE4w bSncsg n9zSJA ZTpOuQ u9XXAg HnhAQA l1bE7Q mNfh2Q'}).text for category in video]
    
#video_list = []
#for vid_cat in video:
#    video_list.append(vid_cat.text)
    
#video_list 

In [None]:
### LEGACY CODE
# Create DataFrame where we will combine the subcategory info
#tem = pd.DataFrame(columns=['Subcategory', 'Category'])

## MVP version
#categories = {'Documents': ['Presentation', 'A4 Document', 'Letter', 'Letterhead', 'Resume', 'Presentation (4:3)', 'Report', 'Daily Report', 'Memo', 'Invoice']}

#documents = pd.DataFrame(columns=['Subcategory', 'Category'])
#documents['Subcategory'] = ['Presentation', 'A4 Document', 'Letter', 'Letterhead', 'Resume', 'Presentation (4:3)', 'Report', 'Daily Report', 'Memo', 'Invoice']
#documents['Category'] = 'Documents'
#documents

# Create a dictionary with the cat - sub_cat relation
### (BONUS) scrape it

# Populate Subcategory Series
#tem['Subcategory'] = list(set(sub_cat))

#tem[tem['Subcategory'] in categories.values()]

### LEGACY CODE
# Keyword evaluated for each hand-named sub-category 'sub'
sub_dic = {
    'presentation': 'presentation template',
    'a_4': 'A4 template',
    'letter': 'letter template',
    'letterhead': 'letterhead template',
    'resume': 'resume template',
    'presentation_4_3': '4:3 template',
    'report': 'report template',
    'daily_report': 'daily report template',
    'memo': 'memo template',
    'invoice': 'invoice template',
}

# Add "<sub-category> template" for every scraped sub-category as well
sub_dic.update({sub: sub + " template" for sub in cats['Sub-category']})

# Handle on the sub-category names (a live view over the dictionary keys)
clean_list = sub_dic.keys()

# Replace each keyword with the Google Trends DataFrame fetched for it
for category, keyword in sub_dic.items():
    sub_dic[category] = get_trend(keyword)
    
# LEGACY CODE
#presentation = get_trend(clean_dic['presentation'])
#a4 = get_trend(clean_dic['a4'])
#letter = get_trend(clean_dic['letter'])
#letterhead = get_trend(clean_dic['letterhead'])
#resume = get_trend(clean_dic['resume'])
#presentation_4_3 = get_trend(clean_dic['presentation_4_3'])
#report = get_trend(clean_dic['report'])
#daily_report = get_trend(clean_dic['daily_report'])
#memo = get_trend(clean_dic['memo'])
#invoice = get_trend(clean_dic['invoice'])