# Data exploratory analysis

The goal of this notebook is to explore the data and identify potentially interesting insights.

In [2]:
import pandas as pd
import numpy as np

# Statistics + plotting
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

## 1. Import data

In [7]:
# Load the combined trends file (trends-all.csv) and discard the
# 'Unnamed: 0' column in the same chained expression.
trends_all = (
    pd.read_csv('../project-4/data/clean-data/trends-all.csv')
    .drop(columns='Unnamed: 0')
)

# Preview the first rows
trends_all.head()

Unnamed: 0,Date,Trend,Category,Sub-category,Keyword,Search Vol (min),Search Vol (max),Number of templates,Search Vol (avge),Search Vol,isPartial,index
0,2015-07-12,21,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145,126,48.461538,False,
1,2015-07-19,31,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145,126,71.538462,False,
2,2015-07-26,20,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145,126,46.153846,False,
3,2015-08-02,41,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145,126,94.615385,False,
4,2015-08-09,27,Video,Facebook Video,Facebook Video template,23.076923,230.769231,145,126,62.307692,False,


## 2. Checking data types

In [10]:
# Inspect the dtype of every column in the loaded frame
trends_all.dtypes

Date                   datetime64[ns]
Trend                           int64
Category                       object
Sub-category                   object
Keyword                        object
Search Vol (min)              float64
Search Vol (max)              float64
Number of templates             int64
Search Vol (avge)               int64
Search Vol                    float64
isPartial                        bool
index                         float64
dtype: object

In [11]:
# Convert the 'Date' column to pandas datetimes
trends_all = trends_all.assign(Date=pd.to_datetime(trends_all['Date']))

## 3. Basic plotting

### 3.1 Search volume evolution

In [None]:
# Volume of searches over time: one line per sub-category on a shared axis
fig, ax = plt.subplots(figsize=(15, 5))

for sub, frame in trends.items():
    try:
        sns.lineplot(data=frame, x='Date', y='Search Vol', ax=ax)
    except (KeyError, TypeError, ValueError) as err:
        # Best effort: keep plotting the remaining sub-categories, but report
        # which one failed rather than silently ignoring the problem.
        print(f'Skipping {sub!r}: {err!r}')

ax.set(title='Search volume by sub-category', xlabel='Date', ylabel='Search Vol');

The Resume sub-category has a much larger search volume than the rest. For the sake of the analysis, it could be interesting to aggregate the sub-categories with fewer than 10,000 monthly searches.

In [None]:
# Group the sub-categories into bins by their average 'Search Vol' so the
# small ones can be plotted together.
# Each entry is (label, exclusive upper bound on the mean); bins are tried in
# order, so a sub-category lands in the first bin whose bound exceeds its mean.
SEARCH_VOL_BINS = [
    ('Under 1k', 1_000),
    ('Under 10k', 10_000),
    ('Under 100k', 100_000),
]

binned_frames = {label: [] for label, _ in SEARCH_VOL_BINS}
individual_frames = {}

for sub, frame in trends.items():
    try:
        # Compute the mean once instead of once per comparison
        mean_vol = frame['Search Vol'].mean()
    except (KeyError, TypeError) as err:
        # Best effort: skip frames without a usable 'Search Vol' column,
        # but say so instead of swallowing the error.
        print(f'Skipping {sub!r}: {err!r}')
        continue

    for label, upper_bound in SEARCH_VOL_BINS:
        if mean_vol < upper_bound:
            binned_frames[label].append(frame)
            break
    else:
        # At or above the largest bound (or mean is NaN): analysed individually
        individual_frames[sub] = frame

# Concatenate each bin once (instead of growing a DataFrame inside the loop),
# then sum per date. An empty bin keeps the empty Date / Search Vol frame.
trends_gp = {}
for label, frames in binned_frames.items():
    if frames:
        combined = pd.concat(frames)
    else:
        combined = pd.DataFrame(columns=['Date', 'Search Vol'])
    trends_gp[label] = combined.groupby('Date', as_index=False).agg('sum')

# Binned groups first, individual sub-categories after
trends_gp.update(individual_frames)

In [None]:
# Check the result: first rows of the aggregated 'Under 100k' group
trends_gp['Under 100k'].head()

Now we have a dictionary holding: a DataFrame that aggregates the sub-categories with fewer than 1k monthly searches ('Under 1k'), one for those with fewer than 10k ('Under 10k'), one for those with fewer than 100k ('Under 100k'), and an individual DataFrame for each sub-category with more than 100k monthly searches.

In [None]:
# Plot the search volume of every group / individual sub-category in trends_gp
fig, ax = plt.subplots(figsize=(15, 5))

for sub, frame in trends_gp.items():
    # `label` feeds the legend so each line can be identified
    sns.lineplot(data=frame, x='Date', y='Search Vol', label=sub, ax=ax)

ax.set(title='Search volume by group', xlabel='Date', ylabel='Search Vol')
ax.legend(title='Group');
    
## [BONUS] - Add legends

It would be useful to list the subcategories in each category.

In [None]:
# [BONUS] - List a summary with the categories in each bin. [Under 1k] | [Under 10k] | [Under 100k]

### 3.2 Search volume aggregated trend

In [None]:
# Combine all searches and find the trend. 
# We will be able to compare each sub_category with the overall trend
# Statistical analysis

In [None]:
# Stack every sub-category frame with a single concat (instead of growing a
# DataFrame inside a loop), then sum per date to get the overall volume.
all_frames = list(trends.values())

if all_frames:
    total_raw = pd.concat(all_frames)
else:
    # Nothing loaded: fall back to an empty Date / Search Vol frame
    total_raw = pd.DataFrame(columns=['Date', 'Search Vol'])

trends_gp['Total'] = total_raw.groupby('Date', as_index=False).agg('sum')

#### Calculate the fitted line

In [None]:
# Add the regressors used by the OLS fit: the 'const' column and a 1-based
# time index `t` (one step per row).
total_with_const = sm.add_constant(trends_gp['Total'], has_constant='add')
trends_gp['Total'] = total_with_const.assign(
    t=range(1, len(total_with_const) + 1)
)
# TODO: optionally add a quadratic trend term (t2) as an extra regressor

In [None]:
# Fit 'Search Vol' on ['const', 't'] by ordinary least squares
total_frame = trends_gp['Total']
X = total_frame[['const', 't']]
y = total_frame['Search Vol']

lin_reg = sm.OLS(endog=y, exog=X)
results_total = lin_reg.fit()

results_total.summary()

In [None]:
# Store the OLS predictions, then overlay them on the observed totals
trends_gp['Total'] = trends_gp['Total'].assign(y_hat=results_total.predict(X))
total_fit = trends_gp['Total']

fig, ax = plt.subplots(figsize=(15, 5))

# Observed values as points, fitted line in green
sns.scatterplot(data=total_fit, x='t', y='Search Vol', ax=ax)
sns.lineplot(data=total_fit, x='t', y='y_hat', color='green', ax=ax);

In [None]:
# Find if there is a correlation between 'Templates' that Canva is offering and 'Search Vol' 
# We will be able to do a linear regression
# Statistical analysis

### 3.3 Search Volume vs. Number of Templates

Is there a correlation between the Search Vol for the Keywords assigned to the category and the number of templates that Canva is offering? Is it statistically significant?

We need to add a column with the mean search volume (the midpoint between `Search Vol (min)` and `Search Vol (max)`) to the `categories` DataFrame.

In [None]:
# Add 'Search Vol (mean)': the midpoint between the min and max search volume
vol_min = categories['Search Vol (min)']
vol_max = categories['Search Vol (max)']
categories['Search Vol (mean)'] = (vol_max - vol_min) / 2 + vol_min

# Bonus (TODO): add the Search Vol with the trends

In [None]:
# Preview the first rows, including the new 'Search Vol (mean)' column
categories.head()

In [None]:
# Scatter plot to judge whether fitting a line is worthwhile
fig, ax = plt.subplots(figsize=(15, 5))

sns.scatterplot(
    data=categories,
    x='Search Vol (mean)',
    y='Number of templates',
    hue='Category',
    ax=ax,
);

In [None]:
# Check for a relationship between the two variables:
# OLS of 'Number of templates' on ['const', 'Search Vol (mean)']
categories = sm.add_constant(categories, has_constant='add')

predictors = categories[['const', 'Search Vol (mean)']]
lin_reg = sm.OLS(categories['Number of templates'], predictors)

result = lin_reg.fit()

result.summary()

In [None]:
# Store the fitted values next to the observations so they can be plotted
fitted_templates = result.predict(categories[['const', 'Search Vol (mean)']])
categories = categories.assign(y_hat=fitted_templates)

In [None]:
# Observations coloured by category, with the fitted line on top
fig, ax = plt.subplots(figsize=(15, 5))

sns.scatterplot(
    data=categories,
    x='Search Vol (mean)',
    y='Number of templates',
    hue='Category',
    ax=ax,
)
sns.lineplot(
    data=categories,
    x='Search Vol (mean)',
    y='y_hat',
    color='g',
    ax=ax,
);

**TODO:** Write the conclusion.

In [3]:
## LEGACY code

# Load one CSV of search trends per sub-category into a dict of DataFrames
# keyed by the sub-category name.

# File names are the sub-category names lower-cased, with ' ' replaced by '-'
csv_names = [sub.replace(' ', '-').lower() for sub in categories['Sub-category']]

# Import the .csv files. 'Date' is parsed as datetime while reading so the
# per-sub-category frames agree with the datetime conversion applied to
# `trends_all`; the 'Unnamed: 0' column is dropped without mutating in place.
trends = {
    sub: pd.read_csv(
        f'../project-4/data/clean-data/{name}.csv',
        parse_dates=['Date'],
    ).drop(columns='Unnamed: 0')
    for name, sub in zip(csv_names, categories['Sub-category'])
}

# Explore the data
trends['Instagram Ad']

Unnamed: 0,Date,Trend,isPartial,Category,Sub-category,Keyword,Search Vol (min),Search Vol (max),Number of templates,Search Vol (avge),Search Vol
0,2015-07-12,0,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,0.000000
1,2015-07-19,0,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,0.000000
2,2015-07-26,0,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,0.000000
3,2015-08-02,0,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,0.000000
4,2015-08-09,0,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
256,2020-06-07,20,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,461.538462
257,2020-06-14,20,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,461.538462
258,2020-06-21,31,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,715.384615
259,2020-06-28,42,False,Ads,Instagram Ad,Instagram Ad template,230.769231,2307.692308,111,1269,969.230769
