## First Inspection

# Language Analysis

In [None]:
-- Daily traffic per page, tagged with the page's Wikipedia language code.
-- The code is the pair of lowercase letters directly preceding
-- ".wikipedia.org" in the page name (e.g. "en" in "en.wikipedia.org").
-- One pattern, with the dots escaped so they only match a literal ".",
-- serves both detection and extraction: REGEXP_EXTRACT returns the capture
-- group, or NULL when nothing matches, and IFNULL maps that NULL to 'na'.
SELECT
    date,
    IFNULL(REGEXP_EXTRACT(page, r'([a-z]{2})\.wikipedia\.org'), 'na') AS language,
    traffic
FROM
    `web_traffic_dataset.web_traffic_tb`

Data visualization in Looker Studio:

![Plot language analysis](../plots_bigquery/plot_language_analysis.png)

Insights:
- Pages in English
- English and Russian pages show very large traffic spikes around August 2016
  - Possibly due to the 2016 Summer Olympics and the US presidential election
- Spanish traffic follows a periodic cycle

# Weekdays - Weekend Analysis

In [None]:
# Calendar ordering (Monday first) used to arrange the aggregated rows
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Tag every row with the name of the weekday its date falls on
df_melted['day_of_week'] = df_melted['date'].dt.day_name()

# Mean of 'views' for each weekday, with rows placed in calendar order
pivot_table_mean = (
    df_melted
    .pivot_table(index='day_of_week', values='views', aggfunc='mean')
    .reindex(days_of_week)
)

In [None]:
# Line chart of the mean views for each weekday
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(pivot_table_mean.index, pivot_table_mean['views'], color='skyblue')
ax.set(
    xlabel='Day of the Week',
    ylabel='Average Views',
    title='Average Views for Each Day of the Week',
)
# Tilt the weekday labels and draw horizontal grid lines only
ax.tick_params(axis='x', labelrotation=45)
ax.grid(axis='y')

# Render the figure
plt.show()

Data visualization in Looker Studio:

![Weekday analysis](../plots_bigquery/plot_weekday_analysis.png)

Insights:
- Average views are highest on Monday and Sunday

# Months Analysis

In [None]:
# Calendar ordering (January first) used to arrange the aggregated rows
months_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Tag every row with the name of the month its date falls in
df_melted['month'] = df_melted['date'].dt.month_name()

# Mean of 'views' for each month name, with rows placed in calendar order
pivot_table = (
    df_melted
    .pivot_table(index='month', values='views', aggfunc='mean')
    .reindex(months_order)
)

In [None]:
# Line chart of the mean views for each month
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(pivot_table.index, pivot_table['views'], color='skyblue')
ax.set(
    xlabel='Month',
    ylabel='Average Views',
    title='Average Views per Month',
)
# Tilt the month labels and draw horizontal grid lines only
ax.tick_params(axis='x', labelrotation=45)
ax.grid(axis='y')

# Render the figure explicitly, as the weekday plot cell does, so the chart
# also appears when the notebook is not using an inline backend
plt.show()

Insights:
- Fewer views during the warmer months, but a peak in August due to elections and sports events

In [None]:
# Make sure the output folder exists (creating missing parents, and not
# failing if it is already there), then write `df` to CSV without the index
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
df.to_csv('../data/processed/train_1_processed.csv', index=False)