# The notebook will include the following sections:

The goal is to create a sample dataset for a data visualization project in Looker Studio. 
The data is fictitious. 

## I. Dataset cleaning
- Uploading the dataset
- Converting column types
- Renaming columns
- Removing columns

## II. Dataset completion
- Adding columns
- Generating randomized values to fill the columns
- Exporting the dataset as a csv file

## <span style="color:blue"> I. Dataset cleaning </span>   

 <span style="color:red">Uploading the dataset  </span>  

In [73]:
import pandas as pd 

# Show every column when a DataFrame is displayed (disable column truncation).
# `pd.set_option` is the documented entry point; `pd.pandas.set_option` only works
# because the package happens to expose itself as an attribute.
pd.set_option('display.max_columns', None)

# Read the raw website-traffic CSV (path is relative to the working directory)
df = pd.read_csv("downloads/website_traffic.csv")
df

Unnamed: 0,Row,Day,Day.Of.Week,Date,Page.Loads,Unique.Visits,First.Time.Visits,Returning.Visits
0,1,Sunday,1,9/14/2014,2146,1582,1430,152
1,2,Monday,2,9/15/2014,3621,2528,2297,231
2,3,Tuesday,3,9/16/2014,3698,2630,2352,278
3,4,Wednesday,4,9/17/2014,3667,2614,2327,287
4,5,Thursday,5,9/18/2014,3316,2366,2130,236
...,...,...,...,...,...,...,...,...
2162,2163,Saturday,7,8/15/2020,2221,1696,1373,323
2163,2164,Sunday,1,8/16/2020,2724,2037,1686,351
2164,2165,Monday,2,8/17/2020,3456,2638,2181,457
2165,2166,Tuesday,3,8/18/2020,3581,2683,2184,499


 <span style="color:red"> Converting column types  </span> 

In [74]:
# Convert the four traffic-count columns to integers.
# Each value has its commas removed (presumably thousands separators, e.g. "2,146")
# before being cast to int.
# Casting to str first keeps this cell re-runnable: on a second run the columns are
# already integers, and calling `.str` on them directly would raise AttributeError.
count_columns = ['Page.Loads', 'Unique.Visits', 'First.Time.Visits', 'Returning.Visits']
for column in count_columns:
    df[column] = df[column].astype(str).str.replace(',', '', regex=False).astype(int)

In [75]:
# Display the dtype of every column to verify the result of the type conversion
df.dtypes

Row                   int64
Day                  object
Day.Of.Week           int64
Date                 object
Page.Loads            int32
Unique.Visits         int32
First.Time.Visits     int32
Returning.Visits      int32
dtype: object

 <span style="color:red"> Renaming columns  </span> 

In [76]:
# Rename all columns to snake_case in a single call.
# Note that two columns are deliberately relabelled for the fictitious dataset:
# 'Unique.Visits' becomes 'total_visits' and 'First.Time.Visits' becomes 'unique_visits'.
# `rename` ignores keys that are no longer present, so this cell can be re-run safely.
df = df.rename(
    columns={
        'Day': 'day',
        'Day.Of.Week': 'day_of_week',
        'Date': 'date',
        'Page.Loads': 'page_loads',
        'Unique.Visits': 'total_visits',
        'First.Time.Visits': 'unique_visits',
        # No trailing space in the new name: a stray space would end up in the CSV header
        'Returning.Visits': 'returning_visits',
    }
)

 <span style="color:red"> Removing columns  </span> 

In [77]:
# Remove the 'Row' column from the DataFrame (df is modified in place)
df.drop(columns=['Row'], inplace=True)

## <span style="color:blue"> II. Dataset completion </span>  

<span style="color:red"> Adding columns </span>

In [78]:
# Create the new columns on df, each initialised to the placeholder value 0
# so that every row has an entry before the real values are generated
for new_column in ('campagne_visits', 'single_page_visits', 'average_duration', 'total_actions'):
    df[new_column] = 0

<span style="color:red">Generating randomized values to fill the columns </span>

In [79]:
import numpy as np

# Seed NumPy's global random generator so that re-running the notebook
# reproduces exactly the same dataset
np.random.seed(42)

# campagne_visits: one random integer per row, between 1/3 and 1/2 of that row's total_visits
# - lower bound: total_visits // 3 (inclusive)
# - upper bound: total_visits // 2 + 1 (randint excludes the upper bound, so the + 1
#   makes the value total_visits // 2 itself reachable)
# - size=len(df) produces one value for every row of the DataFrame
df['campagne_visits'] = np.random.randint(df['total_visits'] // 3, df['total_visits'] // 2 + 1, size=len(df))

# single_page_visits: one random integer per row, between 1/6 and 1/4 of total_visits.
# The + 1 makes the upper bound inclusive, consistent with campagne_visits, and avoids
# a ValueError when total_visits // 6 equals total_visits // 4 (very small totals).
df['single_page_visits'] = np.random.randint(df['total_visits'] // 6, df['total_visits'] // 4 + 1, size=len(df))

# total_actions: one random integer per row in the range 300..2000 (2001 is excluded).
# The range was chosen after looking at the minimum and maximum of the total_visits column.
df['total_actions'] = np.random.randint(300, 2001, size=len(df))


In [80]:
import random

def generate_random_time():
    """Return a random duration between 1 min 10 s and 3 min, formatted as HH:MM:SS.

    The duration is drawn uniformly, in whole seconds, from 70 to 180 inclusive.
    The hours field is always "00"; e.g. 95 seconds is returned as "00:01:35".
    """
    total_seconds = random.randint(70, 180)  # 1:10 .. 3:00, both ends included
    minutes, seconds = divmod(total_seconds, 60)
    # Minutes go in the MM field and seconds in the SS field; putting minutes first
    # would make the value read as hours in an HH:MM:SS column.
    return f'00:{minutes:02d}:{seconds:02d}'

# Seed Python's random module so the generated durations are the same on every run
random.seed(42)

# Fill 'average_duration' with one independently generated random duration per row
df['average_duration'] = [generate_random_time() for _ in range(len(df))]


<span style="color:red"> Export the dataset as a csv file</span>

In [81]:
# Write the completed dataset to 'web_traffic.csv' in the working directory,
# leaving out the DataFrame index so the file contains only the data columns
df.to_csv(path_or_buf='web_traffic.csv', index=False)

# To visualize the data, please see the Looker Studio projects repository on my GitHub profile

The project may be found under the title **"Web Traffic Data Visualization"**