In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the dataset
df = pd.read_csv('traffic.csv')

In [None]:
df

Unnamed: 0,event,date,country,city,artist,album,track,isrc,linkid
0,click,2021-08-21,Saudi Arabia,Jeddah,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8
1,click,2021-08-21,Saudi Arabia,Jeddah,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8
2,click,2021-08-21,India,Ludhiana,Reyanna Maria,So Pretty,So Pretty,USUM72100871,23199824-9cf5-4b98-942a-34965c3b0cc2
3,click,2021-08-21,France,Unknown,"Simone & Simaria, Sebastian Yatra",No Llores Más,No Llores Más,BRUM72003904,35573248-4e49-47c7-af80-08a960fa74cd
4,click,2021-08-21,Maldives,Malé,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8
...,...,...,...,...,...,...,...,...,...
93936,pageview,2021-08-25,United States,Longview,KA$HDAMI,epiphany,Reparations!,QZFYY2115255,9c61dba1-9369-4ee4-a215-1d34581cd811
93937,pageview,2021-08-25,Saudi Arabia,Riyadh,Tundra Beats,Beautiful Day,Beautiful Day,QZHN92194591,e849515b-929d-44c8-a505-e7622f1827e9
93938,pageview,2021-08-25,India,Lucknow,Justin Bieber,Mari Berdansa,Yummy,USUM71923046,638b0c7a-8f87-47c0-b444-8ac66d3d7054
93939,pageview,2021-08-25,United States,Unknown,"Kayla Nicole, Taylor Girlz, Flo Milli","Bundles 2 (feat. Flo Milli, Taylor Girlz)","Bundles 2 (feat. Flo Milli, Taylor Girlz)",QMCE32100359,fbf4b935-f961-4b13-b1d8-45ad47093559


In [None]:
# 1. Total and Daily Pageview Events

total_pageviews = df[df['event'] == 'pageview']['event'].count()

# Calculate average daily pageview events
daily_pageviews = df[df['event'] == 'pageview'].groupby('date')['event'].count().mean()

# How many total pageview events did the links in the dataset receive during the entire period?

print(f"Total Pageview Events: {total_pageviews}")

# What is the average number of pageview events per day?
print(f"Average Daily Pageview Events: {daily_pageviews}")

Total Pageview Events: 142015
Average Daily Pageview Events: 20287.85714285714


In [None]:
# 2. Analysis of Other Events:

# What is the total count and distribution of other recorded events in the dataset?

total_other_events = df['event'].count()
event_distribution = df['event'].value_counts()


print(f"Total Other Events: {total_other_events}")
print("Distribution of Other Events:")
print(event_distribution)

Total Other Events: 226278
Distribution of Other Events:
event
pageview    142015
click        55732
preview      28531
Name: count, dtype: int64


In [None]:
# 3. Geographical Distribution:

# Which countries contributed to the pageviews?
countries = df[df['event'] == 'pageview']['country'].value_counts()
print("Number of countries contributed to the pageviews:", len(countries))
print(countries)

Number of countries contributed to the pageviews: 161
country
Saudi Arabia              1963
India                     1798
United States             1464
France                     675
Iraq                       324
                          ... 
Saint Lucia                  1
Uruguay                      1
Aruba                        1
British Virgin Islands       1
Panama                       1
Name: count, Length: 161, dtype: int64


In [None]:
# 4. Click-Through Rate (CTR) Analysis:

# What is the overall click-through rate (CTR) calculated as clicks/pageviews?
clicks = df[df['event'] == 'click']['event'].count()
ctr = clicks / total_pageviews

print(f"Overall Click-Through Rate (CTR): {ctr}")

# • How does the CTR vary across different links?
ctr_by_link = df[df['event'] == 'click'].groupby('linkid')['event'].count() / df[df['event'] == 'pageview'].groupby('linkid')['event'].count()

print("CTR by Link:")
print(ctr_by_link)

Overall Click-Through Rate (CTR): 0.3924374185825441
CTR by Link:
linkid
00073307-ae96-5089-a117-4783afb42f8e    NaN
00126b32-0c35-507b-981c-02c80d2aa8e7    1.0
0018cfff-50a1-5984-9715-01ef2d11a49a    NaN
0033934b-5d16-5a06-af58-d087bcdd3680    NaN
0034d6cf-3bd8-5ffe-aafc-b3959fc48608    NaN
                                       ... 
fff38ca0-8043-50cd-a5f1-f65ebb7105c5    1.0
fff4e5f0-4ee5-5fe7-aa30-e870edaf6ed7    NaN
fff84c0e-90a1-59d8-9997-adc909d50e16    1.0
fffc17a7-f935-5d3e-bd3e-d761fd80d479    0.5
fffd0045-29de-522b-b5d8-35786363bf07    0.5
Name: event, Length: 3839, dtype: float64


In [None]:
# 5. Correlation Analysis:

# Is there a correlation between clicks and previews on a link? If so, is it statistically significant, and how strong is the effect?

clicks_by_link = df[df['event'] == 'click'].groupby('linkid')['event'].count()
pageviews_by_link = df[df['event'] == 'pageview'].groupby('linkid')['event'].count()

# Ensure both series have the same index by aligning them and filling missing values with 0
clicks_by_link, pageviews_by_link = clicks_by_link.align(pageviews_by_link,fill_value=0)

# Calculate the Pearson correlation coefficient and p-value
correlation, p_value = stats.pearsonr(clicks_by_link, pageviews_by_link)

# Print the results
print(f"Correlation between Clicks and Pageviews: {correlation}")
print(f"P-value: {p_value}")
if p_value < 0.05:
    print("The correlation is statistically significant.")
else:
    print("The correlation is not statistically significant.")

# Perform tests for both potential linear relationships and categorical (binary) relationships between these variables

Correlation between Clicks and Pageviews: 0.9940014167230311
P-value: 0.0
The correlation is statistically significant.


In [None]:
# Categorical Relationship using Chi-Square Test
# Convert clicks and pageviews to binary (0 or 1) based on whether they are greater than 0
clicks_binary = np.where(clicks_by_link > 0, 1, 0)
pageviews_binary = np.where(pageviews_by_link > 0, 1, 0)

# Create a contingency table
contingency_table = pd.crosstab(clicks_binary, pageviews_binary)

# Perform the Chi-Square test
chi2, chi_p_value, _, _ = stats.chi2_contingency(contingency_table)

print(f"Chi-Square Test between Clicks and Pageviews: {chi2}")
print(f"P-value: {chi_p_value}")
if chi_p_value < 0.05:
    print("The association is statistically significant.")
else:
    print("The association is not statistically significant.")


Chi-Square Test between Clicks and Pageviews: 0.2183090235621212
P-value: 0.6403313890647948
The association is not statistically significant.
