<a href="https://colab.research.google.com/github/orangegreen212/cv/blob/main/Analysis_sport_site_data_A_B_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pylab
import scipy.stats as stats
import statsmodels.stats.api as sms
from math import ceil

# Uploading and Cleaning Data

In [None]:
# Read both exported site datasets; the literal paths point at the Kaggle input mount.
df_1, df_2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/sport-site-data/sport_data_1.csv',
        '/kaggle/input/sport-site-data/sport_data_2.csv',
    )
)

# Print the column/dtype summary of the first dataset, then show its first rows
# as the cell output.
df_1.info()
df_1.head()


In [None]:

# Column/dtype summary of the second dataset, followed by a preview of its
# first five rows (rendered as the cell output).
df_2.info()
df_2.head(n=5)

Description for each column:

   **Date:** The date when the data was recorded.
   
   **Page Views:** The total number of pages viewed by visitors.
   
   **Unique Visitors:** The number of distinct individuals visiting the site
   
   **Bounce Rate:** The percentage of visitors who leave the site after viewing only one page.
   
   **Avg Session Duration (s):** The average length of time visitors spend on the site in seconds.
   
   **Conversion Rate (%):** The percentage of visitors who complete a desired action or goal (e.g., make a purchase).
   
   **Traffic Source:** The origin of the website traffic (e.g., search engines, social media, direct).
   
   **Revenue:** The total income generated from visitors during the period.

In [None]:
# Parse 'Date' and store it back as a 'yyyy-mm-dd' string in a single step;
# any time-of-day component in the raw values is dropped by the format.
df_1['Date'] = pd.to_datetime(df_1['Date']).dt.strftime('%Y-%m-%d')

print(df_1.head())


In [None]:
# Same normalisation for the second dataset: parse 'Date', then keep only the
# 'yyyy-mm-dd' text representation.
df_2['Date'] = pd.to_datetime(df_2['Date']).dt.strftime('%Y-%m-%d')

print(df_2.head())

In [None]:
# pandas (pd), matplotlib.pyplot (plt) and seaborn (sns) are imported once in the
# notebook's first code cell, so they are not re-imported here.

# Index df_2 by a real datetime 'Date' so it can be resampled by week.
# The guard keeps the cell re-runnable: after the first run 'Date' is already the
# index, and looking it up as a column again would raise KeyError.
if 'Date' in df_2.columns:
    df_2['Date'] = pd.to_datetime(df_2['Date'])
    df_2 = df_2.set_index('Date')

# Weekly aggregation: counts and revenue are summed, rate/duration metrics are averaged.
weekly_data = df_2.resample('W').agg({
    'Page Views': 'sum',
    'Unique Visitors': 'sum',
    'Bounce Rate': 'mean',
    'Avg Session Duration (s)': 'mean',
    'Conversion Rate (%)': 'mean',
    'Revenue': 'sum'
})

# Plot-only copy whose index is plain 'yyyy-mm-dd' text, so the bar charts do not
# label every bar with a full timestamp. weekly_data itself keeps its DatetimeIndex.
weekly_plot = weekly_data.copy()
weekly_plot.index = weekly_plot.index.strftime('%Y-%m-%d')

# One panel per metric group: (axes, columns, colours, title, y-label).
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
panels = [
    (axes[0, 0], ['Page Views', 'Unique Visitors'], ['#FF9999', '#66B2FF'],
     'Weekly Page Views and Unique Visitors', 'Count'),
    (axes[0, 1], ['Bounce Rate', 'Conversion Rate (%)'], ['#99FF99', '#FFCC99'],
     'Weekly Bounce Rate and Conversion Rate', 'Percentage'),
    (axes[1, 0], ['Avg Session Duration (s)'], ['#CCFF99'],
     'Weekly Average Session Duration', 'Duration (s)'),
    (axes[1, 1], ['Revenue'], ['#FF6666'],
     'Weekly Revenue', 'Revenue'),
]
for ax, columns, colors, title, ylabel in panels:
    weekly_plot[columns].plot(kind='bar', ax=ax, color=colors)
    ax.set_title(title)
    ax.set_xlabel('Week')
    ax.set_ylabel(ylabel)
    ax.legend(title='Metrics')

fig.tight_layout()
plt.show()


# Correlation Matrix (excluding non-numeric columns)
numeric_df = df_2.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()


# Perform A/B testing



    Baseline conversion rate :  5%

    Minimum detectable effect: 5%

    Statistical power: 90%

    Significance level: 5%


Determine the required sample size for our DataFrames

In [None]:
# Design parameters of the power analysis.
# NOTE(review): the markdown above lists a 5% baseline conversion rate, while this
# cell uses 0.03 — confirm which value is intended.
baseline_conversion_rate = 0.03
minimum_detectable_effect = 0.05  # added to the baseline as an absolute lift

power = 0.90
alpha = 0.05

# Standardised effect size between the baseline and the target proportion.
target_conversion_rate = baseline_conversion_rate + minimum_detectable_effect
effect_size = sms.proportion_effectsize(baseline_conversion_rate, target_conversion_rate)

# Solve for the number of observations (ratio=1, i.e. equally sized groups) and
# round up to a whole observation.
power_analysis = sms.NormalIndPower()
required_n = int(np.ceil(
    power_analysis.solve_power(effect_size, power=power, alpha=alpha, ratio=1)
))

print("Required sample size:", required_n)



In [None]:
# Draw required_n rows from df_1 without replacement.
# random_state pins the draw, so the sample — and every statistic later computed
# from it — is identical after Restart & Run All.
sample_1 = df_1.sample(n=required_n, replace=False, random_state=42)

print(sample_1)


In [None]:
# Draw required_n rows from df_2 without replacement.
# random_state pins the draw, so the sample — and every statistic later computed
# from it — is identical after Restart & Run All.
sample_2 = df_2.sample(n=required_n, replace=False, random_state=42)

print(sample_2)

# Calculate Average Conversion Rate in original DataFrames and Samples DataFrames

Calculating the average conversion rate in the original DataFrames

In [None]:
# Mean of the 'Conversion Rate (%)' column over every row of each full dataset.
# average_conversion_rate is reused, so it ends up holding the df_2 value.
for report_label, frame in (
    ("Average Conversion Rate Sport 1:", df_1),
    ("Average Conversion Rate Sport 2:", df_2),
):
    average_conversion_rate = frame['Conversion Rate (%)'].mean()
    print(report_label, average_conversion_rate)

Calculating the average conversion rate in the sample DataFrames

In [None]:
# Mean conversion rate of each drawn sample (same column as for the full datasets).
conversion_rate_sample_1, conversion_rate_sample_2 = (
    sample['Conversion Rate (%)'].mean() for sample in (sample_1, sample_2)
)

print("Average Conversion Rate Sport Sample 1:", conversion_rate_sample_1)
print("Average Conversion Rate Sport Sample 2:", conversion_rate_sample_2)

In [None]:
# Two-row frame holding the mean conversion rate of each sample, for plotting.
data = {
    'Sample': ['Sample 1', 'Sample 2'],
    'Conversion Rate': [conversion_rate_sample_1, conversion_rate_sample_2],
}
df = pd.DataFrame(data)

# Bar chart drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(6, 5))
sns.barplot(x='Sample', y='Conversion Rate', data=df, ax=ax)

# Label each bar with its value: the x tick positions are read back from the
# axes after plotting, and the text is placed 0.01 above the bar top.
for tick_x, value in zip(ax.get_xticks(), df['Conversion Rate']):
    ax.annotate(str(value), xy=(tick_x, value + 0.01), ha='center', va='bottom')

ax.set_title('Conversion Rate Comparison')
ax.set_ylabel('Conversion Rate')
plt.show()

# Finding the p-value and checking the H0 and Ha hypotheses

T-Test

In [None]:
# Two-sample t-test (scipy default: pooled variance) on the conversion-rate
# columns of the two full datasets.
t_stat, p_value = stats.ttest_ind(
    df_1['Conversion Rate (%)'],
    df_2['Conversion Rate (%)'],
)

# Report the decision at the 5% significance level, then the p-value itself.
verdict = (
    "There are statistically significant differences between the mean values of the samples."
    if p_value < 0.05
    else "There are no statistically significant differences between the mean values of the samples."
)
print(verdict)
print("P-Value:", p_value)


Welch's t-test

In [None]:
# Conversion-rate series of the two datasets being compared.
CR_1 = df_1['Conversion Rate (%)']
CR_2 = df_2['Conversion Rate (%)']

# Sample variances (ddof=1); printed so the reader can see whether they differ.
var_CR_1 = np.var(CR_1, ddof=1)
var_CR_2 = np.var(CR_2, ddof=1)
print("Variance of sample 1:", var_CR_1)
print("Variance of sample 2:", var_CR_2)

# The t statistic is undefined when a sample has zero variance, so only test otherwise.
if var_CR_1 > 0 and var_CR_2 > 0:
    # equal_var=False selects Welch's t-test, which does not assume equal variances.
    # scipy's default (equal_var=True) is Student's pooled-variance test, i.e. the
    # same test as in the plain "T-Test" cell.
    t_stat, p_value = stats.ttest_ind(CR_1, CR_2, equal_var=False)

    # Print results
    if p_value < 0.05:
        print("There are statistically significant differences between the mean values of the samples.")
    else:
        print("There are no statistically significant differences between the mean values of the samples.")

    print("t-statistic:", t_stat)
    print("p-value:", p_value)
else:
    print("One of the samples has zero variance, t-test is not possible.")


In [None]:
# Bounce-rate series of the two datasets being compared.
BR_1 = df_1['Bounce Rate']
BR_2 = df_2['Bounce Rate']


# Sample variances (ddof=1); printed so the reader can see whether they differ.
var_BR_1 = np.var(BR_1, ddof=1)
var_BR_2 = np.var(BR_2, ddof=1)
print("Variance of sample 1:", var_BR_1)
print("Variance of sample 2:", var_BR_2)

# The t statistic is undefined when a sample has zero variance, so only test otherwise.
if var_BR_1 > 0 and var_BR_2 > 0:
    # equal_var=False selects Welch's t-test (no equal-variance assumption), matching
    # the section heading; scipy's default would run Student's pooled-variance test.
    t_stat, p_value = stats.ttest_ind(BR_1, BR_2, equal_var=False)

    # Print results
    if p_value < 0.05:
        print("There are statistically significant differences between the mean values of the samples.")
    else:
        print("There are no statistically significant differences between the mean values of the samples.")

    print("t-statistic:", t_stat)
    print("p-value:", p_value)
else:
    print("One of the samples has zero variance, t-test is not possible.")

# Finding correlation

Correlation between Conversion Rate (%) and Bounce Rate in the first DataFrame

In [None]:


        # Pair bounce rate with conversion rate (first dataset) in a two-column frame.
        df_m = pd.DataFrame({'Bounce Rate': BR_1, 'Conversion R (%)': CR_1})

        # Pairwise correlation of the two columns (pandas default method).
        correlation_matrix = df_m.corr()
        print("Correlation Matrix:\n", correlation_matrix)

        # Heatmap on an explicit Axes, with the colour scale fixed to [-1, 1].
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
        ax.set_title('Correlation Matrix')
        plt.show()



There is no significant relationship between the bounce rate and the conversion rate in the first DataFrame. A slight negative correlation suggests that as the bounce rate increases, the conversion rate might decrease very slightly, but this relationship is too weak to be of practical significance.

In [None]:
        # Pair bounce rate with conversion rate (second dataset) in a two-column frame.
        df_m2 = pd.DataFrame({'Bounce Rate': BR_2, 'Conversion Rate (%)': CR_2})

        # Pairwise correlation of the two columns (pandas default method).
        correlation_matrix = df_m2.corr()
        print("Correlation Matrix:\n", correlation_matrix)

        # Heatmap on an explicit Axes, with the colour scale fixed to [-1, 1].
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
        ax.set_title('Correlation Matrix')
        plt.show()

There is no significant relationship between the bounce rate and the conversion rate in the second DataFrame. The correlation is so close to zero that it suggests that changes in the bounce rate have an almost negligible effect on the conversion rate.

In [None]:
        # Page views and bounce rate of the rows drawn into sample_1.
        PV_1 = sample_1['Page Views']
        # The 'Bounce Rate' column must hold bounce-rate values: the scatter plot of
        # df_v plots it under that label. Taking both columns from sample_1 also means
        # every row has both values (no NaN rows from index alignment with the full frame).
        df_v = pd.DataFrame({
            'Bounce Rate': sample_1['Bounce Rate'],
            'Page Views': PV_1
        })


        # Calculate the correlation matrix
        correlation_matrix = df_v.corr()
        print("Correlation Matrix:\n", correlation_matrix)

        # Plot the correlation matrix with the colour scale fixed to [-1, 1]
        plt.figure(figsize=(8, 6))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
        plt.title('Correlation Matrix')
        plt.show()


In [None]:
# Seaborn scatter of the two df_v columns, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df_v, x='Page Views', y='Bounce Rate', color='blue', ax=ax)
ax.set_title('Scatter Plot: Page Views vs. Bounce Rate')
ax.set_xlabel('Page Views')
ax.set_ylabel('Bounce Rate')
plt.show()

**Spearman's rank correlation coefficient**

In [None]:
# Spearman's rank correlation between bounce rate and conversion rate (first dataset),
# called through the scipy.stats alias imported in the notebook's first cell.
# The second element of the result (the p-value) is not used.
correlation, _p_value = stats.spearmanr(BR_1, CR_1)
print(f"Spearman's rank correlation coefficient: {correlation}")


**The First DataFrame**
Near 0: A coefficient close to 0, like -0.0065, suggests that there is almost no correlation between the variables.

Negative Value: A negative value indicates a slight tendency for one variable to decrease as the other increases, but the strength of this relationship is very weak given the coefficient’s proximity to 0.

In [None]:
# Spearman's rank correlation between bounce rate and conversion rate (second dataset),
# called through the scipy.stats alias imported in the notebook's first cell.
# The second element of the result (the p-value) is not used.
correlation, _p_value = stats.spearmanr(BR_2, CR_2)
print(f"Spearman's rank correlation coefficient: {correlation}")

**The Second DataFrame**
The very small positive coefficient suggests that there is practically no meaningful relationship between the variables.

# Chi-2 Test

In [None]:
from scipy.stats import chi2_contingency

# Bucket the conversion rate into three ordered levels. pd.cut intervals are
# right-closed by default, so the buckets are (0, 2], (2, 5] and (5, 10];
# values outside them become NaN.
level_edges = [0, 2, 5, 10]
level_names = ['Low', 'Medium', 'High']
sample_1['Conversion Level'] = pd.cut(
    sample_1['Conversion Rate (%)'], bins=level_edges, labels=level_names
)

# Traffic source (rows) x conversion level (columns) counts.
contingency_table = pd.crosstab(
    index=sample_1['Traffic Source'],
    columns=sample_1['Conversion Level'],
)

# Chi-square test of independence on the contingency table.
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-square statistic: {chi2_stat}")
print(f"p-value: {p_val}")




There is no significant association between the traffic source and the conversion rate categories in your data. The differences observed are likely due to random chance.

# Chi-Square Goodness of Fit Test

In [None]:

# chi2_contingency is imported here too, so the cell does not rely on an import
# made by another cell.
from scipy.stats import chi2_contingency, chisquare

# Converting Conversion Rate (%) to a categorical variable.
# pd.cut intervals are right-closed by default: (0, 2], (2, 5], (5, 10].
sample_1['Conversion Level'] = pd.cut(sample_1['Conversion Rate (%)'], bins=[0, 2, 5, 10], labels=['Low', 'Medium', 'High'])

# Creating a contingency table: traffic source (rows) x conversion level (columns)
contingency_table = pd.crosstab(sample_1['Traffic Source'], sample_1['Conversion Level'])

# Applying the Chi-square test of independence; `expected` holds the cell counts
# expected under independence and `dof` its degrees of freedom.
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-square statistic: {chi2_stat}")
print(f"p-value: {p_val}")

# Flatten observed and expected cell counts for use in a goodness-of-fit test
observed = contingency_table.values.flatten()
expected = expected.flatten()

# Check and print sums of observed and expected frequencies
print(f"Sum of observed frequencies: {observed.sum()}")
print(f"Sum of expected frequencies: {expected.sum()}")

# Ensure that observed and expected are numeric and 1-dimensional
print(f"Observed data type: {observed.dtype}")
print(f"Expected data type: {expected.dtype}")
print(f"Observed shape: {observed.shape}")
print(f"Expected shape: {expected.shape}")

# chisquare requires both sums to agree; rescale the expected counts if they do not
if not np.isclose(observed.sum(), expected.sum()):
    print("Sums do not match. Adjusting expected frequencies.")
    # Scale expected frequencies to match the sum of observed frequencies
    expected = expected * observed.sum() / expected.sum()

# Perform the Chi-square goodness-of-fit test.
# chisquare computes its p-value with k - 1 - ddof degrees of freedom (k = number of
# cells). The expected counts were estimated from the table's own margins, so the
# reference distribution has `dof` degrees of freedom, not k - 1; ddof removes the
# difference.
ddof = (observed.size - 1) - dof
chi2_stat, p_val = chisquare(observed, f_exp=expected, ddof=ddof)
print(f"Chi-square statistic (goodness of fit): {chi2_stat}")
print(f"p-value (goodness of fit): {p_val}")

# Interpretation at the 5% significance level
alpha = 0.05
if p_val < alpha:
    print("Reject the null hypothesis: observed frequencies do not match the expected distribution.")
else:
    print("No grounds to reject the null hypothesis: observed frequencies match the expected distribution.")