<a href="https://colab.research.google.com/github/pierrelouisbescond/medium_articles/blob/main/medium_stats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries

In [None]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go


# When is the best day to post an article?
## Data Upload

In [None]:
# Read the "visits" sheet of the stats workbook into the working DataFrame and display it
visits_workbook = "/content/drive/MyDrive/Medium/Medium Stats.xlsx"
df = pd.read_excel(visits_workbook, sheet_name="visits")
df


Unnamed: 0,Date,Visits
0,2020-05-09,50
1,2020-05-10,59
2,2020-05-11,30
3,2020-05-12,12
4,2020-05-13,5
...,...,...
291,2021-02-24,148
292,2021-02-25,138
293,2021-02-26,135
294,2021-02-27,115


In [None]:
# Flag each row whose date is one of the article release dates (True on release days).
# The previous version negated the mask, so the "article_release" column was True on the
# days WITHOUT a release.
# NOTE(review): `articles_release_dates` is only defined in a later cell, so that cell must be
# run first on a fresh kernel; this column is also recomputed further down the notebook.
df["article_release"] = df["Date"].isin(articles_release_dates)


In [None]:
# Preview the negated membership mask: True on the days that are not article release dates
not_release_day = ~df["Date"].isin(articles_release_dates)
not_release_day


0      False
1       True
2       True
3       True
4       True
       ...  
291     True
292     True
293     True
294     True
295     True
Name: Date, Length: 296, dtype: bool

In [None]:
# Dates on which an article was published ("YYYY-MM-DD" strings), grouped by month
articles_release_dates = [
    # 2020
    "2020-05-09", "2020-05-18", "2020-05-22", "2020-05-24", "2020-05-28", "2020-05-31",
    "2020-06-08", "2020-06-15", "2020-06-22",
    "2020-07-19", "2020-07-30",
    "2020-08-24",
    "2020-10-29",
    "2020-11-02", "2020-11-25",
    "2020-12-01", "2020-12-17", "2020-12-21", "2020-12-30",
    # 2021
    "2021-01-06",
    "2021-02-01",
]


In [None]:
# 1 on the rows whose date is an article release date, 0 elsewhere
release_flag = df["Date"].isin(articles_release_dates) * 1

# Extend the flag to the next two rows (day+1 and day+2 after a release) by adding the flag
# shifted down by one and by two rows. The result is a count, not a boolean: it exceeds 1 when
# two releases are less than three rows apart, hence the later tests against 0 / >= 1.
# NOTE(review): shifting by rows assumes one row per consecutive calendar day — confirm.
df["article_release"] = (
    release_flag
    + release_flag.shift(1, fill_value=0)
    + release_flag.shift(2, fill_value=0)
)

df


Unnamed: 0,Date,Visits,article_release
0,2020-05-09,50,1
1,2020-05-10,59,1
2,2020-05-11,30,1
3,2020-05-12,12,0
4,2020-05-13,5,0
...,...,...,...
291,2021-02-24,148,0
292,2021-02-25,138,0
293,2021-02-26,135,0
294,2021-02-27,115,0


In [None]:
# Baseline: average number of daily visits over the rows outside any release window
# (article_release == 0), i.e. excluding the release day and the two following days
outside_release_window = df["article_release"] == 0
mean_without_release = df.loc[outside_release_window, "Visits"].mean()
print("Mean without articles peaks:", mean_without_release)

# "Visits_norm" keeps the observed visits outside release windows and replaces the
# release-related peaks with the baseline mean. Series.where builds the column in a single
# step, which avoids the chained assignment (df[col][mask] = value) that raised
# SettingWithCopyWarning and is not guaranteed to write into the DataFrame.
df["Visits_norm"] = df["Visits"].where(outside_release_window, mean_without_release)
df


Mean without articles peaks: 87.52564102564102




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Date,Visits,article_release,Visits_norm
0,2020-05-09,50,1,87.525641
1,2020-05-10,59,1,87.525641
2,2020-05-11,30,1,87.525641
3,2020-05-12,12,0,12.000000
4,2020-05-13,5,0,5.000000
...,...,...,...,...
291,2021-02-24,148,0,148.000000
292,2021-02-25,138,0,138.000000
293,2021-02-26,135,0,135.000000
294,2021-02-27,115,0,115.000000


In [None]:
# Line chart of the daily visits: one trace for the raw counts, one for the normalized counts
fig = go.Figure()
for column in ("Visits", "Visits_norm"):
    fig.add_trace(go.Scatter(x=df["Date"], y=df[column], mode='lines', name=column))
fig.show()


In [None]:
# Derive the weekday number (0 = Monday ... 6 = Sunday) and the weekday name from each date
date_parts = df["Date"].dt
df["weekday"] = date_parts.dayofweek
df["weekday_name"] = date_parts.day_name()
df


Unnamed: 0,Date,Visits,article_release,Visits_norm,weekday,weekday_name
0,2020-05-09,50,1,87.525641,5,Saturday
1,2020-05-10,59,1,87.525641,6,Sunday
2,2020-05-11,30,1,87.525641,0,Monday
3,2020-05-12,12,0,12.000000,1,Tuesday
4,2020-05-13,5,0,5.000000,2,Wednesday
...,...,...,...,...,...,...
291,2021-02-24,148,0,148.000000,2,Wednesday
292,2021-02-25,138,0,138.000000,3,Thursday
293,2021-02-26,135,0,135.000000,4,Friday
294,2021-02-27,115,0,115.000000,5,Saturday


In [None]:
# Average every column per weekday name, then rank the weekdays from the highest to the lowest
# average of normalized visits, showing the normalized and original averages side by side
df_mean = df.groupby("weekday_name").mean()
ranked_days = df_mean.sort_values(by="Visits_norm", ascending=False)
ranked_days[["Visits_norm", "Visits"]]


Unnamed: 0_level_0,Visits_norm,Visits
weekday_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Monday,103.031136,132.214286
Wednesday,99.650183,120.904762
Tuesday,96.554945,123.428571
Thursday,89.885836,115.785714
Friday,86.766789,106.833333
Saturday,71.073345,72.302326
Sunday,66.607036,64.930233


In [None]:
# We display the distribution of normalized visits per weekday with box plots.
# Rows are sorted by weekday number (0 = Monday) before plotting, presumably so that the
# boxes appear in calendar order on the x-axis.
df_by_weekday = df.sort_values(by="weekday")
fig = px.box(df_by_weekday, x="weekday_name", y="Visits_norm")
fig.show()


## The "T-Test" or "Student T-Test"

In [None]:
import scipy.stats as stats

In [None]:
# Seed NumPy's global random generator so that the simulated population, the samples drawn
# from it and every statistic derived from them are identical on each Restart & Run All
np.random.seed(42)

# Simulated population: 1000 draws from a normal distribution with mean 30 and standard deviation 5
population_1 = np.random.normal(30, 5, 1000)
print("Population 1 Mean: {:.2f} and Standard Deviation: {:.2f}".format(population_1.mean(), population_1.std()))

# Two independent samples of 100 values each, drawn (with replacement) from that same population
sample_1 = np.random.choice(population_1, 100)
sample_2 = np.random.choice(population_1, 100)

print("Sample 1 Mean: {:.2f} and Standard Deviation: {:.2f}".format(sample_1.mean(), sample_1.std()))
print("Sample 2 Mean: {:.2f} and Standard Deviation: {:.2f}".format(sample_2.mean(), sample_2.std()))


Population 1 Mean: 30.23 and Standard Deviation: 4.89
Sample 1 Mean: 31.30 and Standard Deviation: 4.54
Sample 2 Mean: 31.00 and Standard Deviation: 4.64


In [None]:
# Second simulated population: 1000 draws from a normal distribution with mean 28 and standard deviation 3
population_2 = np.random.normal(loc=28, scale=3, size=1000)
population_2_mean, population_2_std = population_2.mean(), population_2.std()
print("Population 2 Mean: {:.2f} and Standard Deviation: {:.2f}".format(population_2_mean, population_2_std))

# One sample of 100 values drawn from this second population
sample_3 = np.random.choice(population_2, size=100)
sample_3_mean, sample_3_std = sample_3.mean(), sample_3.std()
print("Sample 3 Mean: {:.2f} and Standard Deviation: {:.2f}".format(sample_3_mean, sample_3_std))


Population 2 Mean: 27.92 and Standard Deviation: 3.02
Sample 3 Mean: 28.50 and Standard Deviation: 2.99


In [None]:
# Independent two-sample t-test for every pair of samples; only the p-value is reported.
# Samples 1 and 2 come from the same population, sample 3 from a different one.
sample_pairs = [
    ("Sample 1 Vs Sample 2 p_value: {:.4f}", sample_1, sample_2),
    ("Sample 1 Vs Sample 3 p_value: {:.4f}", sample_1, sample_3),
    ("Sample 2 Vs Sample 3 p_value: {:.4f}", sample_2, sample_3),
]
for message, first_sample, second_sample in sample_pairs:
    p_value = stats.ttest_ind(first_sample, second_sample)[1]
    print(message.format(p_value))

Sample 1 Vs Sample 2 p_value: 0.6505
Sample 1 Vs Sample 3 p_value: 0.0000
Sample 2 Vs Sample 3 p_value: 0.0000


In [None]:
# Histograms of the three samples on a single figure, one colour per sample
fig = go.Figure()
histogram_specs = [
    (sample_1, '#5A9EFF', "Sample 1"),
    (sample_2, '#3282FF', "Sample 2"),
    (sample_3, '#14C82E', "Sample 3"),
]
for values, color, label in histogram_specs:
    fig.add_trace(go.Histogram(x=values, marker_color=color, name=label))
# Leave a small gap between bars and between the bars of each group
fig.update_layout(bargap=0.1, bargroupgap=0.1)
fig.show()


## The statistical differences between the weekdays

In [None]:
weekdays_name = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Empty square table with the weekday names on both axes
ttest_matrix = pd.DataFrame(index=weekdays_name, columns=weekdays_name)

# Run a t-test on the normalized visits of every pair of distinct weekdays (21 pairs) and
# store the p-value in the lower triangle: row = later weekday, column = earlier weekday.
# Weekday numbers follow the "weekday" column (0 = Monday).
for earlier_day in range(7):
    earlier_visits = df["Visits_norm"][df["weekday"] == earlier_day]
    for later_day in range(earlier_day + 1, 7):
        later_visits = df["Visits_norm"][df["weekday"] == later_day]
        p_value = stats.ttest_ind(earlier_visits, later_visits)[1]
        ttest_matrix.loc[weekdays_name[later_day], weekdays_name[earlier_day]] = p_value

# Pairs with a p-value of at most 0.05 are labelled "Different!"; cells that were never filled
# are blanked. No multiple-comparison correction is applied to the 21 tests.
significant = ttest_matrix <= 0.05
ttest_matrix = ttest_matrix.mask(significant, "Different!")
ttest_matrix = ttest_matrix.fillna("")
ttest_matrix


Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
Monday,,,,,,,
Tuesday,0.389521,,,,,,
Wednesday,0.664696,0.693737,,,,,
Thursday,0.0935077,0.395472,0.230442,,,,
Friday,Different!,0.201553,0.106077,0.692495,,,
Saturday,Different!,Different!,Different!,Different!,Different!,,
Sunday,Different!,Different!,Different!,Different!,Different!,0.499288,


## How many visits, claps or reading time do you need to reach 1000$?

In [None]:
# Read the per-article statistics: columns A to G of the "earnings" sheet of the stats workbook
earnings_workbook = "/content/drive/MyDrive/Medium/Medium Stats.xlsx"
df_articles = pd.read_excel(earnings_workbook, sheet_name="earnings", usecols="A:G")
df_articles


Unnamed: 0,Article Name,Views,Reads,Reading Time,Fans,Claps,Earning
0,XAI — Build your own deep-learning interpretat...,240,66,74,5,17,1.55
1,"Data Science 101: Start with Pandas, Scikit-Le...",344,87,72,6,16,2.86
2,Better visualizing tensors thanks to cities!,182,51,49,3,11,1.18
3,Help Santa optimize his Christmas run with Goo...,788,161,269,12,89,11.34
4,Four Ultimate Mail Management Principles,611,87,6,1,6,0.14
5,"Lobe, Microsoft’s No-Code Computer Vision Soft...",361,121,50,4,27,1.57
6,Employees’ Attrition — How Catboost and Shap c...,549,148,242,10,24,8.91
7,Beyond “classic” PCA: Functional Principal Com...,3069,948,555,15,47,25.23
8,AI in Industry: How a Maintenance Routine ruin...,576,187,168,9,32,7.13
9,Don’t Forget the Big Green Button… and Other E...,328,153,79,4,19,3.59


In [None]:
# Column totals over all articles, restricted to the numeric columns (the article name is
# skipped). Uses the public `numeric_only` option rather than the private
# `DataFrame._get_numeric_data()` helper, which is not part of the supported pandas API.
df_articles.sum(numeric_only=True)

Views           22921.00
Reads            9495.00
Reading Time     4976.00
Fans              168.00
Claps             600.00
Earning           202.73
dtype: float64

In [None]:
# Pairwise correlation between the article metrics
corr_mat = df_articles[["Views", "Reads", "Reading Time", "Fans", "Claps", "Earning"]].corr()

# Correlation of each metric with the earnings. The trivial "Earning" vs "Earning" row is
# removed by label: filtering on `< 1` depends on the self-correlation being exactly 1.0 in
# floating point and would also hide any other metric perfectly correlated with the earnings.
corr_mat[["Earning"]].drop(index="Earning")


Unnamed: 0,Earning
Views,0.982667
Reads,0.973393
Reading Time,0.996033
Fans,0.940996
Claps,0.628918


In [None]:
# Average earnings per unit of reading time: total earnings divided by total reading time
# NOTE(review): the variable name assumes "Reading Time" is expressed in hours — confirm
total_earnings = df_articles["Earning"].sum()
total_reading_time = df_articles["Reading Time"].sum()
dollar_per_hour = total_earnings / total_reading_time
round(dollar_per_hour, 3)


0.041

In [None]:
# Reading time needed to earn the target amount at the average rate `dollar_per_hour`
# NOTE(review): the section heading mentions 1000$ while the target used here is 10 — confirm
target_earnings = 10
target_earnings / dollar_per_hour

245.44961278547825