<a href="https://colab.research.google.com/github/nazanin-zinouri/subclu/blob/main/chat_channel_ecosystem_check.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import scipy.stats as stats
# NOTE(review): `mean` is not referenced in the visible cells, and the NumPy aliases exposed
# at the top level of `scipy` (such as `scipy.mean`) were deprecated in SciPy 1.4 -- confirm
# this import still succeeds on the SciPy version installed in the runtime.
from scipy import mean
from scipy.stats import t
from statsmodels.stats.contingency_tables import mcnemar
from sklearn.linear_model import LinearRegression
import datetime as dt
from datetime import date
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas  as pd
import seaborn as sns
import gspread
from google.auth import default
from google.colab import auth, files
from google.colab import drive
# Authenticate the Colab user, fetch the default credentials, and authorize gspread with them.
auth.authenticate_user()
creds, _ = default()
# NOTE(review): `gc` is the gspread client here; the name also matches the stdlib `gc` module.
gc = gspread.authorize(creds)
from google.cloud import bigquery
# BigQuery client used by the `client.query(...)` calls in later cells.
project_id = 'data-prod-165221'
client = bigquery.Client(project=project_id)

In [None]:
# SQL text: distinct (pt, subreddit_name, revenue, impressions) rows with 2023-03-21 <= pt <= 2023-06-11,
# plus a 'pre'/'post' label that is 'pre' when pt < 2023-05-01 and 'post' otherwise.
# NOTE(review): the label column is spelled `time_period_glag` (presumably "flag"); the groupby
# cells reference this exact spelling, so any rename must be applied there as well.
revenue_query='''
SELECT
     DISTINCT pt,
     CASE WHEN pt<'2023-05-01' THEN 'pre' ELSE 'post' END AS time_period_glag,
     subreddit_name,
     revenue,
     impressions
FROM `reddit-employee-datasets.nazanin_zinouri.community_chat_impact_ad_impressions_and_revenue_20230620`
WHERE pt>='2023-03-21' AND pt<='2023-06-11'

'''

In [None]:
# Execute `revenue_query` through the BigQuery client and materialize the result as a DataFrame.
ad_impresion_revenue_df = (
    client
    .query(revenue_query)
    .to_dataframe()
)

In [None]:
# Preview the first five rows of the revenue/impressions frame.
ad_impresion_revenue_df.head(n=5)

Unnamed: 0,pt,time_period_glag,subreddit_name,revenue,impressions
0,2023-06-06 00:00:00+00:00,post,love,41.242229,7422
1,2023-06-02 00:00:00+00:00,post,love,14.033061,4637
2,2023-06-09 00:00:00+00:00,post,love,17.888934,6381
3,2023-05-23 00:00:00+00:00,post,love,18.900007,5936
4,2023-05-29 00:00:00+00:00,post,love,14.202471,5068


In [None]:
## helper functions for the first quartile, median, and third quartile
def q1(x):
    """Return the 0.25 quantile (first quartile) of ``x``.

    ``x`` is any object exposing a ``quantile`` method, e.g. a pandas Series.
    Kept as a named function because ``.agg`` labels the output column with
    the function's name.
    """
    first_quartile = x.quantile(0.25)
    return first_quartile
def med(x):
    """Return the 0.5 quantile (median) of ``x``.

    ``x`` is any object exposing a ``quantile`` method, e.g. a pandas Series.
    Kept as a named function because ``.agg`` labels the output column with
    the function's name.
    """
    middle = x.quantile(0.5)
    return middle
def q3(x):
    """Return the 0.75 quantile (third quartile) of ``x``.

    ``x`` is any object exposing a ``quantile`` method, e.g. a pandas Series.
    Kept as a named function because ``.agg`` labels the output column with
    the function's name.
    """
    third_quartile = x.quantile(0.75)
    return third_quartile

In [None]:
# Aggregation spec for the groupby summary: statistics to compute on `impressions`.
# The list order is the order of the resulting summary columns.
f = {
    'impressions': [
        'min',
        med,
        'mean',
        q1,
        q3,
        'max',
    ],
}

In [None]:
# Keep every row whose subreddit is not 'indian_hangouts' (the reason for the exclusion is
# not recorded in this notebook).
ad_impresion_revenue_df = ad_impresion_revenue_df.loc[
    lambda frame: frame['subreddit_name'] != 'indian_hangouts'
]

In [None]:
# Summary statistics of `impressions` for each (subreddit, pre/post period) pair.
(
    ad_impresion_revenue_df
    .groupby(['subreddit_name', 'time_period_glag'])
    .agg(f)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,impressions,impressions,impressions,impressions,impressions,impressions
Unnamed: 0_level_1,Unnamed: 1_level_1,min,med,mean,q1,q3,max
subreddit_name,time_period_glag,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
asksingapore,post,70679,90006.5,87625.309524,81396.0,94427.0,107611
asksingapore,pre,61220,74361.0,76886.390244,70511.0,82606.0,99678
belohorizonte,post,22,520.0,512.190476,398.5,605.5,905
belohorizonte,pre,276,620.0,629.780488,497.0,703.0,1024
britishcolumbia,post,6346,19060.5,18929.119048,15995.0,22470.75,32371
britishcolumbia,pre,9669,15661.0,15922.97561,13475.0,18371.0,24069
deathgrips,post,11231,23168.0,30227.595238,18684.25,37042.75,98757
deathgrips,pre,8622,13701.0,14408.487805,12434.0,15146.0,23209
hingeapp,post,17285,35530.0,34985.285714,33439.0,37689.0,46020
hingeapp,pre,27069,35314.0,35010.390244,32322.0,37128.0,41710


In [None]:
# Aggregation spec for the groupby summary: statistics to compute on `revenue`.
# The list order is the order of the resulting summary columns.
f2 = {
    'revenue': [
        'min',
        med,
        'mean',
        q1,
        q3,
        'max',
    ],
}

In [None]:
# Summary statistics of `revenue` for each (subreddit, pre/post period) pair.
(
    ad_impresion_revenue_df
    .groupby(['subreddit_name', 'time_period_glag'])
    .agg(f2)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,revenue,revenue,revenue,revenue,revenue,revenue
Unnamed: 0_level_1,Unnamed: 1_level_1,min,med,mean,q1,q3,max
subreddit_name,time_period_glag,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
asksingapore,post,79.84118,107.330585,117.037989,97.702595,132.229028,193.39608
asksingapore,pre,67.651013,111.329172,113.471994,97.27551,130.057591,170.023806
belohorizonte,post,0.0,0.128317,0.226647,0.019672,0.291933,1.218631
belohorizonte,pre,0.007125,0.170775,0.22008,0.076881,0.295127,0.945834
britishcolumbia,post,15.52192,55.704386,56.116752,41.212468,65.583127,101.454329
britishcolumbia,pre,22.999286,41.478142,45.378269,37.080239,54.949549,73.313099
deathgrips,post,40.629475,93.440953,117.570834,77.734639,145.908837,358.438966
deathgrips,pre,27.292355,62.705461,66.357894,48.279281,82.529674,139.336166
hingeapp,post,27.027238,45.769602,45.328923,36.357488,54.385088,62.360496
hingeapp,pre,32.904516,55.135908,57.96214,46.361945,64.516911,110.111467


In [None]:
# SQL text: distinct (dt, subreddit_name, total_sessions, total_screenviews, totaltime_on_subreddit)
# rows with 2023-03-21 <= dt <= 2023-06-11, plus a 'pre'/'post' label that is 'pre' when
# dt < 2023-05-01 and 'post' otherwise.
# NOTE(review): the label column is spelled `time_period_glag` (presumably "flag"); the groupby
# cells reference this exact spelling, so any rename must be applied there as well.
sv_time_query='''
SELECT
     DISTINCT dt,
     CASE WHEN dt<'2023-05-01' THEN 'pre' ELSE 'post' END AS time_period_glag,
     subreddit_name,
     total_sessions,
     total_screenviews,
     totaltime_on_subreddit
FROM `reddit-employee-datasets.nazanin_zinouri.community_chat_impact_screenviews_and_timeonsite_20230620`
WHERE dt>='2023-03-21' AND dt<='2023-06-11'
'''

In [None]:
# Execute `sv_time_query` through the BigQuery client and materialize the result as a DataFrame.
sv_time_df = (
    client
    .query(sv_time_query)
    .to_dataframe()
)

In [None]:
# Preview the first five rows of the sessions/screenviews/time frame.
sv_time_df.head(n=5)

Unnamed: 0,dt,time_period_glag,subreddit_name,total_sessions,total_screenviews,totaltime_on_subreddit
0,2023-06-07,post,love,7694,21693,1626691
1,2023-03-24,pre,love,4726,15105,1317351
2,2023-03-22,pre,love,7035,18766,1381874
3,2023-06-09,post,love,7936,23226,1987757
4,2023-05-01,post,love,8072,22539,1803505


In [None]:
# Keep every row whose subreddit is not 'indian_hangouts' (the reason for the exclusion is
# not recorded in this notebook).
sv_time_df = sv_time_df.loc[
    lambda frame: frame['subreddit_name'] != 'indian_hangouts'
]

In [None]:
# Aggregation spec for the groupby summary: statistics to compute on `total_sessions`.
# Note: this rebinds `f`, which an earlier cell used for the `impressions` spec.
f = {
    'total_sessions': [
        'min',
        med,
        'mean',
        q1,
        q3,
        'max',
    ],
}

In [None]:
# Summary statistics of `total_sessions` for each (subreddit, pre/post period) pair.
(
    sv_time_df
    .groupby(['subreddit_name', 'time_period_glag'])
    .agg(f)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_sessions,total_sessions,total_sessions,total_sessions,total_sessions,total_sessions
Unnamed: 0_level_1,Unnamed: 1_level_1,min,med,mean,q1,q3,max
subreddit_name,time_period_glag,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
asksingapore,post,103263,125454.0,123996.785714,115824.75,131677.0,142391
asksingapore,pre,77230,101998.0,101322.219512,90580.0,109412.0,127007
belohorizonte,post,39,1572.0,1746.904762,1254.0,2165.25,3689
belohorizonte,pre,1017,1961.0,2335.121951,1727.0,2380.0,10661
britishcolumbia,post,23690,44391.0,46141.047619,37830.25,52085.5,95392
britishcolumbia,pre,21480,36966.0,40025.243902,31625.0,47732.0,69069
deathgrips,post,22071,31182.5,32054.333333,27164.0,37003.25,44567
deathgrips,pre,16242,21402.0,21503.292683,19635.0,22555.0,30305
fallout76bowhunters,post,150,296.0,314.857143,242.25,371.0,659
fallout76bowhunters,pre,172,338.0,335.365854,297.0,356.0,680


In [None]:
# Aggregation spec for the groupby summary: statistics to compute on `total_screenviews`.
# Note: this rebinds `f2`, which an earlier cell used for the `revenue` spec.
f2 = {
    'total_screenviews': [
        'min',
        med,
        'mean',
        q1,
        q3,
        'max',
    ],
}

In [None]:
# Summary statistics of `total_screenviews` for each (subreddit, pre/post period) pair.
(
    sv_time_df
    .groupby(['subreddit_name', 'time_period_glag'])
    .agg(f2)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_screenviews,total_screenviews,total_screenviews,total_screenviews,total_screenviews,total_screenviews
Unnamed: 0_level_1,Unnamed: 1_level_1,min,med,mean,q1,q3,max
subreddit_name,time_period_glag,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
asksingapore,post,293144,385788.5,384949.380952,364613.25,409543.0,468994
asksingapore,pre,264495,340033.0,350224.585366,310464.0,379934.0,476997
belohorizonte,post,90,3506.5,3779.047619,2830.25,4864.0,7019
belohorizonte,pre,2403,4763.0,5404.487805,4063.0,5624.0,19748
britishcolumbia,post,47691,87788.0,89940.904762,74509.0,102697.75,145298
britishcolumbia,pre,43371,76015.0,80066.95122,63947.0,94544.0,139205
deathgrips,post,66409,111804.5,126599.97619,90964.5,141320.5,395441
deathgrips,pre,53233,68467.0,73685.97561,63331.0,78827.0,132465
fallout76bowhunters,post,331,983.5,1115.547619,809.25,1456.75,2305
fallout76bowhunters,pre,526,1170.0,1201.317073,987.0,1385.0,2405
