In [1]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# We read the reconstruction method from the .json file

import xgboost as xgb

model_xgb_2 = xgb.Booster()
model_xgb_2.load_model("reconstruction_model.json")


In [3]:
#read Csv of hourly views evolution available at https://doi.org/10.6084/m9.figshare.20079857
df = pd.read_csv('/home/castaldm-admin/Desktop/Owncloud/Bot_5minuts/riscrittura2022/videoStatististicsALL_from21_anonymized.csv', sep=';')

In [4]:
# to make it faster, as an example, we can use just the first rows...
df = df[:10000]

In [6]:
df['created_at'] = pd.to_datetime(df['created_at'])
df['hour'] = pd.to_datetime(df['created_at']).dt.hour
df['hour_from_beginning'] = df.groupby(['video_id', 'created_at']).ngroup()
df.head()

Unnamed: 0.1,Unnamed: 0,video_id,created_at,comments,dislikes,likes,views,corrections,hour,hour_from_beginning
0,0,100000,2021-05-31 20:00:14,1.0,0.0,16.0,162,0,20,0
1,1,100000,2021-05-31 21:00:14,14.0,2.0,36.0,266,0,21,1
2,2,100000,2021-05-31 22:00:12,17.0,2.0,49.0,1474,0,22,2
3,3,100000,2021-05-31 23:00:12,18.0,3.0,56.0,3746,0,23,3
4,4,100000,2021-06-01 00:00:12,18.0,3.0,62.0,5625,0,0,4


In [7]:
# create column views_diff with the hour delta of views
df2 = pd.DataFrame({'video_id':df.video_id.unique(), 'created_at' :pd.to_datetime(1490195805, unit='s'), 'views': 0})
df = pd.concat([df2, df])
#sort by video and created at
df = df.sort_values(by=['video_id', 'created_at']).reset_index()
df['views_diff'] = df.views.diff()
df.head()

df = df[df.created_at != pd.to_datetime(1490195805, unit='s')]
df.head()

Unnamed: 0.1,index,video_id,created_at,views,Unnamed: 0,comments,dislikes,likes,corrections,hour,hour_from_beginning,views_diff
1,0,100000,2021-05-31 20:00:14,162,0.0,1.0,0.0,16.0,0.0,20.0,0.0,162.0
2,1,100000,2021-05-31 21:00:14,266,1.0,14.0,2.0,36.0,0.0,21.0,1.0,104.0
3,2,100000,2021-05-31 22:00:12,1474,2.0,17.0,2.0,49.0,0.0,22.0,2.0,1208.0
4,3,100000,2021-05-31 23:00:12,3746,3.0,18.0,3.0,56.0,0.0,23.0,3.0,2272.0
5,4,100000,2021-06-01 00:00:12,5625,4.0,18.0,3.0,62.0,0.0,0.0,4.0,1879.0


In [8]:
df['corrections'] = 0
df.loc[df.views_diff.values < 0,'corrections'] = 1

In [9]:
# now we need to classify hours with our recunstruction method.
# First we create the matrix X


X = np.zeros([len(df), 50])

for i in range(0, 48):

    temp = df[['views_diff', 'video_id']].groupby('video_id')['views_diff'].shift(i-24)
    temp[temp != temp] = 0
    X[:,i] = temp


#we add the columns with the hour of the day
X[:, 48] = df.hour

# we add also the hour from pubblication
X[:,49] = df.hour_from_beginning


In [10]:
# we can make our predictions with the recunstruction method
db = xgb.DMatrix(data = X)

# make prediction
preds = model_xgb_2.predict(db)
y_predict = np.zeros(len(preds))
y_predict[preds >= 0.500] = 1

In [11]:
df['bool_corrections_methodB'] = y_predict

In [12]:
df.head()

Unnamed: 0.1,index,video_id,created_at,views,Unnamed: 0,comments,dislikes,likes,corrections,hour,hour_from_beginning,views_diff,bool_corrections_methodB
1,0,100000,2021-05-31 20:00:14,162,0.0,1.0,0.0,16.0,0,20.0,0.0,162.0,0.0
2,1,100000,2021-05-31 21:00:14,266,1.0,14.0,2.0,36.0,0,21.0,1.0,104.0,0.0
3,2,100000,2021-05-31 22:00:12,1474,2.0,17.0,2.0,49.0,0,22.0,2.0,1208.0,0.0
4,3,100000,2021-05-31 23:00:12,3746,3.0,18.0,3.0,56.0,0,23.0,3.0,2272.0,0.0
5,4,100000,2021-06-01 00:00:12,5625,4.0,18.0,3.0,62.0,0,0.0,4.0,1879.0,0.0


In [13]:
df['interventions'] = 0
df.loc[(df.corrections> 0) | (df.bool_corrections_methodB  > 0), 'interventions'] = 1


### We now have the prediction of the hours with corrections. The size of the corrections still needs to be estimated.

In [14]:
df_agg = df.groupby(['video_id']).agg({'views_diff':list,
                         'created_at':min,
                         'corrections':list,
                         'interventions':list
                         }).reset_index()

In [15]:
df_agg['created_at'] = pd.to_datetime(df_agg['created_at'])
#df_agg['hour_publication'] = df_agg['created_at'].dt.hour
df_agg.head()

Unnamed: 0,video_id,views_diff,created_at,corrections,interventions
0,100000,"[162.0, 104.0, 1208.0, 2272.0, 1879.0, 1211.0,...",2021-05-31 20:00:14,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,100001,"[310.0, 560.0, 360.0, 4218.0, 3887.0, 3810.0, ...",2021-05-31 17:00:13,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,100002,"[181.0, 309.0, 250.0, 2915.0, 1625.0, 1007.0, ...",2021-05-31 17:00:13,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,100003,"[136.0, 60.0, 31.0, 329.0, 255.0, 213.0, 141.0...",2021-05-31 17:00:13,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,100004,"[181.0, 89.0, 199.0, 590.0, 624.0, 536.0, 419....",2021-05-31 17:00:13,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [16]:
def methodA(x):
    h = 1
    corr = np.array(x[0]).copy()
    reconstructed = np.array(x[1]).copy()
    views = x[2].copy()

    if sum(corr)> 0 :

        for i in range(1, len(views)-1):
            if corr[i] > 0:
                start = np.max([i-h, 0])
                end = np.min([len(views), i+h+1])
                
                expected = np.min([*views[start:i], *views[i+1:end]]) #is it better mean? #with the min we get 0.5342 #with mean 0.57                    
                
                if expected < 0:
                    expected = 0
                if expected > np.sum(views[:i]):
                    expected = np.sum(views[:i])
                corr[i] = corr[i] + expected
                views[i] = -corr[i]

            elif (corr[i] == 0) & (reconstructed[i] > 0):
                start = np.max([i-h, 0])
                end = np.min([len(views), i+h+1])
                
                expected = np.min([*views[start:i], *views[i+1:end]]) #is it better mean? #with the min we get 0.5342 #with mean 0.57                    
                if expected < 0:
                    expected = 0
                if expected > np.sum(views[:i]):
                    expected = np.sum(views[:i])
                if  expected - views[i]> 0:
                    corr[i] = expected - views[i]

    return corr 

df_agg['estimated_corrections'] = df_agg[['corrections', 'interventions','views_diff']].apply(methodA, axis=1)
df_agg.head()

Unnamed: 0,video_id,views_diff,created_at,corrections,interventions,estimated_corrections
0,100000,"[162.0, 104.0, 1208.0, 2272.0, 1879.0, 1211.0,...",2021-05-31 20:00:14,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,100001,"[310.0, 560.0, 360.0, 4218.0, 3887.0, 3810.0, ...",2021-05-31 17:00:13,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,100002,"[181.0, 309.0, 250.0, 2915.0, 1625.0, 1007.0, ...",2021-05-31 17:00:13,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,100003,"[136.0, 60.0, 31.0, 329.0, 255.0, 213.0, 141.0...",2021-05-31 17:00:13,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,100004,"[181.0, 89.0, 199.0, 590.0, 624.0, 536.0, 419....",2021-05-31 17:00:13,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [17]:
df_agg = df_agg[['video_id', 'estimated_corrections']]

In [18]:
s = df_agg.apply(lambda x: pd.Series(x['estimated_corrections']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'corrections_methodB'

In [19]:
df['correctionsB'] = s.values

In [20]:
#update anche le views

df.loc[(df.correctionsB > 0) & (df.corrections > 0), 'views_diff'] = -df[(df.correctionsB > 0) & (df.corrections > 0)].correctionsB.values
df[(df.correctionsB > 0) & (df.corrections == 0)][:10]

Unnamed: 0.1,index,video_id,created_at,views,Unnamed: 0,comments,dislikes,likes,corrections,hour,hour_from_beginning,views_diff,bool_corrections_methodB,interventions,correctionsB
195,193,100001,2021-06-01 16:00:12,23692,193.0,128.0,21.0,239.0,0,16.0,193.0,58.0,1.0,1,171.0
211,209,100001,2021-06-02 08:00:12,26887,209.0,137.0,23.0,251.0,0,8.0,209.0,46.0,1.0,1,3.0
220,218,100001,2021-06-02 17:00:14,27606,218.0,138.0,23.0,257.0,0,17.0,218.0,86.0,1.0,1,18.0
291,289,100001,2021-06-05 16:00:12,30663,289.0,142.0,26.0,264.0,0,16.0,289.0,14.0,1.0,1,6.0
338,336,100001,2021-06-07 16:00:10,31227,336.0,147.0,26.0,269.0,0,16.0,336.0,1.0,1.0,1,5.0
366,363,100002,2021-06-01 16:00:12,9279,363.0,41.0,15.0,75.0,0,16.0,363.0,51.0,1.0,1,38.0
391,388,100002,2021-06-02 17:00:14,10518,388.0,46.0,17.0,81.0,0,17.0,388.0,30.0,1.0,1,5.0
708,703,100004,2021-06-01 16:00:12,4080,703.0,40.0,8.0,53.0,0,16.0,703.0,31.0,1.0,1,17.0
879,873,100005,2021-06-01 16:00:12,5505,873.0,32.0,14.0,44.0,0,16.0,873.0,83.0,1.0,1,90.0
895,889,100005,2021-06-02 08:00:12,14329,889.0,57.0,25.0,73.0,0,8.0,889.0,284.0,1.0,1,14.0


In [15]:
df.head()

Unnamed: 0.2,index,video_id,created_at,views,Unnamed: 0.1,Unnamed: 0,likes,dislikes,comments,channel,views_diff,corrections,hour,hour_from_beginning,bool_corrections_methodB,interventions,correctionsB
0,9388797,---jg-Dv9hY,2022-01-30 19:00:37,1314,9388797.0,9422267.0,28.0,0.0,9.0,UCzKH70qfN_yuXq3s91fdwmg,1314.0,0,19,0,0.0,0,0.0
1,9388798,---jg-Dv9hY,2022-01-30 20:00:43,1317,9388798.0,9422268.0,28.0,0.0,9.0,UCzKH70qfN_yuXq3s91fdwmg,3.0,0,20,1,0.0,0,0.0
2,9388799,---jg-Dv9hY,2022-01-30 21:00:35,1320,9388799.0,9422269.0,28.0,0.0,9.0,UCzKH70qfN_yuXq3s91fdwmg,3.0,0,21,2,0.0,0,0.0
3,9388800,---jg-Dv9hY,2022-01-30 22:00:34,1324,9388800.0,9422270.0,28.0,0.0,9.0,UCzKH70qfN_yuXq3s91fdwmg,4.0,0,22,3,0.0,0,0.0
4,9388801,---jg-Dv9hY,2022-01-30 23:00:35,1328,9388801.0,9422271.0,28.0,0.0,9.0,UCzKH70qfN_yuXq3s91fdwmg,4.0,0,23,4,0.0,0,0.0


In [19]:
df= df[['video_id', 'channel','created_at', 'views', 'views_diff', 'corrections', 'hour', 'hour_from_beginning', 'correctionsB', 'interventions']]

In [20]:
#save the reconstructed data
df.to_csv('/home/castaldm-admin/Desktop/Owncloud/Bot_5minuts/riscrittura2022/videoStatististicsALL_from21_reconstructed.csv', sep=';')