### Libraries Required

In [1]:
import pandas as pd 
import numpy as np
from tqdm import tqdm
import datetime

#### Bonus A
<p>  Sort the users in terms of number of followers and divide them into two groups: for the first group, take only the top 10% regarding "followers", and for the second one, take the rest. <br>
 Now compare the mean of time intervals between posts for the two categories. <br> Do you notice something relevant?</p>

#### *Approach*:

##### Task divided in three parts:
 - Order users descending by number of followers from instagram_profiles.csv and split them into two groups
 - Create a post_finder function to get the cts (timestamp at which the post was created) of every post for each group
 - Calculate the mean of timedeltas for each group

In [2]:
## Sort the users in terms of number of followers and divide them into two groups (named a and b):

TOP_FRACTION = 0.1   # share of the profiles, ranked by followers, that forms group a

# Read the profiles file chunk by chunk, dropping rows with a missing sid or followers value.
appended_data = [
    chunk.dropna()
    for chunk in tqdm(pd.read_csv('~/instagram_profiles.zip', delimiter='\t', usecols=['sid','followers'], chunksize=500000))
]

df_followers = pd.concat(appended_data)

# Cast both columns to integers (safe here because rows with missing values were dropped above).
df_followers = df_followers.astype({'followers': 'int', 'sid': 'int'})

result = df_followers.sort_values(by=['followers'], ascending=False)

# Positional split of the sorted frame: first TOP_FRACTION of the rows -> a, remainder -> b.
# Plain .iloc slicing is used instead of np.split, which relies on the deprecated
# DataFrame.swapaxes and emits a FutureWarning on recent pandas versions.
cut = int(TOP_FRACTION * len(result))
a = result.iloc[:cut]
b = result.iloc[cut:]

ten_list = a['sid'].tolist()    ## sid needed as a list for post_finder function input
rest_list = b['sid'].tolist()   ## sid needed as a list for post_finder function input

print(a.shape, b.shape)

10it [00:12,  1.20s/it]


(345277, 2) (3107494, 2)


In [3]:
# function to get the posts' datetime information from instagram_posts.zip

def post_finder(lis, path='~/instagram_posts.zip', chunksize=500000):
    
    ''' Collect the creation timestamps of the posts written by a set of profiles.

        Args:
            lis: list of sid values (from the profiles file) matched against the
                 sid_profile column of the posts file.
            path: location of the tab-separated posts file; defaults to the
                  original hard-coded '~/instagram_posts.zip'.
            chunksize: number of rows read per chunk.

        Returns: pandas DataFrame with the columns sid_profile and cts (converted
                 to pandas datetimes), one row per matching post. When no post
                 matches, an empty DataFrame with the same two columns is returned
                 instead of raising. '''
    
    # Convert the sid collection once, so it is not rebuilt from a Python list for every chunk.
    wanted = pd.Index(list(lis))

    big_lista = []
    for chunk in tqdm(pd.read_csv(path, delimiter='\t', usecols=['sid_profile','cts'], chunksize=chunksize)):
        
        chunk = chunk.dropna()     ## Remove missing values
        result = chunk.loc[chunk['sid_profile'].isin(wanted)]

        if not result.empty:
            big_lista.append(result)

    if big_lista:
        data = pd.concat(big_lista)
    else:
        # pd.concat([]) raises ValueError; fall back to an empty frame with the expected columns.
        data = pd.DataFrame(columns=['sid_profile', 'cts'])

    data['cts'] = pd.to_datetime(data['cts'])  ## converts values to pandas datetime objects.
    print("shape :", data.shape)
    
    return data  ## DataFrame with the cts of the posts whose sid_profile is in the input list

In [23]:
## top 10 %

# Fetch the post timestamps for the profiles listed in ten_list.
data_10 = post_finder(ten_list)

## mean of time intervals between posts:

# Sort by profile first and by creation time second, so adjacent rows of the
# same profile are consecutive posts.
df_top = data_10.sort_values(by=['sid_profile', 'cts'])
df_top = df_top.assign(cts=pd.to_datetime(df_top['cts'], utc=True))

# Difference of every row to the row directly above it, for both columns.
y = df_top.diff()

# A zero difference in sid_profile means the row above belongs to the same
# profile, so only those cts differences are real gaps between two posts of one user.
same_profile = y['sid_profile'].eq(0)
result1 = y.loc[same_profile, 'cts'].mean()
print(" mean of time intervals between posts for 10 % users regarding 'followers' :", str(result1).partition(".")[0])

86it [03:34,  2.49s/it]


shape : (4861946, 2)
 mean of time intervals between posts for 10 % : 22 days 09:14:56


In [24]:
## rest

# Fetch the post timestamps for the profiles listed in rest_list.
data_rest = post_finder(rest_list)

## mean of time intervals between posts:

# Sort by profile first and by creation time second, so adjacent rows of the
# same profile are consecutive posts.
df_rest = data_rest.sort_values(by=['sid_profile', 'cts'])
df_rest = df_rest.assign(cts=pd.to_datetime(df_rest['cts'], utc=True))

# Difference of every row to the row directly above it, for both columns.
z = df_rest.diff()

# Keep only the rows whose predecessor has the same sid_profile (difference of
# zero); their cts differences are the gaps between two posts of one user.
same_profile = z['sid_profile'].eq(0)
result2 = z.loc[same_profile, 'cts'].mean()
print(" mean of time intervals between posts for rest users regarding 'followers' :", str(result2).partition(".")[0])

86it [04:14,  2.96s/it]


shape : (22268940, 2)
 mean of time intervals between posts for rest users regarding 'followers' : 20 days 02:38:57


**Comment :** <br>
<br>The data show that the mean time interval between posts is very similar for the two groups: about 22 days for the top 10% of users by followers and about 20 days for the remaining users.
<br> Intuitively, we expected a different result: more influential profiles (those with more followers) posting more frequently than the others.