In [None]:
import os
import glob
import pycld2 as cld2
import pandas as pd
import numpy as np
from pathlib import Path

# Raise pandas' display limits so wide/long frames are shown in full
# instead of being truncated in the notebook output.
display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## Posts

In [None]:
# All locations are derived from the current working directory:
# parents[1] is the directory two levels above it, which holds `data/`.
data_path = Path.cwd().parents[1] / 'data'

# Top-level data stages.
raw_data_path = data_path / 'raw'
intermediate_data_path = data_path / 'intermediate'
processed_data_path = data_path / 'processed'

# Inputs.
post_path = raw_data_path / 'post'
user_path = raw_data_path / 'users' / 'followers.txt'
todate_path = intermediate_data_path / 'extracted_todate' / 'usernames_todate.txt'

# Output.
processed_post_path = processed_data_path / 'post' / 'processed_data.csv'

In [None]:
from datetime import datetime  # NOTE(review): not used in this cell; kept in case later cells rely on it

# Collect every CSV file in the raw post folder. Path.glob yields files in an
# arbitrary, filesystem-dependent order, so sort them to make the row order of
# `df` reproducible between runs.
post_files = sorted(post_path.glob("*.csv"))
if not post_files:
    # pd.concat fails with an unhelpful "No objects to concatenate" on an
    # empty list; report the actual cause instead.
    raise FileNotFoundError(f"No CSV files found in {post_path}")

# Read each file into its own frame, then concatenate once at the end.
post_appended_data = []
for f in post_files:
    data = pd.read_csv(f)
    post_appended_data.append(data)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(post_appended_data, ignore_index=True)

In [None]:
# Preview the first five rows of the combined frame.
df.head(n=5)

In [None]:
# Number of distinct, non-missing values in the `url` column.
df['url'].nunique(dropna=True)

In [None]:
# Show the raw `caption_hashtags` values of the rows at positions 41 and 42.
hashtag_sample = df['caption_hashtags'].iloc[41:43]
list(hashtag_sample)

We now know how many people have created their own content. We can analyze these people to find information such as:
* Time they are posting
* Mentions
* Topic / Niche
* Sentiment Analysis
* Brand Voice


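**Some columns contain missing values. One possible solution is to remove them with `dropna()`; note, however, that when the result is assigned back to a column of `df`, the missing values are created again for the dropped rows.**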

### Hour to post 

In [None]:
# Parse the `date_utc` column as datetimes and keep only the hour component.
post_timestamps = pd.to_datetime(df['date_utc'])
df['hour'] = post_timestamps.dt.hour

In [None]:
# One bin per hour of day. Explicit integer edges 0..24 keep every bar aligned
# with a whole hour; `bins=24` would instead spread 24 bins over the observed
# min..max range and misalign them when the data does not cover 0..23.
ax = df['hour'].hist(bins=np.arange(25))
# Label the figure so it can be read on its own; the trailing `;` suppresses
# the repr of the returned artist list in the notebook output.
ax.set(title='Posts per hour of day', xlabel='Hour of day (from date_utc)', ylabel='Number of posts');

In [None]:
# Show the caption stored in the first row.
df['caption'].iat[0]

In [None]:
# Show the raw `tagged_users` value of the row at position 11.
df['tagged_users'].iat[11]

In [None]:
# Turn each `caption_hashtags` cell into a list of distinct hashtags.
# Assumes the cells look like a stringified list such as "['a', 'b']" — TODO confirm.
# Each item is stripped of surrounding whitespace and quote characters so that
# "'a'" and " 'a'" count as the same tag, empty items (e.g. from "[]") are
# discarded, and the result is sorted so it is identical on every run.
# Rows removed by dropna() still end up as NaN in the new column, because the
# assignment aligns on the index.
df['hashtag_set'] = (
    df['caption_hashtags']
    .dropna()
    .str.strip('[]')
    .str.split(',')
    .apply(lambda items: sorted({item.strip(" '\"") for item in items} - {''}))
)

In [None]:
# Turn each `tagged_users` cell into a list of distinct user names.
# Assumes the cells look like a stringified list such as "['a', 'b']" — TODO confirm.
# Each item is stripped of surrounding whitespace and quote characters so that
# "'a'" and " 'a'" count as the same user, empty items (e.g. from "[]") are
# discarded, and the result is sorted so it is identical on every run.
# Rows removed by dropna() still end up as NaN in the new column, because the
# assignment aligns on the index.
df['tagged_users_set'] = (
    df['tagged_users']
    .dropna()
    .str.strip('[]')
    .str.split(',')
    .apply(lambda items: sorted({item.strip(" '\"") for item in items} - {''}))
)

In [None]:
# List the column labels of `df`, including the columns added by the cells above.
df.columns

### Grouped Data 

In [None]:
# NOTE: the two helpers below are deliberately lambdas, not `def`s. pandas
# builds the aggregated column labels from the callable's name ('<lambda>', or
# '<lambda_0>' when listed next to other aggregations), and the flattened
# labels created at the end of this cell depend on those names.

# Collect all values of a group into a plain Python list.
collect = lambda values: list(values)
# Join the distinct, non-missing values of a group into one comma-separated
# string. dropna() prevents the TypeError that str.join raises on NaN, and
# sorted() makes the order of the joined values deterministic.
join_unique = lambda values: ','.join(sorted(set(values.dropna().astype(str))))

# One row per `owner_id`, aggregating every post of that owner.
grouped = df.groupby('owner_id', as_index=False).agg(
    {
        'date_utc': collect,                       # all values per owner
        'profile': 'count',                        # number of non-null `profile` rows
        'typename': ['count', collect],
        'mediacount': 'sum',                       # string alias instead of the builtin `sum`
        'caption': collect,
        'caption_hashtags': join_unique,
        'caption_mentions': join_unique,
        'tagged_users': join_unique,
        'is_video': collect,
        'video_view_count': collect,
        'video_duration': collect,
        'likes': ['sum', 'mean', collect],
        'comments': ['sum', 'mean', collect],
        'is_sponsored': collect,
        'sponsor_users': collect,
        'location': collect,
        'days_ago': ['min', collect],
    }
)

# The aggregation yields two-level column labels (column, aggregation).
# Flatten them into single '<column>_<aggregation>' strings; the grouping key
# has an empty second level and therefore becomes 'owner_id_'.
grouped.columns = ["_".join(x) for x in grouped.columns.to_flat_index()]

In [None]:
# Map the flattened '<column>_<aggregation>' labels of the list-valued
# aggregations back to the plain source column names.
rename_columns_dict = {'owner_id_': 'userid'}

# Columns aggregated with a single lambda carry the suffix '_<lambda>'.
rename_columns_dict.update(
    (f'{column}_<lambda>', column)
    for column in (
        'date_utc',
        'caption',
        'caption_hashtags',
        'caption_mentions',
        'tagged_users',
        'is_video',
        'video_view_count',
        'video_duration',
        'is_sponsored',
        'sponsor_users',
        'location',
    )
)

# Columns whose lambda was listed next to other aggregations carry '_<lambda_0>'.
rename_columns_dict.update(
    (f'{column}_<lambda_0>', column)
    for column in ('typename', 'likes', 'comments', 'days_ago')
)

grouped.rename(columns=rename_columns_dict, inplace=True)

In [None]:
# (rows, columns) of the aggregated frame.
grouped.shape