In [None]:
from user_posting_emulation import AWSDBConnector
from sqlalchemy import inspect, text
import pandas as pd

In [None]:
def get_topic_data(topic_table, num_rows=2000):
    """Fetch up to ``num_rows`` rows from ``topic_table`` as a list of dicts.

    Parameters
    ----------
    topic_table : str
        Name of the table to read. It must be one of the table names reported
        by the database inspector.
    num_rows : int, default 2000
        Maximum number of rows to return (used as the SQL ``LIMIT``).

    Returns
    -------
    list[dict]
        One ``{column_name: value}`` dict per fetched row.

    Raises
    ------
    ValueError
        If ``topic_table`` is not a table in the connected database.
    """
    # presumably a SQLAlchemy Engine: it is passed to `inspect` and `.connect()` is called on it
    engine = AWSDBConnector().create_db_connector()
    if topic_table not in inspect(engine).get_table_names():
        raise ValueError(f'Given table `{topic_table}` not found in the database.')

    # A table name cannot be sent as a bound parameter. Interpolating it is
    # acceptable here only because it was just checked against the names the
    # database itself reported. The row limit is bound (and coerced to int)
    # instead of being formatted into the statement text.
    query = text(f'SELECT * FROM {topic_table} LIMIT :row_limit;')

    with engine.connect() as conn:
        result = conn.execute(query, {'row_limit': int(num_rows)})
        # Materialise every row while the connection is still open; reading a
        # result after its connection has been released is not guaranteed to work.
        return [dict(row) for row in result.mappings()]

# Pull the Pinterest posts table and flatten the row dicts into a pandas DataFrame.
pin_records = get_topic_data('pinterest_data', num_rows=11154)
df_pin = pd.json_normalize(pin_records)
# df_geo = pd.json_normalize(get_topic_data('geolocation_data', num_rows=11154))
# df_user = pd.json_normalize(get_topic_data('user_data', num_rows=11154))

In [None]:
import multiprocessing
import os

# pandas-on-Spark looks for PYARROW_IGNORE_TIMEZONE while `pyspark.pandas` is
# being imported (and logs a warning when it is missing), so the variable has
# to be exported before that import rather than after it.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
# NOTE(review): this bind address is specific to one machine/network; adjust or
# remove it when running the notebook elsewhere.
os.environ["SPARK_LOCAL_IP"] = "192.168.1.211"

import pyspark
from pyspark.sql import DataFrame
import pyspark.pandas as psd
# NOTE(review): the two wildcard imports rebind Python builtins such as
# `sum`, `min`, `max`, `abs` and `round` to Spark column functions. Prefer
# `from pyspark.sql import functions as F` once no cell relies on the bare names.
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np

pyspark.__version__

In [None]:
cfg = (
    pyspark.SparkConf()
    # Run Spark locally with one worker thread per logical CPU core. The master
    # is set exactly once: a second `setMaster` call would silently replace
    # the first.
    .setMaster("local[*]")
    # Name shown for this application in the Spark UI and logs.
    .setAppName("S3ToSparkConnection")
    # Applied only when the key has not been set already.
    # NOTE(review): in local mode the executors run inside the driver JVM, so
    # `spark.driver.memory` is probably the setting that matters — confirm.
    .setIfMissing("spark.executor.memory", "2g")
)

# Show the single memory setting first, then every configured key/value pair.
print(cfg.get("spark.executor.memory"))
print(cfg.toDebugString())

In [None]:
# Build (or reuse, if one is already running) the SparkSession configured by `cfg`.
session_builder = pyspark.sql.SparkSession.builder.config(conf=cfg)
session = session_builder.getOrCreate()

In [None]:
# Short aliases: `ss` for the session itself, `sc` for its SparkContext.
ss = session
sc = ss.sparkContext

In [None]:
# pin_dfs = ss.createDataFrame(df_pin)
# Hand the pandas frame over to the pandas-on-Spark API.
pin_dfs = psd.from_pandas(df_pin)

In [None]:
# Leftover exploration snippets. NOTE(review): several of these (`show`,
# `printSchema`, `select`, `take`) look like Spark SQL DataFrame calls rather
# than pandas-on-Spark ones — confirm they apply to `pin_dfs` before re-enabling.
# pin_dfs.show(5)
# pin_dfs.show(1, vertical=True)
# pin_dfs.printSchema()
# pin_dfs.select('*').describe().collect()
# pin_dfs.tail(5)
# pin_dfs.take(5)
# pin_dfs.select('title').show()
# Print a summary of the frame.
pin_dfs.info()

In [None]:
# geo_dfs = psd.DataFrame(df_geo)
# user_dfs = psd.DataFrame(df_user)

In [None]:
# NOTE(review): the placeholder-to-None replacement below is disabled, so
# placeholder strings are still present in the frame at this point.
# pin_dfs = pin_dfs.replace(['', 'N/A', 'n/a', 'none', 'None'], None)
# Row count before any filtering is applied.
len(pin_dfs)

## Clean Pin Spark DataFrame

Replace empty entries and entries with no relevant data in each column with `None`. \
Perform the necessary transformations on the `follower_count` column to ensure every entry is a number. \
Make sure the data type of this column is an `int`. \
Ensure that each column containing numeric data has a numeric data type. \
Clean the data in the `save_location` column to include only the save location path. \
Rename the `index` column to `ind`. \
Reorder the `DataFrame` columns to have the following column order: \
    `ind` \
    `unique_id` \
    `title` \
    `description` \
    `follower_count` \
    `poster_name` \
    `tag_list` \
    `is_image_or_video` \
    `image_src` \
    `save_location` \
    `category`

### Make sure unique IDs are in the correct format

In [None]:
# Keep only rows whose `unique_id` is, in its entirety, a UUID-shaped string
# (8-4-4-4-12 groups of lowercase letters/digits). `str.match` anchors the
# pattern at the start of the value only, so the trailing `$` is what rejects
# IDs followed by extra characters.
uuid_regex = r'[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12}$'
# `na=False` makes missing IDs count explicitly as "no match".
pin_dfs = pin_dfs[pin_dfs['unique_id'].str.match(uuid_regex, na=False)]
len(pin_dfs)

### Clean up and transform `follower_count`

In [None]:
# Keep only rows whose `follower_count` is digits with an optional `k`/`M`
# suffix and nothing else. `str.match` anchors at the start only; without the
# trailing `$`, a value such as `12abc` would pass this filter and then fail
# when it is converted to an integer.
follower_regex = r'[0-9]+[kM]?$'
# `na=False` makes missing counts count explicitly as "no match".
pin_dfs = pin_dfs[pin_dfs['follower_count'].str.match(follower_regex, na=False)]
len(pin_dfs)

In [None]:
def transform_followers_count(x: str) -> np.int64:
    """Convert a follower-count string such as ``'892'``, ``'10k'`` or ``'3M'`` to an integer.

    Parameters
    ----------
    x : str
        Digits, optionally followed by ``k`` (thousands) or ``M`` (millions).

    Returns
    -------
    int
        The expanded count, e.g. ``'10k'`` -> ``10000``.

    Raises
    ------
    ValueError
        If the numeric part of ``x`` cannot be parsed by ``int``.
    """
    suffix_multipliers = {'k': 1_000, 'M': 1_000_000}
    # `x[-1:]` is '' for an empty string, which falls through to `int(x)` and
    # raises ValueError, just as a plain `int('')` would.
    multiplier = suffix_multipliers.get(x[-1:])
    if multiplier is None:
        return int(x)
    return multiplier * int(x[:-1])

# Expand the `k`/`M` suffixes and store the numeric counts back in place of the strings.
follower_counts_numeric = pin_dfs['follower_count'].apply(transform_followers_count)
pin_dfs['follower_count'] = follower_counts_numeric

In [None]:
# Rename `index` to `ind` by reassignment rather than `inplace=True`, so the
# cell yields a new frame instead of mutating one that earlier cells displayed.
pin_dfs = pin_dfs.rename(columns={'index': 'ind'})

In [None]:
# Strip the literal prefix so only the path part of `save_location` remains.
save_prefix = 'Local save in '
pin_dfs['save_location'] = pin_dfs['save_location'].str.replace(save_prefix, '', regex=False)

In [None]:
# Select the columns in the order required for the cleaned Pinterest frame.
ordered_columns = [
    'ind',
    'unique_id',
    'title',
    'description',
    'follower_count',
    'poster_name',
    'tag_list',
    'is_image_or_video',
    'image_src',
    'save_location',
    'category',
]
pin_dfs = pin_dfs[ordered_columns]