In [0]:
from pyspark.sql import SparkSession
import pandas as pd
import random

# Obtain (or create) the Spark session used to read the catalog tables
spark = (
    SparkSession.builder
    .appName("Databricks_Catalog_Read")
    .getOrCreate()
)

# Catalog and schema that hold the source tables
catalog_name = "workspace"
schema_name = "default"

# Source table name -> name of the notebook-level pandas DataFrame it is loaded into
tables = {
    'geo': 'df_geo',
    'pinterest_data': 'df_pin',
    'user_data': 'df_user',
}

# Read each table through Spark and publish it as a pandas DataFrame under the
# configured global name, so later cells can use df_geo, df_pin and df_user.
for source_table, frame_name in tables.items():
    qualified_table = f"{catalog_name}.{schema_name}.{source_table}"
    globals()[frame_name] = spark.read.table(qualified_table).toPandas()



In [0]:
# Draw 1000 unique random row positions used to subsample the tables.
# The seed is fixed so that "Restart & Run All" selects the same rows every time.
RANDOM_SEED = 42
SAMPLE_SIZE = 1000
# NOTE(review): 11154 is presumably the row count of the source tables - confirm
# against len(df_geo); a position >= the real row count makes .iloc raise IndexError.
POPULATION_SIZE = 11154

random.seed(RANDOM_SEED)
row_numbers = random.sample(range(POPULATION_SIZE), SAMPLE_SIZE)  # unique positions, no repeats


In [0]:
# Keep only the sampled rows (the same positions are taken from every table).
# .copy() detaches each sample from the full table, so the in-place edits made
# further down (new columns, dropped columns) act on an independent frame and
# do not trigger SettingWithCopyWarning.
df_geo = df_geo.iloc[row_numbers].copy()
df_pin = df_pin.iloc[row_numbers].copy()
df_user = df_user.iloc[row_numbers].copy()

In [0]:
# Lift pandas' row-display limit so that displayed DataFrames are shown in full
pd.options.display.max_rows = None

In [0]:
# Display the sampled df_geo frame
df_geo

In [0]:
# Display the sampled df_pin frame
df_pin

In [0]:
# Display the sampled df_user frame
df_user

In [0]:

# Placeholder texts that are treated as missing values: any cell whose text
# matches one of these patterns is replaced by None below.
missing_value_patterns = [
    r'.*User Info Error.*',
    r'.*N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e.*',
    r'.*No description available Story format.*',
    r'.*No Title Data Available.*',
    r'.*Image src error.*',
]

# Empty strings count as missing too. replace() returns a new frame, so the
# result must be assigned back (a bare call leaves df_pin unchanged).
df_pin = df_pin.replace('', None)

# Apply the patterns one after another, in the same order as before
for pattern in missing_value_patterns:
    df_pin = df_pin.replace(to_replace=pattern, value=None, regex=True)

# Map every remaining missing marker recognised by pd.isna to None
df_pin = df_pin.applymap(lambda x: None if pd.isna(x) else x)

# Remove "Local save in " from the 'save_location' column
df_pin['save_location'] = df_pin['save_location'].str.replace('Local save in ', '', regex=False)



# Function to convert follower counts
def convert_to_number(num):
    """Convert a follower count such as '10k', '5M' or '124' to a float.

    Parameters
    ----------
    num : Any
        The raw value. Text is matched case-insensitively; an 'M', 'K' or 'B'
        marker scales the number by 1e6, 1e3 or 1e9 respectively. Values that
        are already numeric (e.g. 10000.0) are accepted as well, which makes
        it safe to apply the function to an already converted column.

    Returns
    -------
    float or None
        The numeric value, or None when the input is missing (per pd.isna),
        cannot be parsed as a number, or is not finite.
    """
    if pd.isna(num):
        return None

    text = str(num).strip().upper()

    # Markers are checked in the order M, K, B; the first one found is removed
    # from the text and decides the multiplier.
    multiplier = 1
    for marker, factor in (('M', 1000000), ('K', 1000), ('B', 1000000000)):
        if marker in text:
            text = text.replace(marker, '')
            multiplier = factor
            break

    try:
        value = float(text)
    except ValueError:
        # Not a number at all (e.g. free text): report it as missing
        return None

    # float() also accepts 'nan' and 'inf'; neither is a usable count
    if value != value or value in (float('inf'), float('-inf')):
        return None

    return value * multiplier

# Turn the follower counts into numbers, rename 'index' to 'ind', and select
# the columns in their final order - all in one chained expression.
df_pin = (
    df_pin
    .assign(follower_count=df_pin['follower_count'].apply(convert_to_number))
    .rename(columns={'index': 'ind'})
    .loc[:, [
        'ind',
        'unique_id',
        'title',
        'description',
        'follower_count',
        'poster_name',
        'tag_list',
        'is_image_or_video',
        'image_src',
        'save_location',
        'category',
    ]]
)



In [0]:
# Display df_geo (still the unmodified sample at this point)
df_geo

In [0]:
# Display df_pin after the cleaning step
df_pin

In [0]:
# Display df_user (still the unmodified sample at this point)
df_user

In [0]:
import pandas as pd

# Combine 'latitude' and 'longitude' into a single 'coordinates' column.
# The guard keeps the cell re-runnable once the two source columns are gone.
if 'latitude' in df_geo.columns and 'longitude' in df_geo.columns:
    # Build the [latitude, longitude] pairs from the named columns; integer
    # access on a label-indexed row (x[0], x[1]) is deprecated in pandas.
    coordinates = pd.Series(
        [[lat, lon] for lat, lon in zip(df_geo['latitude'], df_geo['longitude'])],
        index=df_geo.index,  # keep the pairs aligned with the sampled rows
        dtype=object,
    )
    # assign/drop return new frames, so the sampled slice is not modified in place
    df_geo = df_geo.assign(coordinates=coordinates).drop(columns=['latitude', 'longitude'])

# Convert 'timestamp' to datetime and put the columns in their final order
df_geo = df_geo.assign(timestamp=pd.to_datetime(df_geo['timestamp']))[
    ['ind', 'country', 'coordinates', 'timestamp']
]

df_geo

In [0]:
import pandas as pd

# Merge 'first_name' and 'last_name' into a single 'user_name' column.
# The guard keeps the cell re-runnable once the two source columns are gone.
if 'first_name' in df_user.columns and 'last_name' in df_user.columns:
    # assign/drop return new frames, so the sampled slice is not modified in place
    df_user = df_user.assign(
        user_name=df_user['first_name'] + " " + df_user['last_name']
    ).drop(columns=['first_name', 'last_name'])

# Convert 'date_joined' to datetime and put the columns in their final order
df_user = df_user.assign(date_joined=pd.to_datetime(df_user['date_joined']))[
    ['ind', 'user_name', 'age', 'date_joined']
]

df_user


In [0]:
# Place the three frames side by side. Every index is reset first, so rows are
# matched purely by position.
# NOTE(review): this relies on df_pin, df_user and df_geo listing the same
# records in the same order - confirm, or join on 'ind' instead.
aligned_frames = [frame.reset_index(drop=True) for frame in (df_pin, df_user, df_geo)]
df_combined = pd.concat(aligned_frames, axis=1)

df_combined

In [0]:
#Task 4
# Number of rows for every (country, category) pair
df_category_count = (
    df_combined
    .groupby(['country', 'category'])
    .size()
    .reset_index(name='category_count')
)

# Per country, locate the row with the largest category_count and keep it
top_rows_by_country = df_category_count.groupby('country')['category_count'].idxmax()
df_most_popular_category_by_country = df_category_count.loc[top_rows_by_country]

# Show the result
df_most_popular_category_by_country

In [0]:
#Task 5
# Year of each row, taken from the 'timestamp' column
row_timestamps = pd.to_datetime(df_combined['timestamp'])
df_combined['post_year'] = row_timestamps.dt.year

# Number of rows for every (post_year, category) pair
df_category_count = (
    df_combined
    .groupby(['post_year', 'category'])
    .size()
    .reset_index(name='category_count')
)

# Per year, locate the row with the largest category_count and keep it
top_rows_by_year = df_category_count.groupby('post_year')['category_count'].idxmax()
df_most_popular_category_by_year = df_category_count.loc[top_rows_by_year]

# Show the result
df_most_popular_category_by_year

In [0]:
#Task 6
#Step 1
#find the user with most followers in each country

# One follower count per (country, poster_name).
# max() is used instead of sum(): summing adds the count once per row of the
# same poster, so a poster with several rows would be credited with a multiple
# of their followers.
# NOTE(review): assumes follower_count describes the poster, not the single
# row - confirm against the data.
# Pairs without any known follower count are dropped so idxmax below never
# sees an all-NaN country.
df_followers = (
    df_combined
    .groupby(['country', 'poster_name'])['follower_count']
    .max()
    .dropna()
    .reset_index()
)

# Per country, keep the row of the poster with the highest follower count
df_top_users_per_country = df_followers.loc[
    df_followers.groupby('country')['follower_count'].idxmax()
]

# Display the resulting DataFrame
df_top_users_per_country

In [0]:
#Task 6
#Step 2
# Find the country where the user has the highest follower count
top_row_label = df_top_users_per_country['follower_count'].idxmax()

# Selecting with list-likes on both axes yields the one-row DataFrame directly
# and keeps each column's dtype; going through a row Series and
# .to_frame().T would turn both columns into object dtype.
df_top_country = df_top_users_per_country.loc[[top_row_label], ['country', 'follower_count']]

# Display the final DataFrame (single row)
df_top_country


In [0]:
#Task 7
# Age group bins and labels. With right=True the intervals are
# (0, 24], (24, 35], (35, 50] and (50, inf).
bins = [0, 24, 35, 50, float('inf')]
labels = ['18-24', '25-35', '36-50', '50+']

# Create age_group column based on age
df_combined['age_group'] = pd.cut(df_combined['age'], bins=bins, labels=labels, right=True)

# Count rows per (age_group, category).
# age_group is categorical: observed=True restricts the result to groups that
# actually occur. Without it every unused age group gets zero-count rows and
# idxmax below would report an arbitrary category with count 0 for it.
df_category_count = (
    df_combined
    .groupby(['age_group', 'category'], observed=True)
    .size()
    .reset_index(name='category_count')
)

# Find the most popular category in each age group that has data
df_most_popular_category_by_age_group = df_category_count.loc[
    df_category_count.groupby('age_group', observed=True)['category_count'].idxmax()
]

# Display the resulting DataFrame
df_most_popular_category_by_age_group

In [0]:
#Task 8
# Median follower_count within each age_group
median_by_age_group = df_combined.groupby('age_group')['follower_count'].median()
df_median_followers = median_by_age_group.reset_index(name='median_follower_count')

# Show the result
df_median_followers

In [0]:
#Task 9
# Year taken from 'date_joined', stored in the 'post_year' column
join_dates = pd.to_datetime(df_combined['date_joined'])
df_combined['post_year'] = join_dates.dt.year

# Keep the rows whose year lies in 2015..2020 (both ends included)
joined_2015_to_2020 = df_combined['post_year'].between(2015, 2020)
df_filtered = df_combined[joined_2015_to_2020]

# Number of rows per year
df_users_per_year = (
    df_filtered
    .groupby('post_year')
    .size()
    .reset_index(name='number_users_joined')
)

# Show the result
df_users_per_year


In [0]:
#Task 10
# Year taken from 'date_joined', stored in the 'post_year' column
df_combined['post_year'] = pd.to_datetime(df_combined['date_joined']).dt.year

# Boolean mask for years 2015..2020 (both ends included)
year_from_2015 = df_combined['post_year'] >= 2015
year_to_2020 = df_combined['post_year'] <= 2020
df_filtered = df_combined[year_from_2015 & year_to_2020]

# Median follower_count per year
median_by_year = df_filtered.groupby('post_year')['follower_count'].median()
df_median_followers = median_by_year.reset_index(name='median_follower_count')

# Show the result
df_median_followers

In [0]:
#Task 11
# Year taken from 'date_joined', stored in the 'post_year' column
join_dates = pd.to_datetime(df_combined['date_joined'])
df_combined['post_year'] = join_dates.dt.year

# Keep the rows whose year lies in 2015..2020 (both ends included)
df_filtered = df_combined.loc[df_combined['post_year'].between(2015, 2020)]

# Median follower_count for every (post_year, age_group) pair
df_median_followers_by_age_group = (
    df_filtered
    .groupby(['post_year', 'age_group'])['follower_count']
    .median()
    .reset_index(name='median_follower_count')
)

# Show the result
df_median_followers_by_age_group