### Explore Then Implement

In [8]:
import os
import re
import sys

# The notebook's working directory is the anchor for locating the project.
BASE_DIR = os.getcwd()

# Make the parent of the working directory importable.
sys.path.append(os.path.abspath(os.path.join(BASE_DIR, os.pardir)))

from frontend.transformation import (
    transform_google_locations, transform_google_trend, 
    transform_twitter_hashflags, transform_twitter_locations, transform_twitter_trend
)

In [9]:
import numpy as np

In [10]:
# Load the trend table and keep a single row (the first one encountered) per trend name.
df = transform_twitter_trend().drop_duplicates(subset=['trend'])

# Keep only the rows whose 'meta_description' is neither null nor the empty string.
df = df.loc[lambda frame: frame['meta_description'].notna() & frame['meta_description'].ne('')]

# Function to extract the numeric part of the 'meta_description' and handle 'K' and 'M'
def extract_numeric(meta_desc):
    """Convert a post-count description such as '37.9K posts' into an integer.

    The first number found in the text is used. Thousands separators are
    ignored, a decimal fraction is honoured, and a 'K' or 'M' written directly
    after the number (any case, optional whitespace in between) scales it by
    one thousand or one million respectively.

    Args:
        meta_desc: Description text, e.g. '2,054 posts' or '2.76M posts'.

    Returns:
        The post count as an int, or np.nan when ``meta_desc`` is not a string
        or contains no number.
    """
    if not isinstance(meta_desc, str):
        return np.nan

    # Strip thousands separators so '9,969' is read as 9969.
    cleaned = meta_desc.replace(',', '')

    # The suffix must sit right after the number and must not be the start of
    # a longer word, so a stray 'k' or 'm' elsewhere in the text is never
    # mistaken for a multiplier.
    match = re.search(r'(\d+(?:\.\d+)?)\s*([KkMm])?(?![A-Za-z])', cleaned)
    if match is None:
        return np.nan

    multipliers = {'': 1, 'K': 1000, 'M': 1000000}
    suffix = (match.group(2) or '').upper()

    # Parse as float so the decimal point keeps its meaning ('2.76M' is
    # 2,760,000, not 276,000,000); round() removes floating-point noise.
    return int(round(float(match.group(1)) * multipliers[suffix]))

# Attach a numeric post count to every row so the trends can be ranked.
# assign() builds a new frame rather than writing a column into the filtered
# one, which avoids pandas' chained-assignment (SettingWithCopy) warning.
df = df.assign(meta_description_numeric=df['meta_description'].apply(extract_numeric))

# Rank by post count, largest first; rows without a parsed count sort to the end.
df_sorted = df.sort_values('meta_description_numeric', ascending=False)

# Keep only the display columns. This projection already leaves the helper
# column behind, so no separate drop step is needed.
df_sorted = df_sorted[['trend', 'meta_description', 'domain_context']]

# Display the final sorted DataFrame
print(df_sorted)


  df = pd.read_sql(query, conn)


                  trend                                   meta_description  \
13142             Trump                                        2.76M posts   
15112     BREAKING NEWS                                        1.81M posts   
5306               DOGE                                        1.48M posts   
2350              USAID                                        1.41M posts   
2358               Musk                                        1.36M posts   
...                 ...                                                ...   
48759          Berlinie                                        1,002 posts   
6749             Filiks                                        1,001 posts   
45367     #GenerativeAI                                        1,001 posts   
2601   #DiversiónBetfun                              Promoted by betfun.ok   
52198        #BirOluruz  Promoted by T.C. Çevre, Şehircilik ve İklim Dğ...   

             domain_context  
13142              Politics  
151

In [11]:
def twitter_top_ten():
    """Return the ten Twitter trends with the highest reported post counts.

    Trends are de-duplicated by name, rows without a 'meta_description' are
    discarded, and the remaining rows are ranked by the post count parsed from
    that description (decimals and an attached K/M suffix are honoured).

    Returns:
        DataFrame with the columns 'trend', 'meta_description' and
        'domain_context', holding at most ten rows ordered from the largest
        post count to the smallest. Rows whose description contains no number
        rank last.
    """
    df = transform_twitter_trend()

    # NOTE(review): this keeps the first row seen for each trend, which is not
    # necessarily the row with its highest or latest count — confirm intended.
    df = df.drop_duplicates(subset=['trend'])

    # Discard rows whose 'meta_description' is null or empty.
    df = df[df['meta_description'].notna() & (df['meta_description'] != '')]

    def extract_numeric(meta_desc):
        """Parse the first number in ``meta_desc`` into an int, or return np.nan."""
        if not isinstance(meta_desc, str):
            return np.nan

        # Strip thousands separators so '9,969' is read as 9969.
        cleaned = meta_desc.replace(',', '')

        # The suffix must follow the number directly and must not start a
        # longer word, so unrelated 'k'/'m' letters are not treated as scale.
        match = re.search(r'(\d+(?:\.\d+)?)\s*([KkMm])?(?![A-Za-z])', cleaned)
        if match is None:
            return np.nan

        scale = {'': 1, 'K': 1000, 'M': 1000000}[(match.group(2) or '').upper()]

        # float() keeps the decimal point meaningful ('2.76M' -> 2,760,000);
        # round() removes floating-point noise before the int conversion.
        return int(round(float(match.group(1)) * scale))

    # assign() returns a new frame, so the filtered frame is never written to.
    ranked = df.assign(
        meta_description_numeric=df['meta_description'].apply(extract_numeric)
    ).sort_values('meta_description_numeric', ascending=False)

    # Projecting the display columns also leaves the helper column behind.
    return ranked[['trend', 'meta_description', 'domain_context']].head(10)


In [12]:
import pandas as pd

In [13]:
def twitter_trends_location():
    """Join Twitter trends to their locations and return the combined view.

    Locations and trends are matched on 'location_id' with an inner join, so
    only rows present on both sides are kept.

    Returns:
        DataFrame with the columns 'country', 'trend', 'domain_context' and
        'url', in that order.
    """
    locations = transform_twitter_locations()
    trends = transform_twitter_trend()

    # Inner join: a location without trends (or a trend without a location) is dropped.
    joined = locations.merge(trends, on='location_id', how='inner')

    return joined[['country', 'trend', 'domain_context', 'url']]

In [14]:
# Reload the unfiltered trend table and display it as the cell output.
df = transform_twitter_trend()
df

  df = pd.read_sql(query, conn)


Unnamed: 0,id,trend,position,meta_description,domain_context,url,last_updated,location_id
0,1,Fantastic Four,0,,Entertainment,twitter://search/?query=%22Fantastic+Four%22&s...,2025-02-04 22:48:44.048739,-7293673535050703919
1,2,Netanyahu,1,,Politics,twitter://search/?query=Netanyahu&src=trend_cl...,2025-02-04 22:48:44.048739,-7293673535050703919
2,3,Jessica Alba,2,"2,054 posts",Entertainment,twitter://search/?query=%22Jessica+Alba%22&src...,2025-02-04 22:48:44.048739,-7293673535050703919
3,4,OSHA,3,37.9K posts,Politics,twitter://search/?query=OSHA&src=trend_click&p...,2025-02-04 22:48:44.048739,-7293673535050703919
4,5,Waffle House,4,"7,327 posts",Chain restaurants,twitter://search/?query=%22Waffle+House%22&src...,2025-02-04 22:48:44.048739,-7293673535050703919
...,...,...,...,...,...,...,...,...
56827,56828,Methylene Blue,14,"5,248 posts",Trending in United States,twitter://search/?query=%22Methylene+Blue%22&s...,2025-02-06 01:17:40.900583,-2999473465018429484
56828,56829,Deebo,15,"2,195 posts",Sports,twitter://search/?query=Deebo&src=trend_click&...,2025-02-06 01:17:40.900583,-2999473465018429484
56829,56830,Drake Bell,16,"4,866 posts",Entertainment,twitter://search/?query=%22Drake+Bell%22&src=t...,2025-02-06 01:17:40.900583,-2999473465018429484
56830,56831,James Harrison,17,"3,513 posts",Sports,twitter://search/?query=%22James+Harrison%22&s...,2025-02-06 01:17:40.900583,-2999473465018429484


In [15]:
# Newest trends: the most recently updated trends for each domain context.

# Only the 'trend', 'domain_context' and 'last_updated' columns are shown.


def top5_per_context():
    """Return up to five of the most recently updated trends per domain context.

    Returns:
        DataFrame with the columns 'trend', 'domain_context' and
        'last_updated', ordered from newest to oldest, with a fresh 0..n-1
        index.
    """
    columns = ['trend', 'domain_context', 'last_updated']

    # Newest rows first, so head(5) below picks the latest entries of each group.
    newest_first = transform_twitter_trend()[columns].sort_values('last_updated', ascending=False)

    # head() on a groupby keeps the overall row order while capping each group at five rows.
    newest_per_context = newest_first.groupby('domain_context').head(5)

    return newest_per_context.reset_index(drop=True)

# Run the helper so its result is rendered as the cell output.
top5_per_context()

  df = pd.read_sql(query, conn)


Unnamed: 0,trend,domain_context,last_updated
0,Spartacus,Politics,2025-02-06 01:17:40.900583
1,Druski,Trending in United States,2025-02-06 01:17:40.900583
2,Mitch McConnell,Politics,2025-02-06 01:17:40.900583
3,Refund,Trending in United States,2025-02-06 01:17:40.900583
4,Hillary,Politics,2025-02-06 01:17:40.900583
...,...,...,...
537,El Salvador,Trending in Washington,2025-02-04 23:01:02.143000
538,Sanji,Television,2025-02-04 23:00:13.811278
539,Marcus Smith,NFL,2025-02-04 22:51:26.668849
540,Sanji,Television,2025-02-04 22:51:25.312332


In [26]:
# Show every row recorded for a single trend name.
values = 'Trump'
df = transform_twitter_trend()
filtered_df = df.loc[df['trend'].eq(values)]
filtered_df

  df = pd.read_sql(query, conn)


Unnamed: 0,id,trend,position,meta_description,domain_context,url,last_updated,location_id
13142,13143,Trump,19,2.76M posts,Politics,twitter://search/?query=Trump&src=trend_click&...,2025-02-05 01:31:33.910106,6731622963572903665
13331,13332,Trump,19,2.76M posts,Politics,twitter://search/?query=Trump&src=trend_click&...,2025-02-05 01:31:59.511952,7825301456499146803
14177,14178,Trump,0,2.8M posts,Politics,twitter://search/?query=Trump&src=trend_click&...,2025-02-05 02:00:54.798498,-2152060852187384679
16500,16501,Trump,19,3.66M posts,Politics,twitter://search/?query=Trump&src=trend_click&...,2025-02-05 02:32:20.511772,-625131485298729454
16888,16889,Trump,17,3.66M posts,Politics,twitter://search/?query=Trump&src=trend_click&...,2025-02-05 02:33:03.060621,-7608764736147602991
...,...,...,...,...,...,...,...,...
39326,39327,Trump,11,4.15M posts,Politics,twitter://search/?query=Trump&src=trend_click&...,2025-02-05 14:30:24.422314,9063907876614026187
39501,39502,Trump,10,4.15M posts,Politics,twitter://search/?query=Trump&src=trend_click&...,2025-02-05 14:30:54.403220,149318359618127277
40238,40239,Trump,19,4.77M posts,Politics,twitter://search/?query=Trump&src=trend_click&...,2025-02-05 14:42:35.552761,-1021290998162208791
41497,41498,Trump,19,5.28M posts,Politics,twitter://search/?query=Trump&src=trend_click&...,2025-02-05 15:39:07.796791,-1769440319492945944


In [29]:
def trend_growth():
    """Return the rows of every trend that was recorded more than three times.

    Returns:
        DataFrame with the columns 'trend', 'meta_description' and
        'last_updated', in the original row order and with the original index.
    """
    trends = transform_twitter_trend()

    # Number of rows recorded for each trend name.
    occurrences = trends['trend'].value_counts()

    # Strictly more than three rows are required for a trend to be kept.
    recurring = occurrences.loc[occurrences.gt(3)].index
    is_recurring = trends['trend'].isin(recurring)

    return trends.loc[is_recurring, ['trend', 'meta_description', 'last_updated']]
# Run the helper so its result is rendered as the cell output.
trend_growth()

  df = pd.read_sql(query, conn)


Unnamed: 0,trend,meta_description,last_updated
0,Fantastic Four,,2025-02-04 22:48:44.048739
1,Netanyahu,,2025-02-04 22:48:44.048739
2,Jessica Alba,"2,054 posts",2025-02-04 22:48:44.048739
3,OSHA,37.9K posts,2025-02-04 22:48:44.048739
4,Waffle House,"7,327 posts",2025-02-04 22:48:44.048739
...,...,...,...
58971,Jurassic World,81.9K posts,2025-02-06 01:33:16.724410
58972,Missouri,"7,838 posts",2025-02-06 01:33:16.724410
58973,Kuminga,"3,973 posts",2025-02-06 01:33:16.724410
58974,Jimmy Butler,,2025-02-06 01:33:16.724410
