In [18]:
import os
import pandas as pd

# Import all csv files inside the specified folder
folder_path = '/Users/zeruizhang/Downloads/archive'

# Initialize an empty list to hold the dataframes
df_list = []

for file in os.listdir(folder_path):
    if file.endswith('.csv'):
        file_path = os.path.join(folder_path, file)
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Extract the first two characters of the file name to use as the country code
        country_code = file[:2]
        # Add a new column 'country' to the DataFrame with the country code
        df['country'] = country_code
        # Append the DataFrame to the list
        df_list.append(df)

In [19]:
import pandas as pd

def filter_last_3_months(df):
    # Convert columns to datetime
    df['trending_date'] = pd.to_datetime(df['trending_date'], format='%Y-%m-%dT%H:%M:%SZ')
    df['publishedAt'] = pd.to_datetime(df['publishedAt'], format='%Y-%m-%dT%H:%M:%SZ')
    
    # Get the most recent date in the 'trending_date' column
    most_recent_date = df['trending_date'].max()
    
    # Calculate the date 3 months before the most recent date
    three_months_ago = most_recent_date - pd.DateOffset(months=3)
    
    # Filter the DataFrame to only include rows from the last 3 months
    filtered_df = df[df['trending_date'] >= three_months_ago]
    
    return filtered_df

In [20]:
filtered_df_list = [filter_last_3_months(df) for df in df_list]

# Concatenate all the filtered DataFrames into one, name it 'df' (ready to analyze)
df = pd.concat(filtered_df_list, ignore_index=True)

In [21]:
df.shape

(195803, 17)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195803 entries, 0 to 195802
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   video_id           195803 non-null  object        
 1   title              195803 non-null  object        
 2   publishedAt        195803 non-null  datetime64[ns]
 3   channelId          195803 non-null  object        
 4   channelTitle       195803 non-null  object        
 5   categoryId         195803 non-null  int64         
 6   trending_date      195803 non-null  datetime64[ns]
 7   tags               195803 non-null  object        
 8   view_count         195803 non-null  int64         
 9   likes              195803 non-null  int64         
 10  dislikes           195803 non-null  int64         
 11  comment_count      195803 non-null  int64         
 12  thumbnail_link     195803 non-null  object        
 13  comments_disabled  195803 non-null  bool    

In [23]:
df.head()

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description,country
0,b_l1IP_6psY,Watch Out (Official Audio) Sidhu Moose Wala | ...,2023-11-12 06:30:11,UC9ChdqQRCaZmTCwSJ49tcbw,Sidhu Moose Wala,10,2023-11-13,sidhu moosewala|sidhu moosewala latest song|si...,9966312,1859773,0,505944,https://i.ytimg.com/vi/b_l1IP_6psY/default.jpg,False,False,Sidhu Moose Wala Presents Song - Watch OutSing...,CA
1,okAAL53Rj_0,Film Theory: The FNAF Movie Just Changed the L...,2023-11-12 19:05:34,UC3sznuotAs2ohg_U__Jzj_Q,The Film Theorists,1,2023-11-13,fnaf|five nights at freddy’s|five nights at fr...,2255318,157782,0,6009,https://i.ytimg.com/vi/okAAL53Rj_0/default.jpg,False,False,*🆕 Don’t Miss Our Latest Theorywear Drop!*Shop...,CA
2,nxdh69enoSw,Traveling to the Strangest Country on Earth,2023-11-12 18:01:06,UCvK4bOhULCpmLabd2pDMtnA,Yes Theory,24,2023-11-13,yestheory|seek discomfort|yes theory strangers...,595532,38546,0,9362,https://i.ytimg.com/vi/nxdh69enoSw/default.jpg,False,False,Visit http://magictravel.ai/yestheory to enter...,CA
3,idtvY5lN314,We're Giving Away $30 Million in Free Food,2023-11-12 12:00:29,UCAiLfjNXkNv24uhpzUgPa6A,Beast Philanthropy,22,2023-11-13,[None],2883764,303587,0,10743,https://i.ytimg.com/vi/idtvY5lN314/default.jpg,False,False,Thank you to our partner @Ibotta for sponsorin...,CA
4,Z7r26_NqfLw,Can This Iron Chef Turn Panda Express Gourmet?,2023-11-12 14:00:17,UCVjlpEjEY9GpksqbEesJnNA,mrnigelng,23,2023-11-13,nigel ng|uncle roger|nigel ng comedy,963594,50632,0,1627,https://i.ytimg.com/vi/Z7r26_NqfLw/default.jpg,False,False,Use my code UNCLEROGER to get $15 off your fir...,CA


In [24]:
df.tail()

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description,country
195798,k24xKZ_Oajs,Apex Legends: Breakout Gameplay Trailer,2024-02-08 16:00:14,UC0ZV6M2THA81QT9hrVWJG3A,Apex Legends,20,2024-02-13,apex legends|apex legends season 20|apex legen...,1318418,49069,0,2573,https://i.ytimg.com/vi/k24xKZ_Oajs/default.jpg,False,False,Welcome to the biggest Anniversary Season yet!...,FR
195799,ZdpBufCMnyc,JE DORS DANS MA PIÈCE SECRÈTE (et je l’ai enco...,2024-02-07 17:00:09,UCgl_xdd0kH27vMIZnE-_17w,ALEKS,24,2024-02-13,pièce secrète|secret|pièce|dormir|je dors|alek...,156093,10093,0,235,https://i.ytimg.com/vi/ZdpBufCMnyc/default.jpg,False,False,Je DORS dans ma Pièce SECRÈTE et je l'ai améli...,FR
195800,HBQGhbE0jno,Daaaaaalí de Quentin Dupieux : Edouard Baer et...,2024-02-06 22:27:23,UC1ObaaFz4XHVPN2T5IFsU4w,L'Obs,25,2024-02-13,Cinéma|Daaaaaali|Dali|Dupieux|Edouard Baer|Jon...,245785,2019,0,107,https://i.ytimg.com/vi/HBQGhbE0jno/default.jpg,False,False,"Dans le nouveau film de Quentin Dupieux, cinq ...",FR
195801,vVybduQrcF8,Puisque c’est complet en 7h j’rajoute une date 🤟🏾,2024-02-07 17:03:04,UCXdHJabqwLJ3NvPfx6XmS5Q,Ninho,10,2024-02-13,Ninho|Binks to binks 5 Musique|Rap|Clip offici...,108865,5888,0,268,https://i.ytimg.com/vi/vVybduQrcF8/default.jpg,False,False,Puisque c’est complet en 7h j’rajoute une date...,FR
195802,deS1N039wl0,24h avec Apple Vision Pro !,2024-02-06 15:33:23,UCaybrunQi8xWgPMgv1AYBHw,TheiCollection,28,2024-02-13,Vision Pro|Apple Vision Pro|Apple Vision Pro F...,711108,26733,0,1788,https://i.ytimg.com/vi/deS1N039wl0/default.jpg,False,False,Merci à Nord VPN de m'avoir accompagné sur ce ...,FR


In [25]:
min_date = df['trending_date'].min()
max_date = df['trending_date'].max()

In [26]:
min_date

Timestamp('2023-11-13 00:00:00')

In [27]:
max_date

Timestamp('2024-02-13 00:00:00')