In [1]:
pip install pandas pyarrow




In [5]:
import os
import pandas as pd

def clean_parquet_file(input_filename, output_filename, base_dir=None):
    """
    Clean a Parquet dataset by keeping selected columns and parsing dates.

    The input file is read, reduced to the 'published_date' and 'title'
    columns, 'published_date' is converted to datetime (values that cannot be
    parsed become NaT), rows without a valid date are dropped, and the result
    is written back as Parquet without the index.

    Read failures, missing required columns, and write failures are reported
    with print() rather than raised; in the first two cases nothing is written.

    Parameters:
    - input_filename: str, name of the input Parquet file inside the base directory.
    - output_filename: str, name of the output Parquet file written to the base directory.
    - base_dir: str or None, directory both file names are resolved against.
      When None (the default), the directory two levels above the current
      working directory is used.

    Returns:
    - None
    """
    if base_dir is None:
        # Default location: two levels above the current working directory
        base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

    input_path = os.path.join(base_dir, input_filename)
    output_path = os.path.join(base_dir, output_filename)

    print(f"Reading data from: {input_path}")

    # Read the Parquet file
    try:
        df = pd.read_parquet(input_path)
    except FileNotFoundError:
        print(f"Error: The file {input_path} does not exist.")
        return
    except Exception as e:
        print(f"An error occurred while reading the Parquet file: {e}")
        return

    # Check if required columns exist
    required_columns = ['published_date', 'title']
    if not all(col in df.columns for col in required_columns):
        print(f"Error: The input file must contain the columns: {required_columns}")
        return

    # Select only the required columns. The explicit copy makes the frame
    # independent of the one that was loaded, so the column assignment below
    # is not an assignment into a slice (no SettingWithCopyWarning).
    df = df[required_columns].copy()

    # Convert 'published_date' to datetime format; unparseable values become NaT
    print("Converting 'published_date' to datetime format...")
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')

    # Drop rows whose date could not be parsed (or was missing) and report how many
    initial_row_count = len(df)
    df = df.dropna(subset=['published_date'])
    final_row_count = len(df)
    if final_row_count < initial_row_count:
        print(f"Dropped {initial_row_count - final_row_count} rows due to invalid dates.")

    # Export the cleaned DataFrame back to a Parquet file
    print(f"Writing cleaned data to: {output_path}")
    try:
        df.to_parquet(output_path, index=False)
        print("Data successfully cleaned and exported.")
    except Exception as e:
        print(f"An error occurred while writing the Parquet file: {e}")

if __name__ == "__main__":
    # Names of the raw input and the cleaned output; clean_parquet_file decides
    # which directory they are resolved against.
    INPUT_FILE = 'news_v1.parquet'   # Replace with your input file name
    OUTPUT_FILE = 'cleaned_dataset.parquet'  # Replace with desired output file name

    # Run the cleaning step once with the names chosen above
    clean_parquet_file(
        input_filename=INPUT_FILE,
        output_filename=OUTPUT_FILE,
    )


Reading data from: C:\Users\msfal\dl\news_v1.parquet
Converting 'published_date' to datetime format...
Writing cleaned data to: C:\Users\msfal\dl\cleaned_dataset.parquet
Data successfully cleaned and exported.


In [10]:
# Step 2: Import the library
import pandas as pd

# Read the Parquet file named item_120_train.parquet into a DataFrame
file_path = './dataset/final_datasets/item_120_train.parquet'  # Replace with your file path or URL
df = pd.read_parquet(file_path)

# Show the top of the frame: the first ten rows
print("First 10 records:")
display(df.iloc[:10])

# Show the bottom of the frame: the last ten rows
print("Last 10 records:")
display(df.iloc[-10:])

First 10 records:


Unnamed: 0,date,item_code,price
0,2022-01-01,120,3.924919
1,2022-01-02,120,3.942143
2,2022-01-03,120,3.935
3,2022-01-04,120,3.93197
4,2022-01-05,120,3.93403
5,2022-01-06,120,3.936109
6,2022-01-07,120,3.929495
7,2022-01-08,120,3.9343
8,2022-01-09,120,3.93781
9,2022-01-10,120,4.038353


Last 10 records:


Unnamed: 0,date,item_code,price
535,2023-07-22,120,4.136316
536,2023-07-23,120,4.125301
537,2023-07-24,120,4.130374
538,2023-07-25,120,4.142466
539,2023-07-26,120,4.14726
540,2023-07-27,120,4.146622
541,2023-07-28,120,4.143125
542,2023-07-29,120,4.142073
543,2023-07-30,120,4.144231
544,2023-07-31,120,4.142784


In [9]:
# Step 2: Import the library
import pandas as pd

# Read the Parquet file named item_1_validation.parquet into a DataFrame
# NOTE(review): the output recorded for this cell shows item_code 120 although
# the file name says item_1 - confirm the file name/contents are the intended ones.
file_path = './dataset/final_datasets/item_1_validation.parquet'  # Replace with your file path or URL
df = pd.read_parquet(file_path)

# Show the top of the frame: the first ten rows
print("First 10 records:")
display(df.iloc[:10])

# Show the bottom of the frame: the last ten rows
print("Last 10 records:")
display(df.iloc[-10:])

First 10 records:


Unnamed: 0,date,item_code,price
0,2023-08-01,120,4.149285
1,2023-08-02,120,4.147917
2,2023-08-03,120,4.145395
3,2023-08-04,120,4.24
4,2023-08-05,120,4.219444
5,2023-08-06,120,4.135294
6,2023-08-07,120,4.147282
7,2023-08-08,120,4.141666
8,2023-08-09,120,4.139131
9,2023-08-10,120,4.134459


Last 10 records:


Unnamed: 0,date,item_code,price
127,2023-12-21,120,4.134862
128,2023-12-22,120,4.141666
129,2023-12-23,120,4.121084
130,2023-12-24,120,4.075325
131,2023-12-26,120,4.086301
132,2023-12-27,120,4.14315
133,2023-12-28,120,4.156522
134,2023-12-29,120,4.149324
135,2023-12-30,120,4.144231
136,2023-12-31,120,4.137324


In [13]:

# Step 3: Load the Parquet file
# NOTE(review): clean_parquet_file writes its output two directories above the
# working directory, while this path is relative to the working directory itself -
# confirm this is the file that was intended.
file_path = 'cleaned_dataset.parquet'  # Replace with your actual file path or URL

try:
    df = pd.read_parquet(file_path)
    print("Parquet file loaded successfully.")
except FileNotFoundError:
    print(f"Error: The file at {file_path} was not found.")
    # Stop here: carrying on would hit an undefined `df`, or silently reuse a
    # stale `df` left in the kernel by an earlier cell.
    raise
except Exception as e:
    print(f"An error occurred while loading the Parquet file: {e}")
    raise

# Step 4: Inspect the DataFrame
display(df.head())
df.info()

# Step 5: Prepare the date column
date_column = 'published_date'  # Replace with your actual date column name

if not pd.api.types.is_datetime64_any_dtype(df[date_column]):
    try:
        df[date_column] = pd.to_datetime(df[date_column])
        print(f"Converted '{date_column}' to datetime format.")
    except Exception as e:
        print(f"Error converting '{date_column}' to datetime: {e}")
        # The date comparisons below require a datetime column, so do not continue
        raise

# Step 6: Sort the DataFrame by date
df_sorted = df.sort_values(by=date_column).reset_index(drop=True)
print("DataFrame sorted by date.")

# Step 7: Define date ranges (both start and end are calendar dates, end inclusive)
# NOTE(review): the item price files previewed earlier in this notebook switch from
# train to validation between 2023-07-31 and 2023-08-01, one day earlier than the
# boundary used here - confirm the offset is intended.
train_start = pd.Timestamp('2022-01-01')
train_end = pd.Timestamp('2023-08-01')

valid_start = pd.Timestamp('2023-08-02')
valid_end = pd.Timestamp('2023-12-31')

test_start = pd.Timestamp('2024-01-01')
test_end = pd.Timestamp('2024-10-31')

# Step 8: Split the DataFrame
# Each range is applied as the half-open interval [start, end + 1 day), so a row
# stamped later than midnight on an end date still belongs to that split instead
# of falling into the gap before the next start. For midnight-only timestamps
# this selects exactly the same rows as an inclusive `<= end` comparison.
one_day = pd.Timedelta(days=1)
dates = df_sorted[date_column]

train_df = df_sorted[(dates >= train_start) & (dates < train_end + one_day)]
print(f"Train dataset: {train_df.shape[0]} records.")

valid_df = df_sorted[(dates >= valid_start) & (dates < valid_end + one_day)]
print(f"Validation dataset: {valid_df.shape[0]} records.")

test_df = df_sorted[(dates >= test_start) & (dates < test_end + one_day)]
print(f"Test dataset: {test_df.shape[0]} records.")

# Make rows that belong to none of the three ranges visible instead of dropping
# them without a trace.
unassigned_count = len(df_sorted) - (len(train_df) + len(valid_df) + len(test_df))
if unassigned_count > 0:
    print(f"Note: {unassigned_count} records fall outside all date ranges and are not exported.")

# Step 9: Export to Parquet files
train_output_path = 'train_dataset.parquet'
valid_output_path = 'validation_dataset.parquet'
test_output_path = 'test_dataset.parquet'

# Each export is attempted independently: a failure is reported and the
# remaining splits are still written.
for split_name, split_df, split_path in (
    ("Train", train_df, train_output_path),
    ("Validation", valid_df, valid_output_path),
    ("Test", test_df, test_output_path),
):
    try:
        split_df.to_parquet(split_path, index=False)
        print(f"{split_name} dataset exported to {split_path}.")
    except Exception as e:
        print(f"Error exporting {split_name} dataset: {e}")

# Step 10: (Optional) Verify the exported files
# Uncomment the following lines if you wish to verify the exports

# train_loaded = pd.read_parquet(train_output_path)
# print("Train dataset loaded successfully.")
# display(train_loaded.head())

# valid_loaded = pd.read_parquet(valid_output_path)
# print("Validation dataset loaded successfully.")
# display(valid_loaded.head())

# test_loaded = pd.read_parquet(test_output_path)
# print("Test dataset loaded successfully.")
# display(test_loaded.head())


Parquet file loaded successfully.


Unnamed: 0,published_date,title
0,2024-11-15,Cabinet congratulates PM Anwar on receiving Pe...
1,2024-11-15,State Budget 2025: Sabah plans to impose sales...
2,2024-11-15,Cops nab four undergrads believed to be part o...
3,2024-11-15,Flooding in Teluk Intan caused by high tide ph...
4,2024-11-15,Interim report on separation of AG's powers to...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44815 entries, 0 to 44814
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   published_date  44815 non-null  datetime64[ns]
 1   title           44815 non-null  object        
dtypes: datetime64[ns](1), object(1)
memory usage: 700.4+ KB
DataFrame sorted by date.
Train dataset: 0 records.
Validation dataset: 78 records.
Test dataset: 42122 records.
Train dataset exported to train_dataset.parquet.
Validation dataset exported to validation_dataset.parquet.
Test dataset exported to test_dataset.parquet.
