In [1]:
%pip install kagglehub

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import shutil
import kagglehub
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Download latest version
path1 = kagglehub.dataset_download("kazanova/sentiment140")
path2 = kagglehub.dataset_download("yasserh/twitter-tweets-sentiment-dataset")
path3 = kagglehub.dataset_download("saurabhshahane/twitter-sentiment-dataset")

print("Path to dataset files:", path1)
print("Path to dataset files:", path2)
print("Path to dataset files:", path3)

Path to dataset files: C:\Users\User\.cache\kagglehub\datasets\kazanova\sentiment140\versions\2
Path to dataset files: C:\Users\User\.cache\kagglehub\datasets\yasserh\twitter-tweets-sentiment-dataset\versions\1
Path to dataset files: C:\Users\User\.cache\kagglehub\datasets\saurabhshahane\twitter-sentiment-dataset\versions\1


In [5]:
def download_and_move_datasets(dataset_ids, destination_directory):
    """
    Download datasets using KaggleHub, move them to the specified directory, 
    and rename the files by prefixing them with the dataset ID.

    :param dataset_ids: List of Kaggle dataset IDs to download.
    :param destination_directory: Directory where datasets will be moved.
    :return: A dictionary containing dataset IDs and their corresponding file paths.
    """
    dataset_files = {}
    for dataset_id in dataset_ids:
        # Download dataset
        cache_path = kagglehub.dataset_download(dataset_id)
        
        # Ensure the destination directory exists
        os.makedirs(destination_directory, exist_ok=True)
        
        # List files in the cache directory
        files = os.listdir(cache_path)
        
        # Move and rename each file to the destination directory
        moved_files = []
        for file in files:
            src = os.path.join(cache_path, file)
            # Prefix the file name with the dataset ID
            renamed_file = f"{dataset_id.replace('/', '_')}_{file}"
            dest = os.path.join(destination_directory, renamed_file)
            shutil.copy(src, dest)
            moved_files.append(dest)

        dataset_files[dataset_id] = moved_files
        print(f"Dataset '{dataset_id}' files moved and renamed to: {destination_directory}")
    
    return dataset_files

In [6]:
def inspect_datasets(dataset_files, headers_dict=None):
    """
    Inspect the datasets by loading them into pandas DataFrames and showing an overview.
    If headers are provided, the dataset is saved back after inspection.

    :param dataset_files: Dictionary containing dataset IDs and their file paths.
    :param headers_dict: Optional dictionary mapping dataset IDs to their headers.
                         If the value for a dataset ID is an empty list, the first line of the file is treated as headers.
                         Example: {"dataset_id": ["col1", "col2", ...], "dataset_with_no_headers": []}
    """
    headers_dict = headers_dict or {}  # Default to an empty dictionary if not provided

    for dataset_id, files in dataset_files.items():
        print(f"\n### Dataset: {dataset_id} ###")
        for file in files:
            try:
                print(f"\nFile: {file}")
                # Determine headers based on the dataset ID
                headers = headers_dict.get(dataset_id, None)
                use_infer_header = headers == []  # Infer headers if explicitly marked with an empty list

                # Attempt to load the file into a DataFrame
                if file.endswith(".csv"):
                    df = pd.read_csv(
                        file,
                        encoding="latin1",
                        low_memory=False,
                        header=0 if use_infer_header else None,
                        names=headers if headers else None
                    )
                elif file.endswith(".tsv"):
                    df = pd.read_csv(
                        file,
                        sep="\t",
                        encoding="latin1",
                        low_memory=False,
                        header=0 if use_infer_header else None,
                        names=headers if headers else None
                    )
                else:
                    print(f"Unsupported file format: {file}")
                    continue

                # Display basic information
                print("DataFrame Overview:")
                print(df.info())
                print("\nFirst 5 Rows:")
                print(df.head())

                # Save the dataset back if headers were explicitly provided
                if headers or use_infer_header:
                    save_path = file.replace(".csv", "_with_headers.csv") if file.endswith(".csv") else file.replace(".tsv", "_with_headers.tsv")
                    df.to_csv(save_path, index=False, sep="\t" if file.endswith(".tsv") else ",")
                    print(f"Dataset with headers saved to: {save_path}")

            except Exception as e:
                print(f"Error reading file '{file}': {e}")

In [7]:
# List of dataset IDs to download
dataset_ids = [
    "kazanova/sentiment140",
    "yasserh/twitter-tweets-sentiment-dataset",
    "saurabhshahane/twitter-sentiment-dataset"
]

# Define the destination directory
destination_directory = "../../data/raw"

# Define headers for datasets that are missing them
headers_dict = {
    "kazanova/sentiment140": ["target", "ids", "date", "flag", "user", "text"],
    "yasserh/twitter-tweets-sentiment-dataset": [],
    "saurabhshahane/twitter-sentiment-dataset": []
}

In [8]:
# Download and move datasets
dataset_files = download_and_move_datasets(dataset_ids, destination_directory)


Dataset 'kazanova/sentiment140' files moved and renamed to: ../../data/raw
Dataset 'yasserh/twitter-tweets-sentiment-dataset' files moved and renamed to: ../../data/raw
Dataset 'saurabhshahane/twitter-sentiment-dataset' files moved and renamed to: ../../data/raw


In [9]:
# Inspect the datasets with custom headers
inspect_datasets(dataset_files, headers_dict)



### Dataset: kazanova/sentiment140 ###

File: ../../data/raw\kazanova_sentiment140_training.1600000.processed.noemoticon.csv
DataFrame Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   ids     1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB
None

First 5 Rows:
   target         ids                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57

In [1]:
import os

def visualize_directory_structure(path, indent=0):
    """
    Recursively prints the directory structure starting from the given path.
    """
    for item in os.listdir(path):
        if item == ".git" or item == ".gitignore":
            continue
        item_path = os.path.join(path, item)
        if os.path.isdir(item_path):
            print("  " * indent + f"[Dir] {item}")
            visualize_directory_structure(item_path, indent + 1)
        else:
            print("  " * indent + f"[File] {item}")

# Get the current working directory
current_directory = os.getcwd()

print("Directory Structure of", current_directory)
visualize_directory_structure("E:\\2_LEARNING_BKU\\2_File_2\\K22_HK242\\CO3117_Machine_Learning\\Main")


Directory Structure of e:\2_LEARNING_BKU\2_File_2\K22_HK242\CO3117_Machine_Learning\Main\data\raw
[Dir] data
  [Dir] final
    [File] a.txt
    [File] final_clean.csv
    [File] final_clean_no_duplicates.csv
    [File] final_clean_no_neutral.csv
    [File] final_clean_no_neutral_no_duplicates.csv
  [Dir] processed
    [File] a.txt
    [File] df1_with_text_clean.csv
    [File] df2_with_text_clean.csv
    [File] df3_with_text_clean.csv
    [File] processed_data.csv
    [File] sample_with_text_clean.csv
  [Dir] raw
    [File] a.txt
    [File] data_download.ipynb
    [File] kazanova_sentiment140_training.1600000.processed.noemoticon.csv
    [File] kazanova_sentiment140_training.1600000.processed.noemoticon_with_headers.csv
    [File] saurabhshahane_twitter-sentiment-dataset_Twitter_Data.csv
    [File] saurabhshahane_twitter-sentiment-dataset_Twitter_Data_with_headers.csv
    [File] yasserh_twitter-tweets-sentiment-dataset_Tweets.csv
    [File] yasserh_twitter-tweets-sentiment-dataset_Tweet