In [1]:
from IPython.display import display
import logging
from sklearn.model_selection import train_test_split
from transformers import logging as transformers_logging

# Set transformers logging level early
transformers_logging.set_verbosity_warning()
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
transformers_logger.propagate = False

class DisplayHandler(logging.Handler):
    def emit(self, record):
        display(self.format(record))

# Configure logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Add custom display handler
display_handler = DisplayHandler()
display_handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s:%(message)s'))
logger.addHandler(display_handler)


In [2]:
!pip install hydra-core omegaconf wandb


import wandb

Collecting hydra-core
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting omegaconf
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting antlr4-python3-runtime==4.9.* (from hydra-core)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading omegaconf-2.3.0-py3-none-any.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25ldone
[?25h  Created wheel for antlr4-python3-ru

In [3]:
# import hydra
# from omegaconf import DictConfig

# #@hydra.main(config_name = "config", config_path="/kaggle/input/config")
# def my_app(cfg: DictConfig):
#     repo_id = cfg.huggingface.repo_id
#     hf_api_token = cfg.huggingface.api_token
#     wandb_api_token = cfg.wandb.api_token
#     wandb_project_name = cfg.wandb.project_name
#     wandb_entity = cfg.wandb.entity
#     return repo_id, hf_api_token, wandb_api_token, wandb_project_name, wandb_entity


In [4]:
from huggingface_hub import login, snapshot_download, HfApi
import hydra
from dotenv import load_dotenv
from omegaconf import OmegaConf

In [11]:
from abc import ABC, abstractmethod
import os
import pandas as pd
import re
import json, random
import logging
from structured_configs import CornellDatasetConfig, MovieQADatasetConfig
# Configure logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)
# Configure logging

# logging.basicConfig(filename='example.log', 
#                     encoding='utf-8', 
#                     level=logging.DEBUG,
#                     format="%(asctime)s:%(levelname)s:%(message)s"
#                    )

# # Create a logger object
# logger = logging.getLogger()


class BaseDataLoader(ABC):
    """
    Abstract base class for data loaders.
    """
    
    def __init__(self):
        pass
    
    @abstractmethod
    def load_data(self):
        """
        Load data from the specified source.
        """
        pass

    @abstractmethod
    def preprocess_data(self):
        """
        Preprocess the loaded data.
        """
        pass
    
    @abstractmethod
    def train_val_test_split(self):
        pass
    
    @abstractmethod
    def merge_dataframes(self):
        pass

    @abstractmethod
    def convert_to_json(self):
        """
        Convert the processed data to JSON format.
        """
        pass

    @abstractmethod
    def save_data(self):
        """
        Save the JSON data to a file.
        """
        pass

    @staticmethod
    def concat_json_files(file_path1, file_path2):
        try:
            # Open and read the first JSON file
            with open(file_path1, "r") as file1:
                data1 = json.load(file1)
            logging.info(f"Successfully loaded data from {file_path1}")

            # Open and read the second JSON file
            with open(file_path2, "r") as file2:
                data2 = json.load(file2)
            logging.info(f"Successfully loaded data from {file_path2}")

            # Concatenate the data from both files
            data = data1 + data2
            logging.info(f"Successfully concatenated data from {file_path1} and {file_path2}")
            full_output_path = os.path.join(".",cfg.hf.combined_dataset.file_name)
            with open(full_output_path, 'w', encoding='utf-8') as file:
                    json.dump(data, file, ensure_ascii=False, indent=4)
            if cfg.hf.combined_dataset.to_hf:
                api.upload_file(
                path_or_fileobj=cfg.hf.combined_dataset.file_name,
                repo_id= cfg.hf.repo_id,
                path_in_repo = f"{cfg.hf.combined_dataset.path_in_repo}/{cfg.hf.combined_dataset.file_name}",
                repo_type="dataset",
                commit_message=cfg.hf.combined_dataset.commit_message,
                commit_description=cfg.hf.combined_dataset.commit_description
                    )
                logger.info(f"File {cfg.hf.combined_dataset.file_name}  logged to Huggingface successfully.")
                if cfg.wandb.combined_dataset.to_wandb:
                    artifact = wandb.Artifact(name=cfg.wandb.combined_dataset.json_artifact_name, 
                                          description= cfg.wandb.combined_dataset.description, type='dataset')  # Name and type for the artifact
                    artifact.add_file(full_output_path)  # Add the saved JSON file to the artifact
                    #wandb.log_artifact(artifact)  # Log the artifact to WandB
                    artifact.save()
                    logger.info(f"File {cfg.hf.combined_dataset.file_name}  logged to WandB successfully.")
        except FileNotFoundError as e:
            logging.error(f"File not found: {e}")
            raise

        except json.JSONDecodeError as e:
            logging.error(f"Error decoding JSON: {e}")
            raise

        except Exception as e:
            logging.error(f"An unexpected error occurred: {e}")
            raise
    
class CornellDataLoader(BaseDataLoader):
    """
    Concrete data loader for Cornell movie datasets.
    """

    def __init__(self, config):
        """
        Initialize the CornellDataLoader.

        :param folder_path: Path to the folder containing data files.
        :param data_path_list: List of data file paths.
        :param column_names_list: List of column names for each data file.
        :param output_path: Path to save the output JSON file.
        :param output_path: Path to save the output JSON file.
        :param sample_fraction: Fraction of new samples to be generated.
        :param train_ratio: Proportion of the data used for training, default is 0.9 (90%).
        :param test_ratio: Proportion of the data used for testing, default is 0.05 (5%).
        :param generate_new_questions: Boolean flag to indicate whether new questions should be generated.
        
        """
        self.config = config
        self.movie_dataset_df = pd.DataFrame()
        self.movie_dataset = []
        self.metadata_questions = pd.DataFrame()
        
    def load_data(self):
        """
        Load dataset files into pandas DataFrames.

        :return: List of pandas DataFrames.
        """
        try:
            movie_df_list = [self._read_csv_file(file_key) for file_key in self.config.files ]
            logger.info("Data loaded successfully.")
            return movie_df_list
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise

    def _read_csv_file(self, file_kay):
        """
        Read a single CSV file into a pandas DataFrame.

        :return: pandas DataFrame.
        """
        try:
            file_config = self.config.files[file_kay]
            path = os.path.join(self.config.folder_path, file_config['file_name'])
            column_names = file_config['columns']
            df = pd.read_csv(path, engine='python', sep = file_config['sep'], 
                             names = column_names, encoding=file_config['encoding'])
            logger.debug(f"Loaded data from {path} with columns {column_names}.")
            return df
        except FileNotFoundError:
            logger.error(f"File not found: {path}")
            raise
        except pd.errors.ParserError as e:
            logger.error(f"Parser error while reading {path}: {e}")
            raise
            
    def preprocess_data(self, df_list):
        """
        Preprocesses the given dataframes by cleaning and formatting specific columns.

        """
        try:
            # Unpack dataframes from the input list
            movie_conversation_df, movie_utterances_df, movie_metadata_df, characters_df, imdb_details_df = df_list

            # Title case the 'character_name' column in both utterances and characters dataframes
            movie_utterances_df['character_name'] = movie_utterances_df['character_name'].str.title()
            characters_df['character_name'] = characters_df['character_name'].str.title()

            # Clean 'release_year' by removing any non-digit characters and converting to integer
            movie_metadata_df['release_year'] = movie_metadata_df['release_year'].str.replace(r'\D', '', regex=True)
            movie_metadata_df['release_year'] = movie_metadata_df['release_year'].astype(int)

            # Extract genres from the 'genre' column and join multiple genres with commas
            movie_metadata_df['genre'] = movie_metadata_df['genre'].str.extractall(r"'(.*?)'")[0].groupby(level=0).apply(', '.join)

            # Round 'imdb_rating' to one decimal place and ensure the type is float
            movie_metadata_df['imdb_rating'] = movie_metadata_df['imdb_rating'].map(lambda r: round(r, 1)).astype(float)

            # Parse the 'line_id_list' column into actual lists using a helper method
            movie_conversation_df['line_id_list'] = movie_conversation_df['line_id_list'].apply(self._parse_list_string)
            
            
            imdb_details_df = imdb_details_df[['movie_name', 'plot_outline']]
            print("preprocess imdb_details_df", imdb_details_df.columns)
            logger.info("Data preprocessing completed successfully.")

            return movie_conversation_df, movie_utterances_df, movie_metadata_df, characters_df, imdb_details_df

        except Exception as e:
            logger.error(f"Error during data preprocessing: {str(e)}")
            raise  # Re-raise the exception after logging
    
    def _row_to_json(self, row):
        """
        Convert a DataFrame row to a JSON-like dictionary.

        :param row: pandas Series representing a row in the DataFrame.
        :return: Dictionary representing the JSON object.
        """
        

        json_obj = {
            "split": row.get('split', 'train'),
            "type":"dialogue",
            "instruction": row.get('instruction', 'Continue the conversation between the characters.'),
            "input": row.get('utterance', ''),
            "context": {
                "movie_name": row.get('movie_name', 'Unknown'),
                "character_names": row.get('character_names', 'Unknown'),
                "genre": row.get('genre', ''),
                "year": row.get('release_year', 'Unknown'),
                "imdb_rating": row.get('imdb_rating', 0),
                "num_imdb_votes": row.get('num_imdb_votes', 0),
                'plot_outline': row.get('plot_outline', 'Unknown'),
                "additional_information": None
            },
            "response": row.get('response', 'Unknown')
        }

        return json_obj

    def convert_to_json(self):
        """
        Convert the list of DataFrames into a JSON-compatible list of dictionaries.

        :param df_list: List of pandas DataFrames.
        """
        
        try:
            
            # Convert the main dataset to JSON using the existing method
            self.movie_dataset = self.movie_dataset_df.apply(self._row_to_json, axis=1).tolist()
            
        
            logger.info("Data converted to JSON format successfully.")
        except Exception as e:
            logger.error(f"Error converting data to JSON: {e}")
            raise
    def _generate_samples(self, row):
        """
        Generate new samples by modifying the question and response of a given row.

        Args:
            row (namedtuple): A row from the DataFrame with movie details and dialogue.

        Returns:
            pd.DataFrame: A DataFrame with the modified question-answer samples.
        """
        # Define question templates
        genre_questions = [
            'What genre is the movie {movie_name}?',
            'Which genres does {movie_name} belong to?',
            'Can you tell me the genre of {movie_name}?',
            'What are the main genres of {movie_name}?',
            'Under which genres is {movie_name} classified?',
            'What type of film is {movie_name}?',
            'What genres categorize {movie_name}?'
        ]

        release_year_questions = [
            'In what year was {movie_name} released?',
            'When did {movie_name} come out?',
            'What is the release year of {movie_name}?',
            'Which year did {movie_name} premiere?',
            'When was {movie_name} first released?',
            'What year did {movie_name} hit theaters?',
            'Can you tell me the release year of {movie_name}?'
        ]

        imdb_rating_questions = [
            'What is the IMDb rating of {movie_name}?',
            'How is {movie_name} rated on IMDb?',
            'Can you tell me the IMDb score for {movie_name}?',
            'What rating did {movie_name} receive on IMDb?',
            'What is the IMDb rating for {movie_name}?'
        ]

        imdb_votes_questions = [
            'How many votes does {movie_name} have on IMDb?',
            'What is the total number of IMDb votes for {movie_name}?',
            'How many people rated {movie_name} on IMDb?',
            'What is the number of votes for {movie_name} on IMDb?'
        ]
        # List to collect new samples
        new_samples = []

        # Extract values from the row
        movie_name = getattr(row, 'movie_name', 'Unknown')
        genre = getattr(row, 'genre', 'Unknown')
        year = getattr(row, 'release_year', 'Unknown')
        imdb_rating = getattr(row, 'imdb_rating', 'Unknown')
        imdb_votes = getattr(row, 'num_imdb_votes', 'Unknown')
        
        # Helper function to create a new modified row
        def add_modified_row(new_question, new_response):
            # Convert row back to a dictionary and modify question and response
            new_row = row._asdict()  # Convert namedtuple row to a dictionary
            new_row['utterance'] = new_question
            new_row['response'] = new_response
            new_row['instruction'] = 'Answer the following question:'
            new_samples.append(new_row)  # Add the modified row to the samples list

        # Modify question and response based on attributes and add the new row to the samples list
        if genre != 'Unknown':
            new_question = random.choice(genre_questions).format(movie_name=movie_name)
            add_modified_row(new_question, genre)

        if year != 'Unknown':
            new_question = random.choice(release_year_questions).format(movie_name=movie_name)
            add_modified_row(new_question, year)

        if imdb_rating != 'Unknown':
            new_question = random.choice(imdb_rating_questions).format(movie_name=movie_name)
            add_modified_row(new_question, imdb_rating)

        if imdb_votes != 'Unknown':
            new_question = random.choice(imdb_votes_questions).format(movie_name=movie_name)
            add_modified_row(new_question, imdb_votes)

        return new_samples

    def _parse_list_string(self, value):
        """
        Parse a string representation of a list into an actual list.

        :param value: String to parse.
        :return: List of extracted strings.
        """
        clean_string = re.findall(r"'(.*?)'", value)

        if clean_string:
            return clean_string
        else:
            logger.warning(f"Could not parse: {value}")
            return []

    def merge_dataframes(self, df_list):
        """
        Merge multiple DataFrames into a single DataFrame for processing.

        :param df_list: List of pandas DataFrames.
        :return: Merged pandas DataFrame.
        """
        try:
            if len(df_list) != 5:
                raise ValueError("df_list must contain exactly four DataFrames.")

            movie_conversation_df, movie_utterances_df, movie_metadata_df, characters_df, imdb_details_df = df_list

            # Concatenate character name with utterance
            movie_utterances_df['utterance'] = movie_utterances_df['character_name'].str.cat(
                movie_utterances_df['utterance'], sep=': '
            )
            
            # Merge conversation and metadata DataFrames on 'movie_id'
            movie_conversation_metadata_df = pd.merge(
                left=movie_conversation_df,
                right=movie_metadata_df,
                on='movie_id',
                how='inner'
            )
            
            # Merge with character details
            movie_conversation_metadata_df = pd.merge(
                left=movie_conversation_metadata_df,
                right=characters_df,
                left_on='character_id1',
                right_on='character_id',
                how='inner'
            )
            
            
            # Drop redundant columns
            movie_conversation_metadata_df.drop(
                columns=['character_id1', 'movie_id_x', 'movie_id_y', 'character_id', 'gender', 'position'],
                inplace=True
            )
            
            # Map character IDs to names
            characters_dict = characters_df.set_index('character_id')['character_name'].to_dict()
            movie_conversation_metadata_df['character_name2'] = movie_conversation_metadata_df['character_id2'].map(characters_dict)

            # Rename columns for clarity
            movie_conversation_metadata_df.rename(
                columns={'character_name': 'character_name1', 'movie_name_x': 'movie_name'},
                inplace=True
            )
            movie_conversation_metadata_df.drop(columns=['character_id2'], inplace=True)
           

            # Save the original DataFrame index for grouping
            movie_conversation_metadata_df['original_line_index'] = movie_conversation_metadata_df.index

            # Explode the 'line_id_list' to have one row per line ID
            movie_conversation_metadata_df = movie_conversation_metadata_df.explode('line_id_list')

            # Merge with utterances DataFrame using 'line_id_list' as the key
            movie_conversation_metadata_df = pd.merge(
                left=movie_conversation_metadata_df,
                right=movie_utterances_df,
                left_on='line_id_list',
                right_index=True,
                how='inner'
            )

            # Concatenate character names
            movie_conversation_metadata_df['character_names'] = (
                movie_conversation_metadata_df['character_name1'] + ', ' +
                movie_conversation_metadata_df['character_name2']
            )
            # Create 'response' column as a copy of 'utterance'
            movie_conversation_metadata_df['response'] = movie_conversation_metadata_df['utterance'].copy(deep=True)
            
            # Aggregate the data
            movie_dataset_df = movie_conversation_metadata_df.groupby('original_line_index').agg({
                'character_names': 'first',
                'movie_name': 'first',
                'release_year': 'first',
                'imdb_rating': 'mean',
                'num_imdb_votes': 'first',
                'genre': 'first',
                'utterance': lambda u: '\n '.join(map(str, u[:-1])) if len(u) > 1 else u.iloc[0],
                'response': 'last'
            }).reset_index().drop(columns=['original_line_index'])
            
            # print("in merge_dataframe imdb_details_df", imdb_details_df.columns, imdb_details_df.size)
            # print("imdb_details_df", imdb_details_df.head(5))
            movie_dataset_df = pd.merge(left = movie_dataset_df, right = imdb_details_df, on = 'movie_name', how='left')
            # print("movie_dataset_df", movie_dataset_df.head(5), movie_dataset_df.size)
            # Remove duplicate rows based on all columns
            movie_dataset_df = movie_dataset_df.drop_duplicates()
            # Remove rows where 'utterance' or 'response' are NaN
            self.movie_dataset_df = movie_dataset_df.dropna(subset=['utterance', 'response'])
            self.movie_dataset_df.loc[:, 'instruction'] = 'Continue the conversation between the characters.'
            logger.debug("DataFrames merged successfully.")
            # Generate new samples if flagged
            if self.config.generate_new_questions:
                # Sample a fraction of the data
                sampled_df = self.movie_dataset_df.groupby('movie_name').agg({'character_names':'first','movie_name':'first','genre':'first', 'release_year':'first','imdb_rating':'mean', 'num_imdb_votes':'first'}).sample(frac=self.config.sample_fraction, random_state=42).reset_index(drop=True)
                logger.info(f"Sampled {len(sampled_df)} data successfully.")

                # Generate new samples from the sampled DataFrame
                #new_sample_list = sampled_df.apply(self._generate_json_sample, axis=1).tolist()
                new_sample_list = [sample for row in sampled_df.itertuples(index=False) 
                                            for sample in self._generate_samples(row)]
                new_sample_df = pd.DataFrame(new_sample_list)
                
              
            self.movie_dataset_df = pd.concat([self.movie_dataset_df, new_sample_df], axis = 0).reset_index(drop = True)
            self.movie_dataset_df['response'] = self.movie_dataset_df['response'].astype(str)    
            if self.config.data_prune_enabled:
                self._data_pruning()
            self._save_df()
            logger.debug(f"Total number of rows found in movie_dataset_df: {len(self.movie_dataset_df)}")
            return self.movie_dataset_df
        except Exception as e:
            logger.error(f"Error merging DataFrames: {e}")
            raise
    def _data_pruning(self):

        logger.info(f"Cornell dataset size before pruning: {self.movie_dataset_df.shape[0]} rows")
        filtered_movies = self.movie_dataset_df.groupby('movie_name').size()
        filtered_movies = filtered_movies[filtered_movies > self.config.frequent_sample_ratio].index

        # Filter the main dataset to include only those movies
        self.movie_dataset_df = self.movie_dataset_df[self.movie_dataset_df['movie_name'].isin(filtered_movies)].reset_index(drop = True)
        logger.info(f"Cornell dataset size after pruning: {self.movie_dataset_df.shape[0]} rows")
        
    def train_val_test_split(self):
        # Filtering movies that appear more than 50 times
        # filtered_movies = self.movie_dataset_df.groupby('movie_name').size()
        # filtered_movies = filtered_movies[filtered_movies > 50].index

        # Filter the main dataset to include only those movies
        # filtered_dataset = self.movie_dataset_df[self.movie_dataset_df['movie_name'].isin(filtered_movies)]

        # Calculating sample sizes for training, validation, and test
        total_records = len(self.movie_dataset_df)
        train_size = int(total_records * self.config.train_ratio)  # 90% training
        test_size = int(total_records * self.config.test_ratio)    # 0.05% test
        val_size = total_records - train_size - test_size  # remaining for validation
        
        # Split the data into train, validation, and test
        train_data, temp_data = train_test_split(
            self.movie_dataset_df, 
            train_size=train_size, 
            stratify=self.movie_dataset_df['movie_name'], 
            random_state=42
        )

        try:
             test_data,val_data = train_test_split(
                temp_data, 
                train_size=test_size, 
                stratify=temp_data['movie_name'], 
                random_state=42
            )
        except ValueError as e:
            logger.warning(f"Stratified split failed: {e}. Falling back to non-stratified split.")
            test_data,val_data = train_test_split(
                temp_data, 
                train_size=test_size, 
                random_state=42
            )
        # Assign the split column
        train_data['split'] = 'train'
        val_data['split'] = 'val'
        test_data['split'] = 'test'

        # Combine the data back into a single dataset
        self.movie_dataset_df = pd.concat([train_data, val_data, test_data], ignore_index=True)

        # Logging the size of each dataset
        logger.info(f"Training set size: {len(train_data)}")
        logger.info(f"Validation set size: {len(val_data)}")
        logger.info(f"Test set size: {len(test_data)}")
        
        return self.movie_dataset_df

    def save_data(self):
        """
        Save the converted JSON data to the specified output file.
        """
        try:
            full_output_path  = os.path.join(self.config.folder_path, self.config.output_path)
            with open(full_output_path , 'w', encoding='utf-8') as f:
                json.dump(self.movie_dataset, f, ensure_ascii=False, indent=4)
            logger.info(f"Data saved to {full_output_path }")
            if cfg.wandb.cornell_dataset.to_wandb:
                artifact = wandb.Artifact(name=cfg.wandb.cornell_dataset.json_artifact_name,
                                          description = cfg.wandb.cornell_dataset.description,type='dataset')  # Name and type for the artifact
                artifact.add_file(full_output_path)  # Add the saved JSON file to the artifact
    #             wandb.log_artifact(artifact)  # Log the artifact to WandB
                artifact.save()
                logger.info(f"JSON file '{self.config.output_path}' uploaded to WandB successfully.")
            if cfg.hf.cornell_dataset.to_hf:
                 api.upload_file(
                    path_or_fileobj=full_output_path,
                    repo_id=cfg.hf.repo_id,
                    path_in_repo=f"{cfg.hf.cornell_dataset.path_in_repo}{cfg.hf.cornell_dataset.file_name}",
                    repo_type="dataset")
                 logger.info(f"File {self.config.output_path}  logged to Huggingface successfully.")
            return full_output_path
        except Exception as e:
            logger.error(f"Error saving data: {e}")
            raise
    
    def _save_df(self):
        """
        Save the converted JSON data to the specified output file.
        """
        try:
            output_path = os.path.join(self.config.folder_path,'movie_dataset.csv')
            # Save the DataFrame to a CSV file
            self.movie_dataset_df.to_csv(output_path, index=False)
            logger.info(f"DataFrame saved as CSV file at '{output_path}'.")
            if cfg.wandb.cornell_dataset.to_wandb:
                # Create a WandB artifact
                artifact = wandb.Artifact(name=cfg.wandb.cornell_dataset.csv_artifact_name, type= 'dataset',
                                          description = cfg.wandb.cornell_dataset.description,
                                         metadata = {'size':len(self.movie_dataset_df),
                                                    'columns': self.movie_dataset_df.columns.to_list()
    })
    
                # Add the saved CSV file to the artifact
                artifact.add_file(output_path)
                logger.info(f"CSV file '{output_path}' added to WandB artifact cornell_movie_df.")
    
                # Save the artifact
                wandb.log_artifact(artifact)
                logger.info(f"Artifact cornell_movie_df logged to WandB successfully.")

        except Exception as e:
            logger.error(f"Error saving DataFrame or uploading to WandB: {e}")

In [6]:
class MovieQADataLoader(BaseDataLoader):
    def __init__(self, config):
        self.config = config
        self.qa_dataset = []
        self.qa_movie_metadata_df = pd.DataFrame()
        
    def load_data(self):
        """
        Load Q&A dataset.
        """
        try:
            qa_df_list = []
            for file,file_config in self.config.files.items():
                path = os.path.join(self.config.folder_path,file_config['file_name'])
                if path.endswith('.json'):
                    qa_df_list.append(self._read_json_file(path))
                else:
                    qa_df_list.append(self._read_csv_file(file_config))
            
            logger.info("Data loaded successfully.")
            return qa_df_list
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise
    def _read_json_file(self, data_path):
        try:
            qa_df = pd.read_json(data_path)
            logger.info("Q&A JSON file loaded successfully.")
            return qa_df
        except Exception as e:
            logger.error(f"Error loading Q&A JSON file: {e}")
            raise
    def _read_csv_file(self, file):
        try:
            path = os.path.join(self.config.folder_path, file['file_name'])
            qa_df = pd.read_csv(path, usecols=file['columns'], 
                                sep = file['sep'], encoding=file['encoding'])
            logger.info("Q&A CSV file loaded successfully.")
            return qa_df
        except Exception as e:
            logger.error(f"Error loading Q&A JSON file: {e}")
            raise
    def preprocess_data(self,df_list):
        qa_df, movie_df, imdb_details_df = df_list
        movie_df = movie_df[['genre', 'imdb_key', 'name', 'year']]
        qa_df = qa_df[qa_df['correct_index'].notna()]
        qa_df['response'] = qa_df.apply(lambda row: row['answers'][int(row['correct_index'])], axis=1)
        qa_df = qa_df[['question','imdb_key' ,'response']]
        return qa_df, movie_df, imdb_details_df
    def merge_dataframes(self, df_list): 
        qa_df, movie_df, imdb_details_df = df_list
        self.qa_movie_metadata_df = pd.merge(left = qa_df, right = movie_df, on = 'imdb_key', how='left')
        self.qa_movie_metadata_df = pd.merge(left = self.qa_movie_metadata_df , right = imdb_details_df, left_on = 'name', right_on = 'title', how= 'left').drop(columns=['title']).rename(columns={'name':'movie_name'})
        self.qa_movie_metadata_df.drop(columns=['imdb_key'], inplace = True)
        self._save_df()
        return self.qa_movie_metadata_df
    def convert_to_json(self):
        self.qa_dataset = self.qa_movie_metadata_df.apply(self._row_to_json, axis=1).tolist()
        return self.qa_dataset
    def _row_to_json(self, row):
        return {
            "split": row.get('split', 'train'),
            "type":"qa",
            "instruction": "Answer the following question:",
            "input": row['question'],
            "context": {
                "movie_name": row.get('movie_name', 'Unknown'),
                "genre": row.get('genre', 'Unknown'),
                "year": row.get('year', 'Unknown'),
                "plot_outline": row.get('plot_outline', 'Unknown'),
                "additional_information": None
            },
            "response": row['response']
        }
    def train_val_test_split(self):
        # Filtering movies that appear more than 50 times
        filtered_qa_movies = self.qa_movie_metadata_df.groupby('movie_name').size()
        filtered_qa_movies = filtered_qa_movies[filtered_qa_movies > 50].index

        # Filter the main dataset to include only those movies
        filtered_dataset = self.qa_movie_metadata_df[self.qa_movie_metadata_df['movie_name'].isin(filtered_qa_movies)]

        # Calculating sample sizes for training, validation, and test
        total_records = len(filtered_dataset)
        train_size = int(total_records * self.config.train_ratio)  # 90% training
        test_size = int(total_records * self.config.test_ratio)    # 0.05% test
        val_size = total_records - train_size - test_size  # remaining for validation

        # Split the data into train, validation, and test
        train_data, temp_data = train_test_split(
            filtered_dataset, 
            train_size=train_size, 
            stratify=filtered_dataset['movie_name'], 
            random_state=42
        )

        try:
             test_data,val_data = train_test_split(
                temp_data, 
                train_size=test_size, 
                stratify=temp_data['movie_name'], 
                random_state=42
            )
        except ValueError as e:
            logger.warning(f"Stratified split failed: {e}. Falling back to non-stratified split.")
            test_data,val_data = train_test_split(
                temp_data, 
                train_size=test_size, 
                random_state=42
            )

        # Assign the split column
        train_data['split'] = 'train'
        val_data['split'] = 'val'
        test_data['split'] = 'test'

        # Combine the data back into a single dataset
        self.qa_movie_metadata_df = pd.concat([train_data, val_data, test_data], ignore_index=True)

        # Logging the size of each dataset
        logger.info(f"Training set size: {len(train_data)}")
        logger.info(f"Validation set size: {len(val_data)}")
        logger.info(f"Test set size: {len(test_data)}")
        return self.qa_movie_metadata_df
    
    def save_data(self):
         try:
            full_output_path = os.path.join(self.config.folder_path ,self.config.output_path)#os.path.join(self.folder_path,self.output_path)
            with open(full_output_path, 'w', encoding='utf-8') as f:
                json.dump(self.qa_dataset, f, ensure_ascii=False, indent=4)
            logger.info(f"Q&A Movie dataset saved to {self.config.output_path}")
            if cfg.wandb.movieqa_dataset.to_wandb:
                artifact = wandb.Artifact(name=cfg.wandb.movieqa_dataset.json_artifact_name, 
                                          description= cfg.wandb.movieqa_dataset.description, type='dataset')  # Name and type for the artifact
                artifact.add_file(full_output_path)  # Add the saved JSON file to the artifact
                #wandb.log_artifact(artifact)  # Log the artifact to WandB
                artifact.save()
            if cfg.hf.movieqa_dataset.to_hf:
                api.upload_file(
                    path_or_fileobj=full_output_path,
                    repo_id= cfg.hf.repo_id,
                    path_in_repo = cfg.hf.movieqa_dataset.path_in_repo,
                    repo_type="dataset",
                    )
                logger.info(f"Artifact {self.config.output_path}  logged to Huggingface successfully.")
            return full_output_path
         except Exception as e:
            logger.error(f"Error saving data: {e}")
            raise
    def _save_df(self):
        """
        Save the converted JSON data to the specified output file.
        """
        try:
            output_path = os.path.join(self.config.folder_path ,'qa_movie_df.csv')
            # Save the DataFrame to a CSV file
            self.qa_movie_metadata_df.to_csv(output_path, index=False)
            logger.info(f"DataFrame saved as CSV file at '{output_path}'.")
            
            if cfg.wandb.movieqa_dataset.to_wandb:
                # Create a WandB artifact
                artifact = wandb.Artifact(name = cfg.wandb.movieqa_dataset.csv_artifact_name, type = 'dataset',
                                          description= cfg.wandb.movieqa_dataset.description,
                                        metadata = {'size':len(self.qa_movie_metadata_df),
                                                    'columns':self.qa_movie_metadata_df.columns.to_list()},
                                         )
    
                # Add the saved CSV file to the artifact
                artifact.add_file(output_path)
                logger.info(f"CSV file '{output_path}' added to WandB artifact qa_movie_df.")
    
                # Save the artifact
                artifact.save()
                logger.info(f"Artifact qa_movie_df  logged to WandB successfully.")

        except Exception as e:
            logger.error(f"Error saving DataFrame or uploading to WandB: {e}")

In [7]:
from hydra.core.global_hydra import GlobalHydra
class DataLoaderFactory:
    """
    Factory class to create DataLoader instances.
    """
    @staticmethod
    def get_data_loader(dataset_type,config, **kwargs):
        if dataset_type == 'cornell':
            return CornellDataLoader(config, **kwargs)
        elif dataset_type == 'movieqa':
            return MovieQADataLoader(config, **kwargs)
        else:
            raise ValueError(f"Unknown dataset type: {dataset_type}")

# Example Usage for Both Cornell and MovieQA DataLoader
if __name__ == "__main__":
    # Check if Hydra is already initialized and clear it if necessary
    if GlobalHydra.instance().is_initialized():
        GlobalHydra.instance().clear()
    # Load .env file   
    load_dotenv('/kaggle/input/config/.env')
    
    # Manually initialize Hydra in Jupyter
    hydra.initialize(config_path="../input/config", version_base="1.1")  # If you're using a specific path, set config_path=<path>

    # Manually compose the config from the configuration file
    cfg = hydra.compose(config_name="config")  
    
    wandb.login(key = cfg.wandb.api_token)
    run = wandb.init(project= cfg.wandb.project_name, entity=cfg.wandb.entity)
    login(cfg.hf.api_token)
    api = HfApi()
    # Download the entire repository (including folders)
    local_dir = snapshot_download(repo_id = cfg.hf.repo_id, repo_type="dataset", allow_patterns= cfg.hf.allow_patterns, local_dir = '/kaggle/working/')

    logger.info(f"Downloaded repository to {local_dir}")
    # Define dataset type (choose 'cornell' or 'movieqa')
    dataset_type = 'cornell'  # Change to 'movieqa' for MovieQA dataset

    if dataset_type == 'cornell':
        # Parameters for Cornell Dataset
        cornell_config_dict = OmegaConf.to_object(cfg.cornell_dataset)
        cornell_config = CornellDatasetConfig(**cornell_config_dict)
    
#         column_names_list = [
#             cfg.cornell_dataset.columns.movie_conversations,
#             cfg.cornell_dataset.columns.movie_lines,
#             cfg.cornell_dataset.columns.movie_titles_metadata,
#             cfg.cornell_dataset.columns.movie_characters_metadata,
#             cfg.cornell_dataset.columns.imdb_movie_detailed
#         ]
      
#         folder_path = "/kaggle/working/cornell"
#         #!wget https://huggingface.co/datasets/niloufarna/MovieChat/resolve/main/movie_detailed.csv
        
#         data_path_list = ['movie_conversations.txt', 'movie_lines.txt', 'movie_titles_metadata.txt', 'movie_characters_metadata.txt', 'movie_detailed.csv']
#         column_names_list = [
#             ['character_id1' ,'character_id2', 'movie_id','line_id_list'],
#             ['character_id1','movie_id','character_name','utterance'],
#             ['movie_id', 'movie_name', 'release_year', 'imdb_rating', 'num_imdb_votes', 'genre'],
#             ['character_id','character_name','movie_id','movie_name','gender','position'],
#             ["title", "year", "kind", "cover_url", "original_title", "localized_title", 
#             "genres", "runtimes", "countries", "language_codes", "rating", "votes", 
#             "imdbID", "plot_outline", "languages", "director", "writer", "cast", "box_office", "plot", "synopsis"]
#         ]
#         output_path = "cornell_movie_data.json"
        
        # Initialize DataLoader via factory
        loader = DataLoaderFactory.get_data_loader(
            dataset_type='cornell',
            config = cornell_config
#             folder_path = cfg.cornell_dataset.paths.folder_path,
#             data_path_list = list(cfg.cornell_dataset.paths.data_path_list),
#             column_names_list=column_names_list,
#             output_path = cfg.cornell_dataset.paths.output_path,
#             sample_fraction=cfg.cornell_dataset.sample_fraction,
#             train_ratio=cfg.cornell_dataset.train_ratio, 
#             test_ratio = cfg.cornell_dataset.test_ratio,
#             generate_new_questions = cfg.cornell_dataset.generate_new_questions
        )

    elif dataset_type == 'movieqa':
        
        # Parameters for MovieQA Dataset
        movie_qa_config_dict = OmegaConf.to_object(cfg.movieqa_dataset)
        movieqa_config = MovieQADatasetConfig(**movieqa_config_dict)
        #columns_list = [cfg.movieqa_dataset.columns.imdb_movie_details_movieqa]
        
#         folder_path = "/kaggle/working/movieqa"
#         data_path_list = ["qa.json", "movies.json", "imdb_movie_details_movieqa.csv"]
#         output_path = "movieqa_movie_data.json"
#         columns_list = [['movie_name', 'plot_outline']]
        # Initialize DataLoader via factory
        loader = DataLoaderFactory.get_data_loader(
            dataset_type='movieqa',
            config = movieqa_config
#             folder_path = cfg.movieqa_dataset.paths.folder_path,
#             data_path_list = list(cfg.movieqa_dataset.paths.data_path_list),
#             output_path = cfg.movieqa_dataset.paths.output_path,
#             columns_list = columns_list,
#             train_ratio = cfg.movieqa_dataset.train_ratio, 
#             test_ratio = cfg.movieqa_dataset.test_ratio
        )

    # Load data
    data_frames = loader.load_data()

    # Preprocess data (if any preprocessing is implemented)
    data_frames = loader.preprocess_data(data_frames)
    
    loader.merge_dataframes(data_frames)
    
    loader.train_val_test_split()
    
    # Convert to JSON
    loader.convert_to_json()

    # Save JSON data
    file_path = loader.save_data()
    dataset_type = 'movieqa'  # Change to 'movieqa' for MovieQA dataset

    if dataset_type == 'cornell':
        # Parameters for Cornell Dataset
        cornell_config_dict = OmegaConf.to_object(cfg.cornell_dataset)
        cornell_config = CornellDatasetConfig(**cornell_config_dict)
    
#         column_names_list = [
#             cfg.cornell_dataset.columns.movie_conversations,
#             cfg.cornell_dataset.columns.movie_lines,
#             cfg.cornell_dataset.columns.movie_titles_metadata,
#             cfg.cornell_dataset.columns.movie_characters_metadata,
#             cfg.cornell_dataset.columns.imdb_movie_detailed
#         ]
#         folder_path = "/kaggle/working/cornell"
#         #!wget https://huggingface.co/datasets/niloufarna/MovieChat/resolve/main/movie_detailed.csv
        
#         data_path_list = ['movie_conversations.txt', 'movie_lines.txt', 'movie_titles_metadata.txt', 'movie_characters_metadata.txt', 'movie_detailed.csv']
#         column_names_list = [
#             ['character_id1' ,'character_id2', 'movie_id','line_id_list'],
#             ['character_id1','movie_id','character_name','utterance'],
#             ['movie_id', 'movie_name', 'release_year', 'imdb_rating', 'num_imdb_votes', 'genre'],
#             ['character_id','character_name','movie_id','movie_name','gender','position'],
#             ["title", "year", "kind", "cover_url", "original_title", "localized_title", 
#             "genres", "runtimes", "countries", "language_codes", "rating", "votes", 
#             "imdbID", "plot_outline", "languages", "director", "writer", "cast", "box_office", "plot", "synopsis"]
#         ]
#         output_path = "cornell_movie_data.json"
        
        # Initialize DataLoader via factory
        loader = DataLoaderFactory.get_data_loader(
            dataset_type='cornell',
            config = cornell_config
#             folder_path = cfg.cornell_dataset.paths.folder_path,
#             data_path_list = list(cfg.cornell_dataset.paths.data_path_list),
#             column_names_list=column_names_list,
#             output_path = cfg.cornell_dataset.paths.output_path,
#             sample_fraction=cfg.cornell_dataset.sample_fraction,
#             train_ratio=cfg.cornell_dataset.train_ratio, 
#             test_ratio = cfg.cornell_dataset.test_ratio,
#             generate_new_questions = cfg.cornell_dataset.generate_new_questions
        )

    elif dataset_type == 'movieqa':
        movieqa_config_dict = OmegaConf.to_object(cfg.movieqa_dataset)
        movieqa_config = MovieQADatasetConfig(**movieqa_config_dict)
        
        # Initialize DataLoader via factory
        loader = DataLoaderFactory.get_data_loader(
            dataset_type='movieqa',
            config = movieqa_config
#             folder_path = cfg.movieqa_dataset.paths.folder_path,
#             data_path_list = list(cfg.movieqa_dataset.paths.data_path_list),
#             output_path = cfg.movieqa_dataset.paths.output_path,
#             columns_list = columns_list,
#             train_ratio = cfg.movieqa_dataset.train_ratio, 
#             test_ratio = cfg.movieqa_dataset.test_ratio
        )

    # Load data
    data_frames = loader.load_data()

    # Preprocess data (if any preprocessing is implemented)
    data_frames = loader.preprocess_data(data_frames)

    loader.merge_dataframes(data_frames)

    loader.train_val_test_split()

    # Convert to JSON
    loader.convert_to_json()

    # Save JSON data
    file_path2 = loader.save_data()

    combined_data = BaseDataLoader.concat_json_files(file_path, file_path2)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mniloufarcolab6[0m ([33mniloufarcolab6-n[0m). Use [1m`wandb login --relogin`[0m to force relogin


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

movieqa/movies.json:   0%|          | 0.00/123k [00:00<?, ?B/s]

movie_lines.txt:   0%|          | 0.00/34.6M [00:00<?, ?B/s]

cornell/movie_conversations.txt:   0%|          | 0.00/6.76M [00:00<?, ?B/s]

cornell/imdb_cornell_movie_dataset.csv:   0%|          | 0.00/6.22M [00:00<?, ?B/s]

movieqa/qa.json:   0%|          | 0.00/8.53M [00:00<?, ?B/s]

cornell/movie_titles_metadata.txt:   0%|          | 0.00/67.3k [00:00<?, ?B/s]

cornell/movie_characters_metadata.txt:   0%|          | 0.00/706k [00:00<?, ?B/s]

movieqa/imdb_movie_details_movieqa.csv:   0%|          | 0.00/5.05M [00:00<?, ?B/s]

'2024-10-16 21:38:19,671:INFO:Downloaded repository to /kaggle/working'

'2024-10-16 21:38:22,498:INFO:Data loaded successfully.'

preprocess imdb_details_df Index(['movie_name', 'plot_outline'], dtype='object')


'2024-10-16 21:38:23,009:INFO:Data preprocessing completed successfully.'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.movie_dataset_df.loc[:, 'instruction'] = 'Continue the conversation between the characters.'


'2024-10-16 21:38:30,667:INFO:Sampled 123 data successfully.'

'2024-10-16 21:38:30,731:INFO:Cornell dataset size before pruning: 83532 rows'

'2024-10-16 21:38:30,801:INFO:Cornell dataset size after pruning: 82231 rows'

"2024-10-16 21:38:35,312:INFO:DataFrame saved as CSV file at './cornell/movie_dataset.csv'."

"2024-10-16 21:38:35,953:INFO:CSV file './cornell/movie_dataset.csv' added to WandB artifact cornell_movie_df."

'2024-10-16 21:38:36,104:INFO:Artifact cornell_movie_df logged to WandB successfully.'



'2024-10-16 21:38:36,390:INFO:Training set size: 57561'

'2024-10-16 21:38:36,392:INFO:Validation set size: 83'

'2024-10-16 21:38:36,394:INFO:Test set size: 24587'

'2024-10-16 21:38:42,496:INFO:Data converted to JSON format successfully.'

'2024-10-16 21:38:47,290:INFO:Data saved to ./cornell/cornell_movie_data.json'

"2024-10-16 21:38:48,038:INFO:JSON file 'cornell_movie_data.json' uploaded to WandB successfully."

cornell_movie_data.json:   0%|          | 0.00/119M [00:00<?, ?B/s]

'2024-10-16 21:38:54,694:INFO:File cornell_movie_data.json  logged to Huggingface successfully.'

'2024-10-16 21:38:55,177:INFO:Q&A JSON file loaded successfully.'

'2024-10-16 21:38:55,187:INFO:Q&A JSON file loaded successfully.'

'2024-10-16 21:38:55,217:INFO:Q&A CSV file loaded successfully.'

'2024-10-16 21:38:55,220:INFO:Data loaded successfully.'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_df['response'] = qa_df.apply(lambda row: row['answers'][int(row['correct_index'])], axis=1)


"2024-10-16 21:38:56,039:INFO:DataFrame saved as CSV file at './movieqa/qa_movie_df.csv'."

"2024-10-16 21:38:56,096:INFO:CSV file './movieqa/qa_movie_df.csv' added to WandB artifact qa_movie_df."

'2024-10-16 21:38:56,255:INFO:Artifact qa_movie_df  logged to WandB successfully.'



'2024-10-16 21:38:56,283:INFO:Training set size: 3684'

'2024-10-16 21:38:56,285:INFO:Validation set size: 7'

'2024-10-16 21:38:56,288:INFO:Test set size: 1573'

'2024-10-16 21:38:56,781:INFO:Q&A Movie dataset saved to movieqa_movie_data.json'

'2024-10-16 21:38:59,857:INFO:Artifact movieqa_movie_data.json  logged to Huggingface successfully.'

'2024-10-16 21:39:00,918:INFO:Successfully loaded data from ./cornell/cornell_movie_data.json'

'2024-10-16 21:39:00,966:INFO:Successfully loaded data from ./movieqa/movieqa_movie_data.json'

'2024-10-16 21:39:00,969:INFO:Successfully concatenated data from ./cornell/cornell_movie_data.json and ./movieqa/movieqa_movie_data.json'

combined_movie_dataset.json:   0%|          | 0.00/125M [00:00<?, ?B/s]

'2024-10-16 21:39:09,995:ERROR:An unexpected error occurred: Bad request for commit endpoint:\n[31m------------------------------------------------------------------------- Unexpected internal error hook: check-file-count. (Request ID: Root=1-6710327d-23253f3d6ae1d09b2820e0a1;b10bb3ee-f60a-47e1-8292-56ad82082ec8) ------------------------------------------------------------------------- [0m\n\x1b[31m-------------------------------------------------------------------------\nUnexpected internal error hook: check-file-count. (Request ID: Root=1-6710327d-23253f3d6ae1d09b2820e0a1;b10bb3ee-f60a-47e1-8292-56ad82082ec8)\n-------------------------------------------------------------------------\x1b[0m'

BadRequestError: Bad request for commit endpoint:
[31m------------------------------------------------------------------------- Unexpected internal error hook: check-file-count. (Request ID: Root=1-6710327d-23253f3d6ae1d09b2820e0a1;b10bb3ee-f60a-47e1-8292-56ad82082ec8) ------------------------------------------------------------------------- [0m
[31m-------------------------------------------------------------------------
Unexpected internal error hook: check-file-count. (Request ID: Root=1-6710327d-23253f3d6ae1d09b2820e0a1;b10bb3ee-f60a-47e1-8292-56ad82082ec8)
-------------------------------------------------------------------------[0m

In [8]:
# with open("/kaggle/working/cornell_movie_data.json", "r") as file1:
#     data1 = json.load(file1)

In [12]:
combined_data = BaseDataLoader.concat_json_files(file_path, file_path2)

'2024-10-16 21:50:17,412:INFO:Successfully loaded data from ./cornell/cornell_movie_data.json'

'2024-10-16 21:50:17,470:INFO:Successfully loaded data from ./movieqa/movieqa_movie_data.json'

'2024-10-16 21:50:17,473:INFO:Successfully concatenated data from ./cornell/cornell_movie_data.json and ./movieqa/movieqa_movie_data.json'

'2024-10-16 21:50:23,296:INFO:File combined_movie_dataset.json  logged to Huggingface successfully.'

'2024-10-16 21:50:24,145:INFO:File combined_movie_dataset.json  logged to WandB successfully.'

In [8]:
# with open("/kaggle/working/movieqa_movie_data.json", "r") as file2:
#     data2 = json.load(file2)

In [None]:
# combined_data = data1 + data2

In [None]:
with open('/kaggle/working/movie_data.json', 'w', encoding='utf-8') as output_file:
    json.dump(combined_data, output_file, ensure_ascii=False, indent=4)

In [None]:
len(combined_data)

In [None]:
import pandas as pd

# Load the JSON file into pandas for inspection
file_path = '/kaggle/working/cornell_movie_data.json'  # Replace with the path to your JSON file

# Loading the JSON file into a pandas DataFrame
df = pd.read_json(file_path)

# Display the first few rows of the DataFrame to inspect the structure
print(df.head())

# Display column types to check for consistency
print(df.dtypes)

# Check for mixed types or inconsistent data in specific columns (like context or nested fields)
for col in df.columns:
    print(f"Checking column: {col}")
    print(df[col].apply(type).value_counts())


In [None]:
from huggingface_hub import snapshot_download

# Download the entire repository (including folders)
local_dir = snapshot_download(repo_id="niloufarna/MovieChat", repo_type="dataset", allow_patterns=["cornell/*","imdb/*", "movieqa/*"], local_dir = '/kaggle/working/')

print(f"Downloaded repository to {local_dir}")


In [10]:
pd.read_csv('/kaggle/working/movie_dataset.csv')

Unnamed: 0,character_names,movie_name,release_year,imdb_rating,num_imdb_votes,genre,utterance,response,plot_outline,instruction
0,"Bianca, Cameron",10 things i hate about you,1999,6.9,62847,"comedy, romance",Bianca: Can we make this quick? Roxanne Korri...,Cameron: Okay... then how 'bout we try out som...,,Continue the conversation between the characters.
1,"Bianca, Cameron",10 things i hate about you,1999,6.9,62847,"comedy, romance",Bianca: You're asking me out. That's so cute....,Cameron: Forget it.,,Continue the conversation between the characters.
2,"Bianca, Cameron",10 things i hate about you,1999,6.9,62847,"comedy, romance","Bianca: No, no, it's my fault -- we didn't hav...",Cameron: Seems like she could get a date easy ...,,Continue the conversation between the characters.
3,"Bianca, Cameron",10 things i hate about you,1999,6.9,62847,"comedy, romance",Cameron: Why?\n Bianca: Unsolved mystery. She...,Cameron: That's a shame.,,Continue the conversation between the characters.
4,"Bianca, Cameron",10 things i hate about you,1999,6.9,62847,"comedy, romance","Bianca: Gosh, if only we could find Kat a boyf...",Cameron: Let me see what I can do.,,Continue the conversation between the characters.
...,...,...,...,...,...,...,...,...,...,...
83651,"Adam, Billy",mighty morphin power rangers,1994,8.2,35,"action, family",How many votes does mighty morphin power range...,35,,Answer the following question:
83652,"Camille, Monica",love & basketball,2000,6.7,5907,"drama, romance, sport",What type of film is love & basketball?,"drama, romance, sport",,Answer the following question:
83653,"Camille, Monica",love & basketball,2000,6.7,5907,"drama, romance, sport",In what year was love & basketball released?,2000,,Answer the following question:
83654,"Camille, Monica",love & basketball,2000,6.7,5907,"drama, romance, sport",What rating did love & basketball receive on I...,6.7,,Answer the following question:


In [9]:
import os
for dirpath, dirname, filenames in os.walk('/kaggle/working'):
    print(dirpath)
    print(dirname)
    print(filenames)
    print("$$$$$$$$$$$$$$")

/kaggle/working
['imdb', 'movieqa', 'wandb', '.cache', '.virtual_documents', 'cornell']
['movie_dataset.csv']
$$$$$$$$$$$$$$
/kaggle/working/imdb
[]
['imdb_movie_details_cornell.csv', 'imdb_movie_details_movieqa.csv']
$$$$$$$$$$$$$$
/kaggle/working/movieqa
[]
['qa.json', 'imdb_movie_details_movieqa.csv', 'movies.json']
$$$$$$$$$$$$$$
/kaggle/working/wandb
['run-20241003_225825-af981qko', 'latest-run']
['debug.log', 'debug-internal.log']
$$$$$$$$$$$$$$
/kaggle/working/wandb/run-20241003_225825-af981qko
['tmp', 'logs', 'files']
['run-af981qko.wandb']
$$$$$$$$$$$$$$
/kaggle/working/wandb/run-20241003_225825-af981qko/tmp
['code']
[]
$$$$$$$$$$$$$$
/kaggle/working/wandb/run-20241003_225825-af981qko/tmp/code
[]
[]
$$$$$$$$$$$$$$
/kaggle/working/wandb/run-20241003_225825-af981qko/logs
[]
['debug.log', 'debug-core.log', 'debug-internal.log']
$$$$$$$$$$$$$$
/kaggle/working/wandb/run-20241003_225825-af981qko/files
[]
['output.log', 'wandb-metadata.json', 'requirements.txt']
$$$$$$$$$$$$$$
/kaggl