In [6]:
import pandas as pd
import json
import os
import glob

def combine_json_files_to_dataframe():
    """
    Combines all gemini_output*.json files into a single pandas DataFrame
    """
    
    # Find all JSON files matching the pattern
    json_files = glob.glob("../preprocessing/data/gemini_output*.json")
    
    if not json_files:
        print("No JSON files found in data/ directory")
        return None
    
    print(f"Found {len(json_files)} JSON files")
    
    all_data = []
    
    for file_path in sorted(json_files):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # Extract the file number from filename (e.g., "gemini_output5.json" -> 5)
            file_number = int(file_path.split('gemini_output')[1].split('.json')[0])
            
            # Add file number to each record for tracking
            for item in data:
                item['source_file'] = file_number
                item['source_filename'] = os.path.basename(file_path)
            
            all_data.extend(data)
            print(f"Loaded {len(data)} records from {file_path}")
            
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON in {file_path}: {e}")
            continue
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue
    
    if not all_data:
        print("No valid data found in any JSON files")
        return None
    
    # Create DataFrame
    df = pd.DataFrame(all_data)
    
    print(f"\nCombined DataFrame created with {len(df)} rows and {len(df.columns)} columns")
    print(f"Columns: {list(df.columns)}")
    
    return df

In [None]:
df = combine_json_files_to_dataframe()

Found 131 JSON files
Loaded 15 records from ../preprocessing/data/gemini_output0.json
Loaded 17 records from ../preprocessing/data/gemini_output1.json
Loaded 17 records from ../preprocessing/data/gemini_output10.json
Loaded 26 records from ../preprocessing/data/gemini_output102.json
Loaded 15 records from ../preprocessing/data/gemini_output107.json
Loaded 15 records from ../preprocessing/data/gemini_output11.json
Loaded 15 records from ../preprocessing/data/gemini_output110.json
Loaded 16 records from ../preprocessing/data/gemini_output111.json
Loaded 20 records from ../preprocessing/data/gemini_output119.json
Loaded 22 records from ../preprocessing/data/gemini_output12.json
Loaded 15 records from ../preprocessing/data/gemini_output125.json
Loaded 15 records from ../preprocessing/data/gemini_output126.json
Loaded 20 records from ../preprocessing/data/gemini_output127.json
Loaded 20 records from ../preprocessing/data/gemini_output128.json
Loaded 20 records from ../preprocessing/data/gem

In [8]:
print(df)

                                               question  \
0     What should I do if a girl texts me an hour be...   
1     What's the immediate response when a girl text...   
2     How do I handle communication with a girl who ...   
3     What does it mean to 'drop her ass' or put her...   
4     What is the ultimate goal of putting a girl 'o...   
...                                                 ...   
2539  What if a girl says she won't have sex unless ...   
2540  What should I say if a girl tells me, 'I don't...   
2541  Why should I stop feeling guilty about giving ...   
2542  How should I behave with a quality woman I'm r...   
2543  Should I kiss a woman if I don't think she has...   

                                                context  \
0     A girl cancels a date last minute with a commo...   
1      She's just sent a text canceling an hour before.   
2     After the initial 'no reply' and putting her '...   
3     Understanding the dating coach's terminology f...

In [11]:
df.head()

Unnamed: 0,question,context,answer,text,source_file,source_filename
0,What should I do if a girl texts me an hour be...,A girl cancels a date last minute with a commo...,No reply. And you never text her again until s...,|begin_of_text|><|start_header_id|>system<|end...,0,gemini_output0.json
1,What's the immediate response when a girl text...,She's just sent a text canceling an hour before.,No reply.,|begin_of_text|><|start_header_id|>system<|end...,0,gemini_output0.json
2,How do I handle communication with a girl who ...,After the initial 'no reply' and putting her '...,You never text her again until she starts chas...,|begin_of_text|><|start_header_id|>system<|end...,0,gemini_output0.json
3,What does it mean to 'drop her ass' or put her...,Understanding the dating coach's terminology f...,It means you stop all communication and engage...,|begin_of_text|><|start_header_id|>system<|end...,0,gemini_output0.json
4,What is the ultimate goal of putting a girl 'o...,Implementing the 'no reply' and dropping her s...,The goal is for her to come back and start cha...,|begin_of_text|><|start_header_id|>system<|end...,0,gemini_output0.json


In [12]:
df.isnull().value_counts()

question  context  answer  text   source_file  source_filename
False     False    False   False  False        False              2527
          True     False   False  False        False                17
Name: count, dtype: int64

In [13]:
df_cleaned = df.dropna(subset=['context'])

In [14]:
df_cleaned.isnull().sum()

question           0
context            0
answer             0
text               0
source_file        0
source_filename    0
dtype: int64

In [17]:
df_cleaned.shape

(2527, 6)

In [18]:
df_cleaned.drop(columns=["source_file", "source_filename"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.drop(columns=["source_file", "source_filename"], inplace=True)


In [21]:
df_cleaned.shape

(2527, 4)

### Moving to Google colab for finetuning, now that we have the dataset

In [23]:
df_cleaned.to_csv("data/transcript_cleaned_output.csv", index=False)