In [None]:
import pandas as pd
import ast
# Do not truncate the column list when a wide frame is displayed.
pd.set_option('display.max_columns', None)

# Read the workbook once; `length` (its row count) is reused by later cells
# to build the positional 'row index' join key.
df = pd.read_excel("kolkata_cars.xlsx")
length = df.shape[0]
df.columns

New_Car_Details

In [None]:

# Pull the raw 'new_car_detail' cells out of the frame as a plain Python list.
data_list = df['new_car_detail'].tolist()

# Function to convert string to dictionary
def to_dict(data):
    """Coerce one cell value into a dictionary.

    Parameters
    ----------
    data : Any
        A dict (returned unchanged), a string containing a Python-literal
        dict, or any other value (for example NaN/None from an empty cell).

    Returns
    -------
    dict
        The parsed dictionary. ``{}`` is returned when the value is not a
        string or dict, when the string cannot be parsed, or when it parses
        to something other than a dict. Always returning a dict keeps a list
        of results safe to pass to ``pd.DataFrame``.
    """
    if isinstance(data, dict):
        return data

    if isinstance(data, str):
        try:
            parsed = ast.literal_eval(data)
        # TypeError covers literals such as "{[1]: 2}" (unhashable dict key).
        except (ValueError, SyntaxError, TypeError):
            return {}
        if isinstance(parsed, dict):
            return parsed

    # Non-string, non-dict input or a non-dict literal: treat as "no data".
    return {}

# Parse every raw cell, then let pandas expand the dictionaries into columns.
data_list = list(map(to_dict, data_list))

new_car_detail_flatened_data = pd.DataFrame(data_list)
# Positional key used later to join the per-column frames back together.
new_car_detail_flatened_data['row index'] = range(length)

# # Function to flatten a single dictionary
# def flatten_dict(data):
#     if not isinstance(data, dict):
#         return {}
    
#     if 'trendingText' in data:
#         # Flatten the nested dictionary
#         flat_data = {
#             **data,  # Unpack the main dictionary
#             **{f'trendingText_{k}': v for k, v in data['trendingText'].items()}  # Flatten the nested dictionary
#         }
#         del flat_data['trendingText']  # Remove the original nested dictionary
#     else:
#         # If 'trendingText' key is missing, just return the data as is
#         flat_data = data
#     return flat_data
# flattened_data_list = [flatten_dict(row) for row in data_list]

New_Car_Overview

In [None]:
import pandas as pd
import ast

# Convert each string to a dictionary
# Parse each 'new_car_overview' cell from its Python-literal text.
nested_dict_list = list(map(ast.literal_eval, df['new_car_overview']))

# Function to extract and flatten each dictionary
def extract_flatten_dict(nested_dict):
    """Turn the ``'top'`` entries of one overview record into a wide frame.

    Parameters
    ----------
    nested_dict : dict
        Parsed overview record. Its ``'top'`` value is loaded with
        ``pd.DataFrame`` and must yield a ``'key'`` column; an ``'icon'``
        column, if present, is discarded.
        (Presumably a list of ``{'key', 'value', 'icon'}`` dicts - confirm
        against the data.)

    Returns
    -------
    pandas.DataFrame
        The ``'top'`` table indexed by ``'key'`` and transposed, so each key
        becomes a column. When the record is not a dict, has no (or an empty)
        ``'top'``, or ``'top'`` has no ``'key'`` column, a placeholder with
        one row and no columns is returned. The placeholder still reports
        ``.empty`` as True, but unlike a 0x0 frame it is not dropped by
        ``pd.concat``, so the record keeps its row position in the result.
    """
    # One row, zero columns: becomes an all-NaN row after concatenation.
    placeholder = pd.DataFrame(index=[0])

    if not isinstance(nested_dict, dict):
        return placeholder

    top_data = nested_dict.get('top')
    if not top_data:
        return placeholder

    top_df = pd.DataFrame(top_data)
    if 'key' not in top_df.columns:
        return placeholder

    top_df = top_df.drop(columns=['icon'], errors='ignore')  # 'icon' is not kept
    return top_df.set_index('key').T

# Extract and flatten all dictionaries
# Build one transposed frame per parsed overview record.
df_list = list(map(extract_flatten_dict, nested_dict_list))

# Stack the per-record frames and renumber the rows from zero.
new_car_overview_df_combined = pd.concat(df_list, ignore_index=True)

# Positional join key; this assignment requires the stacked frame to have
# exactly `length` rows.
new_car_overview_df_combined['row index'] = range(length)


Car_Feature

In [None]:
def extract_features(data_dict):
    """Split one feature record into a top-features frame and a detail frame.

    Parameters
    ----------
    data_dict : dict
        Record whose optional ``'top'`` list holds items with a ``'value'``
        and whose optional ``'data'`` list holds categories, each with
        optional ``'heading'``, ``'subHeading'`` and ``'list'`` entries.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(df_top, df_detailed)``: the first has a single ``'Top Feature'``
        column; the second has one row per listed item with ``'Category'``,
        ``'SubCategory'`` and ``'Feature'`` (no columns when nothing is listed).
    """
    # Headline features: just the 'value' of every entry under 'top'.
    df_top = pd.DataFrame(
        [entry['value'] for entry in data_dict.get('top', [])],
        columns=['Top Feature'],
    )

    # Detailed features: one record per item, labelled with its category headings.
    detail_rows = [
        {
            'Category': group.get('heading', ''),
            'SubCategory': group.get('subHeading', ''),
            'Feature': entry['value'],
        }
        for group in data_dict.get('data', [])
        for entry in group.get('list', [])
    ]
    df_detailed = pd.DataFrame(detail_rows)

    return df_top, df_detailed

# Work on a single-column frame holding only the raw feature text.
df_ = pd.DataFrame(df, columns=['new_car_feature'])

top_features_list, detailed_features_list = [], []

# Parse each row's feature text; rows that fail to parse are reported and skipped.
for index, row in df_.iterrows():
    try:
        data_dict = ast.literal_eval(row['new_car_feature'])
    except (ValueError, SyntaxError):
        print(f"Error parsing row {index}")
    else:
        df_top_features, df_detailed_features = extract_features(data_dict)

        # Tag both frames with the originating row so they can be regrouped later.
        for frame in (df_top_features, df_detailed_features):
            frame['Row Index'] = index

        top_features_list.append(df_top_features)
        detailed_features_list.append(df_detailed_features)

# Stack the per-row frames into two long tables.
df_top_combined = pd.concat(top_features_list, ignore_index=True)
df_detailed_combined = pd.concat(detailed_features_list, ignore_index=True)

# Collapse each row's top features into one comma-separated string.
df_top_pivot = (
    df_top_combined
    .groupby('Row Index')['Top Feature']
    .apply(lambda values: ', '.join(values))
    .reset_index()
)

# Rename so the key matches the 'row index' column used by the other frames.
df_top_pivot.columns = ['row index', 'Top Features']


New_Car_Specs

In [None]:
# Pull the raw 'new_car_specs' cells out of the frame as a plain Python list.
car_spec_list = df['new_car_specs'].tolist()

# Define a function to process each row and extract key-value pairs
def extract_key_values(data):
    """Flatten the ``'data'`` sections of one specs record into a flat dict.

    Parameters
    ----------
    data : str or dict
        A Python-literal string describing a dict (or an already-parsed
        dict). Its optional ``'data'`` list holds sections, each with an
        optional ``'list'`` of ``{'key': ..., 'value': ...}`` items.

    Returns
    -------
    dict
        ``{key: value}`` over every item of every section; later duplicates
        of a key overwrite earlier ones. ``{}`` is returned when the input is
        missing (e.g. NaN), cannot be parsed, or is not a dict, so one bad
        cell does not abort a whole ``Series.apply``.

    Raises
    ------
    KeyError
        If a listed item lacks ``'key'`` or ``'value'``.
    """
    if isinstance(data, dict):
        nested_dict = data
    elif isinstance(data, str):
        try:
            nested_dict = ast.literal_eval(data)
        # TypeError covers literals with unhashable dict keys.
        except (ValueError, SyntaxError, TypeError):
            return {}
    else:
        return {}

    if not isinstance(nested_dict, dict):
        return {}

    key_value_dict = {}
    # `or []` also tolerates an explicit None under 'data' / 'list'.
    for section in nested_dict.get('data') or []:
        for kv in section.get('list') or []:
            key_value_dict[kv['key']] = kv['value']
    return key_value_dict

# Build the specs working frame under its own name instead of rebinding `df`,
# so the raw data loaded at the top stays available and earlier cells can be
# re-run after this one.
car_specs_df = pd.DataFrame({
    'new_car_specs': car_spec_list
})

# One flat {spec name: value} dictionary per row.
car_specs_df['key_value_dict'] = car_specs_df['new_car_specs'].apply(extract_key_values)

# Expand the dictionaries into one column per spec name.
key_value_df = pd.json_normalize(car_specs_df['key_value_dict'])
# Positional join key shared with the other per-column frames.
key_value_df['row index'] = range(length)


Merge all extracted column data into one DataFrame

In [None]:
# Start from the detail frame and outer-join each remaining frame on the
# shared 'row index' key, in the same order as before.
merged_df = new_car_detail_flatened_data
for frame in (new_car_overview_df_combined, df_top_pivot, key_value_df):
    merged_df = merged_df.merge(frame, on='row index', how='outer')


Normalise the column names

In [None]:
# Lower-case every column name and replace spaces with underscores.
merged_df.columns = merged_df.columns.str.lower().str.replace(" ", "_")

Save to CSV

In [None]:
# Tag every row with the constant city value, then write the frame
# (index included) to a CSV file.
merged_df = merged_df.assign(city="kolkata")
merged_df.to_csv('kolkata.csv')