In [13]:
import numpy as np
import pandas as pd

def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """ Receive dataframe. remove Nan's, remove columns that are usless """
    #Too many features with NaN's, remove columns which have more then 15000 Nan'n in column
    filtered_columns = df.columns[df.isna().sum() < 15000]
    filtered_df = df[filtered_columns]

    #remove some rows with a lot of Nan's
    test_filtered_df = filtered_df.dropna(subset='Acceleration')

    #columns that are usless.
    column_to_remove = ['Assistance', 'Aspiration', 'Rear tires', 'Spare tire', 'Front tires',
                    'Urban autonomy', 'Generation', 'Front suspension', 'Rear suspension', 'Coupling', 'Valve command',
                     'Disposition', 'Cylinders', 'Elastic element', 'Ipva R', 'Frontal area A', 'Engine code',
                    'Traction', 'Installation', 'Road autonomy', 'Engine power supply', 'Engine control activation',
                    'Gear change code', 'Corrected frontal area', 'Platform']

    # remove usless columns
    new_filtered_df = test_filtered_df.drop(columns=column_to_remove)
    #return clean dataframe
    return new_filtered_df


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    ''' Receive dataframe. Transform values to numbers '''

    # columns that will be transformed same: Nan = 0 / standart equipment, optinal equipment = 1
    columns_to_transform = ['Hot air', 'Rev counter', 'Assisted direction', 'ABS brakes', 'Rear window', 'Central locking of the doors',
                        'Headrest for all occupants', 'Electric rearview mirror adjustment', 'Air conditioning',
                        'Bluetooth connection', 'Frontal Airbags', 'Steering wheel adjustment height',
                        'Electric front window control', 'Multifunctional steering wheel', 'Driver s seat with height adjustment',
                        'On board computer', 'Light in the trunk', 'Alloy wheels', 'USB connection', 'Radio',
                        'Folding rear seat', 'Perimeter anti theft alarm', 'Cooling liquid thermometer']
    df[columns_to_transform] = df[columns_to_transform].applymap(lambda x: 0 if pd.isnull(x) else 1)

    ########################################Preprocess [Acceleration]#################################################################
    # Preprocess Acceleration (0100 km/h 3,8 s = 3.8)
    df['Acceleration'] = df['Acceleration'].str.extract(r'(\d+\,\d+)')
    # Replace ',' with '.' and convert to numeric
    df['Acceleration'] = df['Acceleration'].str.replace(',', '.').astype(float)
    # Rename Column
    df = df.rename(columns={'Acceleration': 'Acceleration 0100 km/h in S'})

    ##########################################Preproces values with mm/kg/cm/ ETC ####################################################
    #function to remove strings (mm/kg/cm/) and transform to floats
    def extract_float_value(value):
        try:
            if isinstance(value, float):
                return value
            else:
                float_value = value.split()[0].replace(',', '.')
                return float(float_value)
        except (ValueError, IndexError):
            return np.nan

    #transforming
    for column in df[['Weight/Torque', 'Weight', 'Weight/power', 'Max power regime.', 'Cylinder diameter',
                      'Fuel tank', 'Specific power', 'Maximum power', 'Length', 'Maximum torque', 'Width', 'Height',
                      'Specific torque', 'Minimum height from the ground', 'Piston course', 'Front gauge', 'Displacement',
                      'Turns diameter', 'Rear gauge', 'Length between the axis', 'Maximum speed', 'Road consumption',
                      'Max torque regime', 'Car payload', 'Sidewall height', 'Unit displacement', 'Trunk', 'Urban']]:
        df[column] = df[column].apply(extract_float_value)
    #################################################################################################################################


    return df


In [2]:
df = pd.read_csv('/Users/bididudy/code/marcnaweb/car_recommendation_engine/raw_data/categorized_car_features.csv')

  df = pd.read_csv('/Users/bididudy/code/marcnaweb/car_recommendation_engine/raw_data/categorized_car_features.csv')


In [3]:
df=clean_data(df)
df

Unnamed: 0,Carcode,Weight/Torque,Weight,Fuel,Weight/power,Max power regime.,Cylinder diameter,Fuel tank,Steering wheel adjustment height,Specific power,...,Sidewall height,Provenance,On board computer,Light in the trunk,Settings,Unit displacement,Trunk,USB connection,Urban,Radio
0,1,"24,9 kg/kgfm",1650 kg,Gasoline,"2,85 kg/cv",7500 rpm,87 mm,100 litros,,"93,5 cv/litro",...,101 mm,Imported,,,Coupe,516 cm^3,50 litros,,"2,4 km/l",
1,2,"97,1 kg/kgfm",825 kg,Gasoline,"15,00 kg/cv",5500 rpm,70 mm,50 litros,,"55,1 cv/litro",...,116 mm,National,,,Hatchback,250 cm^3,290 litros,,10 km/l,
2,3,"67,6 kg/kgfm",1170 kg,Gasoline,"10,09 kg/cv",5200 rpm,86 mm,52 litros,,"58,1 cv/litro",...,113 mm,National,,,Hatchback,500 cm^3,370 litros,,"7,2 km/l",
4,5,"59,9 kg/kgfm",1282 kg,Gasoline,"8,55 kg/cv",5700 rpm,81 mm,55 litros,,"84,2 cv/litro",...,123 mm,National,,,Hatchback,445 cm^3,330 litros,,"6,8 km/l",
5,6,"71,9 kg/kgfm",1200 kg,Gasoline,"9,09 kg/cv",6500 rpm,82 mm,50 litros,,"75,6 cv/litro",...,107 mm,National,,,Hatchback,437 cm^3,370 litros,,9 km/l,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23881,23884,"63,4 kg/kgfm",1110 kg,Flex alcohol/gasoline,"9,25 kg/cv",6000 rpm,71 mm,50 litros,,"120,2 cv/litro",...,107 mm,National,Standard equipment,,Hatchback,333 cm^3,300 litros,Standard equipment,"8,3 km/l A",Standard equipment
23882,23885,"63,4 kg/kgfm",1110 kg,Flex alcohol/gasoline,"9,25 kg/cv",6000 rpm,71 mm,50 litros,Standard equipment,"120,2 cv/litro",...,107 mm,National,Standard equipment,,Hatchback,333 cm^3,300 litros,Standard equipment,"8,3 km/l A",Standard equipment
23890,23893,"63,4 kg/kgfm",1110 kg,Flex alcohol/gasoline,"9,25 kg/cv",6000 rpm,71 mm,50 litros,Standard equipment,"120,2 cv/litro",...,107 mm,National,Standard equipment,Standard equipment,Hatchback,333 cm^3,300 litros,Standard equipment,9 km/l A,Standard equipment
23902,23905,"63,2 kg/kgfm",2192 kg,Diesel,"15,66 kg/cv",3750 rpm,85 mm,90 litros,,"64,2 cv/litro",...,169 mm,Imported,Standard equipment,Standard equipment,Van,545 cm^3,11500 litros,Standard equipment,10 km/l,Optional equipment


In [4]:
df=preprocess(df)

  df[columns_to_transform] = df[columns_to_transform].applymap(lambda x: 0 if pd.isnull(x) else 1)


In [5]:
def process_guarantee_column(df):
    new_df = df.copy()

    # Remove 'year' or 'years' from 'Guarantee' column values
    new_df['Guarantee'] = new_df['Guarantee'].str.replace(r'\s*years?\s*', '', regex=True)

    # Rename the column to 'Guarantee in years'
    new_df = new_df.rename(columns={'Guarantee': 'Guarantee in years'})

    return new_df

In [6]:
df=process_guarantee_column(df)

In [7]:
import os

# Define the file path to the 'Downloads' directory
downloads_directory = os.path.expanduser('~/Downloads')

# Specify the full file path for saving the CSV file
csv_file_path = os.path.join(downloads_directory, 'df_7.csv')

# Save the DataFrame to the CSV file
df.to_csv(csv_file_path, index=False)

In [8]:
df_object_columns = df.select_dtypes(include=['object'])
df_object_columns

Unnamed: 0,Fuel,Car size,Compression ratio,Tuching,Guarantee in years,Drag coefficient,Propulsion,Car gearbox,Price R,Devaluation,CNW Index,Reader score,Provenance,Settings
0,Gasoline,Grande,1071,,1,033,Combustion,Manual,811.249,25,693,85,Imported,Coupe
1,Gasoline,Compact and portable,951,Mechanical,1,036,Combustion,Manual,11.577,827,6445,88,National,Hatchback
2,Gasoline,Medium,981,Hydro,1,03,Combustion,Manual,16.585,294,7795,84,National,Hatchback
4,Gasoline,Medium,951,Hydro,1,031,Combustion,Manual,33.527,1502,5423,92,National,Hatchback
5,Gasoline,Medium,1031,Hydro,1,,Combustion,Manual,12.087,827,11147,74,National,Hatchback
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23881,Flex alcohol/gasoline,Compact and portable,1051,Mechanical,5,,Combustion,Automatic,102.090,,97396,70,National,Hatchback
23882,Flex alcohol/gasoline,Compact and portable,1051,Mechanical,5,,Combustion,Automatic,107.390,,99345,70,National,Hatchback
23890,Flex alcohol/gasoline,Compact and portable,1051,Mechanical,5,,Combustion,Automatic,117.090,,"1.001,37",70,National,Hatchback
23902,Diesel,Grande,161,,1,,Combustion,Manual,245.990,,29395,70,Imported,Van


## Converting string to float

In [9]:
def convert_columns_to_float(df, columns):
    """
    Replace commas with dots in specified columns and convert them to float.

    Parameters:
    - df: pandas.DataFrame.
    - columns: List of column names to be converted.

    Returns:
    - DataFrame with the specified columns converted to float.
    """
    for col in columns:
        # Check if the column exists in the DataFrame
        if col in df.columns:
            # Replace commas with dots and convert to float, coercing errors to NaN
            df[col] = pd.to_numeric(df[col].astype(str).str.replace(',', '.'), errors='coerce')
        else:
            print(f"Column '{col}' not found in DataFrame.")
    return df


In [10]:
columns_to_convert = ['Carcode','Guarantee in years','Drag coefficient', 'Price R', 'Devaluation', 'CNW Index', 'Reader score','Compression ratio']
cleaned_data = convert_columns_to_float(df=df, columns=columns_to_convert)

In [11]:
cleaned_data.columns

Index(['Carcode', 'Weight/Torque', 'Weight', 'Fuel', 'Weight/power',
       'Max power regime.', 'Cylinder diameter', 'Fuel tank',
       'Steering wheel adjustment height', 'Specific power',
       'Multifunctional steering wheel',
       'Driver s seat with height adjustment', 'Alloy wheels', 'Maximum power',
       'Car size', 'Hot air', 'Rear window',
       'Electric rearview mirror adjustment', 'Air conditioning',
       'Frontal Airbags', 'Electric front window control', 'Folding rear seat',
       'ABS brakes', 'Central locking of the doors',
       'Perimeter anti theft alarm', 'Bluetooth connection',
       'Acceleration 0100 km/h in S', 'Length', 'Maximum torque', 'Width',
       'Rev counter', 'Compression ratio', 'Assisted direction', 'Tuching',
       'Guarantee in years', 'Drag coefficient', 'Height', 'Specific torque',
       'Minimum height from the ground', 'Propulsion', 'Piston course',
       'Car gearbox', 'Price R', 'Headrest for all occupants', 'Front gauge',
   

In [12]:
df_object_columns = cleaned_data.select_dtypes(include=['object'])
df_object_columns

Unnamed: 0,Fuel,Car size,Tuching,Propulsion,Car gearbox,Provenance,Settings
0,Gasoline,Grande,,Combustion,Manual,Imported,Coupe
1,Gasoline,Compact and portable,Mechanical,Combustion,Manual,National,Hatchback
2,Gasoline,Medium,Hydro,Combustion,Manual,National,Hatchback
4,Gasoline,Medium,Hydro,Combustion,Manual,National,Hatchback
5,Gasoline,Medium,Hydro,Combustion,Manual,National,Hatchback
...,...,...,...,...,...,...,...
23881,Flex alcohol/gasoline,Compact and portable,Mechanical,Combustion,Automatic,National,Hatchback
23882,Flex alcohol/gasoline,Compact and portable,Mechanical,Combustion,Automatic,National,Hatchback
23890,Flex alcohol/gasoline,Compact and portable,Mechanical,Combustion,Automatic,National,Hatchback
23902,Diesel,Grande,,Combustion,Manual,Imported,Van


In [13]:
# # Define the file path to the 'Downloads' directory
# downloads_directory = os.path.expanduser('~/Downloads')

# # Specify the full file path for saving the CSV file
# csv_file_path = os.path.join(downloads_directory, 'df_8.csv')

# # Save the DataFrame to the CSV file
# df_object_columns.to_csv(csv_file_path, index=False)

## we will impute Unknown in Nan values in categorical data

In [14]:
def impute_missing_with_unknown(df, columns):
    """
    Impute missing values with 'Unknown' in specified columns.

    Parameters:
    - df: pandas.DataFrame.
    - columns: List of column names where missing values should be imputed.

    Returns:
    - DataFrame with missing values in the specified columns replaced with 'Unknown'.
    """
    for col in columns:
        df[col] = df[col].fillna('Unknown')
    return df

In [15]:
columns=[['Tuching', 'Fuel']]

In [16]:
cleaned_data=impute_missing_with_unknown(cleaned_data, columns)
cleaned_data


Unnamed: 0,Carcode,Weight/Torque,Weight,Fuel,Weight/power,Max power regime.,Cylinder diameter,Fuel tank,Steering wheel adjustment height,Specific power,...,Sidewall height,Provenance,On board computer,Light in the trunk,Settings,Unit displacement,Trunk,USB connection,Urban,Radio
0,1,24.9,1650.0,Gasoline,2.85,7500.0,87.0,100.0,0,93.5,...,101.0,Imported,0,0,Coupe,516.0,50.0,0,2.4,0
1,2,97.1,825.0,Gasoline,15.00,5500.0,70.0,50.0,0,55.1,...,116.0,National,0,0,Hatchback,250.0,290.0,0,10.0,0
2,3,67.6,1170.0,Gasoline,10.09,5200.0,86.0,52.0,0,58.1,...,113.0,National,0,0,Hatchback,500.0,370.0,0,7.2,0
4,5,59.9,1282.0,Gasoline,8.55,5700.0,81.0,55.0,0,84.2,...,123.0,National,0,0,Hatchback,445.0,330.0,0,6.8,0
5,6,71.9,1200.0,Gasoline,9.09,6500.0,82.0,50.0,0,75.6,...,107.0,National,0,0,Hatchback,437.0,370.0,0,9.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23881,23884,63.4,1110.0,Flex alcohol/gasoline,9.25,6000.0,71.0,50.0,0,120.2,...,107.0,National,1,0,Hatchback,333.0,300.0,1,8.3,1
23882,23885,63.4,1110.0,Flex alcohol/gasoline,9.25,6000.0,71.0,50.0,1,120.2,...,107.0,National,1,0,Hatchback,333.0,300.0,1,8.3,1
23890,23893,63.4,1110.0,Flex alcohol/gasoline,9.25,6000.0,71.0,50.0,1,120.2,...,107.0,National,1,1,Hatchback,333.0,300.0,1,9.0,1
23902,23905,63.2,2192.0,Diesel,15.66,3750.0,85.0,90.0,0,64.2,...,169.0,Imported,1,1,Van,545.0,11500.0,1,10.0,1


In [17]:
nan_percentage_per_column = cleaned_data.isnull().mean() * 100

# Create a new DataFrame with column names and NaN percentages and Dtypes
nan_info_df = pd.DataFrame({
    'Column Name': nan_percentage_per_column.index,
    'NaN Percentage': nan_percentage_per_column.values,
    'Data Type': cleaned_data.dtypes
})

# Display the new DataFrame
nan_info_df[0:50] # Cheking 50 by 50 to see whole list at a time.

Unnamed: 0,Column Name,NaN Percentage,Data Type
Carcode,Carcode,0.0,int64
Weight/Torque,Weight/Torque,0.0,float64
Weight,Weight,0.0,float64
Fuel,Fuel,0.0,object
Weight/power,Weight/power,0.0,float64
Max power regime.,Max power regime.,0.709871,float64
Cylinder diameter,Cylinder diameter,1.028588,float64
Fuel tank,Fuel tank,0.709871,float64
Steering wheel adjustment height,Steering wheel adjustment height,0.0,int64
Specific power,Specific power,0.709871,float64


## Applying one hot encoding to categorical data.

In [18]:
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# import pandas as pd


# # Define the categorical columns you want to encode
# categorical_columns = ['Tuching', 'Propulsion', 'Car gearbox', 'Fuel', 'Provenance', 'Settings', 'Car size']  # Adjust as needed

# # Initialize the ColumnTransformer to apply OneHotEncoding to the categorical columns
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(), categorical_columns)
#     ],
#     remainder='passthrough'  # Keep other columns unchanged
# )

# # Apply the transformations to the dataset
# # This will fit the OneHotEncoder to your categorical data and transform it,
# # while leaving the rest of your columns unchanged
# transformed_data = preprocessor.fit_transform(cleaned_data)

# # Get the new feature names for the one-hot encoded columns
# new_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_columns)

# # Combine the new feature names with the names of the non-transformed columns
# final_feature_names = list(new_feature_names) + [col for col in cleaned_data.columns if col not in categorical_columns]

# # Create a new DataFrame with the transformed data and the correct column names
# final_df = pd.DataFrame(transformed_data, columns=final_feature_names)

# # Display the first few rows of the final DataFrame
# final_df.head()

In [35]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

def encode_categorical_columns(data, categorical_columns):
    """
    Applies one-hot encoding to specified categorical columns in a DataFrame.

    Parameters:
    - data: pandas.DataFrame, the DataFrame containing the data.
    - categorical_columns: list, a list of column names to be one-hot encoded.

    Returns:
    - A new DataFrame with the specified columns one-hot encoded and original columns unchanged.
    """
    # Initialize the ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(), categorical_columns)
        ],
        remainder='passthrough'  # Keep other columns unchanged
    )


    # Apply the transformations to the dataset
    transformed_data = preprocessor.fit_transform(data)

    # Get the new feature names for the one-hot encoded columns
    new_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_columns)

    # Combine the new feature names with the names of the non-transformed columns
    final_feature_names = [col for col in data.columns if col not in categorical_columns] + list(new_feature_names)

    # Create a new DataFrame with the transformed data and the correct column names
    final_df = pd.DataFrame(transformed_data, columns=final_feature_names, index=data.index)

    return final_df


In [38]:
# Select columns of dtype 'object'
df_object_columns = cleaned_data.select_dtypes(include=['object'])
# Define the categorical columns you want to encode
categorical_columns = ['Tuching', 'Propulsion', 'Car gearbox', 'Fuel', 'Provenance', 'Settings', 'Car size']

In [39]:
df_finale=encode_categorical_columns(df_object_columns, categorical_columns)

ValueError: Shape of passed values is (20708, 1), indices imply (20708, 40)

In [33]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

def encode_all_columns(data):
    """
    Applies one-hot encoding to all columns in a DataFrame, assuming all are categorical.
    Correctly handles the construction of the output DataFrame to avoid shape mismatches.

    Parameters:
    - data: pandas.DataFrame, the DataFrame containing only categorical data.

    Returns:
    - A DataFrame with all columns one-hot encoded.
    """
    # Initialize the OneHotEncoder
    encoder = OneHotEncoder()

    # Apply the transformations to the entire dataset
    transformed_data = encoder.fit_transform(data)

    # Get the new feature names for the one-hot encoded columns
    new_feature_names = encoder.get_feature_names_out(input_features=data.columns)

    # Correctly handle the shape of transformed_data when creating the new DataFrame
    # Ensure the number of columns in final_df matches the length of new_feature_names
    final_df = pd.DataFrame(transformed_data, columns=new_feature_names, index=data.index)

    return final_df


In [37]:
# Select columns of dtype 'object'
df_object_columns = cleaned_data.select_dtypes(include=['object'])


In [34]:
cat_df=encode_all_columns(df_object_columns)
cat_df

ValueError: Shape of passed values is (20708, 1), indices imply (20708, 40)

In [21]:
# Define the categorical columns you want to encode
categorical_columns = ['Tuching', 'Propulsion', 'Car gearbox', 'Fuel', 'Provenance', 'Settings', 'Car size']

In [18]:
# Select columns of dtype 'object'
df_object_columns = cleaned_data.select_dtypes(include=['object'])

# Get just the column names
object_column_names = df_object_columns.columns.tolist()

# Display the column names
print(object_column_names)

['Fuel', 'Car size', 'Tuching', 'Propulsion', 'Car gearbox', 'Provenance', 'Settings']


In [24]:
final_df=encode_categorical_columns(cleaned_data, categorical_columns)
final_df

ValueError: Shape of passed values is (20708, 1), indices imply (20708, 100)

In [71]:
# Define the file path to the 'Downloads' directory
downloads_directory = os.path.expanduser('~/Downloads')

# Specify the full file path for saving the CSV file
csv_file_path = os.path.join(downloads_directory, 'df_9.csv')

# Save the DataFrame to the CSV file
final_df.to_csv(csv_file_path, index=False)

In [101]:
nan_percentage_per_column = final_df.isnull().mean() * 100

# Create a new DataFrame with column names and NaN percentages and Dtypes
nan_info_df = pd.DataFrame({
    'Column Name': nan_percentage_per_column.index,
    'NaN Percentage': nan_percentage_per_column.values,
    'Data Type': final_df.dtypes
})

# Display the new DataFrame
nan_info_df[0:50] # Cheking 50 by 50 to see whole list at a time.

Unnamed: 0,Column Name,NaN Percentage,Data Type
Tuching_Hydro,Tuching_Hydro,0.0,float64
Tuching_Mechanical,Tuching_Mechanical,0.0,float64
Tuching_Unknown,Tuching_Unknown,0.0,float64
Propulsion_Combustion,Propulsion_Combustion,0.0,float64
Propulsion_Electric,Propulsion_Electric,0.0,float64
Propulsion_Hybrid,Propulsion_Hybrid,0.0,float64
Propulsion_Light Hybrid,Propulsion_Light Hybrid,0.0,float64
Propulsion_Plug-in hybrid,Propulsion_Plug-in hybrid,0.0,float64
Car gearbox_Automated,Car gearbox_Automated,0.0,float64
Car gearbox_Automatic,Car gearbox_Automatic,0.0,float64


## putting 0 for empty cells in numerical columns.

In [76]:
def fill_missing_values_with_zero(data):
    """
    Fills all missing values in the DataFrame with 0.

    Parameters:
    - data: pandas.DataFrame, the DataFrame containing the data.

    Returns:
    - A new DataFrame with missing values filled with 0.
    """
    # Fill missing values with 0
    filled_data = data.fillna(0)

    return filled_data

In [77]:
final_cleaned_data=fill_missing_values_with_zero(final_df)

In [79]:
nan_percentage_per_column = final_cleaned_data.isnull().mean() * 100

# Create a new DataFrame with column names and NaN percentages and Dtypes
nan_info_df = pd.DataFrame({
    'Column Name': nan_percentage_per_column.index,
    'NaN Percentage': nan_percentage_per_column.values,
    'Data Type': final_cleaned_data.dtypes
})

# Display the new DataFrame
nan_info_df[0:50] # Cheking 50 by 50 to see whole list at a time.

Unnamed: 0,Column Name,NaN Percentage,Data Type
Tuching_Hydro,Tuching_Hydro,0.0,float64
Tuching_Mechanical,Tuching_Mechanical,0.0,float64
Tuching_Unknown,Tuching_Unknown,0.0,float64
Propulsion_Combustion,Propulsion_Combustion,0.0,float64
Propulsion_Electric,Propulsion_Electric,0.0,float64
Propulsion_Hybrid,Propulsion_Hybrid,0.0,float64
Propulsion_Light Hybrid,Propulsion_Light Hybrid,0.0,float64
Propulsion_Plug-in hybrid,Propulsion_Plug-in hybrid,0.0,float64
Car gearbox_Automated,Car gearbox_Automated,0.0,float64
Car gearbox_Automatic,Car gearbox_Automatic,0.0,float64


In [None]:
# Define the file path to the 'Downloads' directory
downloads_directory = os.path.expanduser('~/Downloads')

# Specify the full file path for saving the CSV file
csv_file_path = os.path.join(downloads_directory, 'df_9.csv')

# Save the DataFrame to the CSV file
final_df.to_csv(csv_file_path, index=False)

In [2]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
features_df=pd.read_csv('/Users/bididudy/code/marcnaweb/car_recommendation_engine/raw_data/categorized_car_features.csv')

  features_df=pd.read_csv('/Users/bididudy/code/marcnaweb/car_recommendation_engine/raw_data/categorized_car_features.csv')


In [8]:
features_df.rename(columns={'Carcode': 'car_code'}, inplace=True)

In [5]:
prices_relevant_df=pd.read_csv('/Users/bididudy/code/marcnaweb/car_recommendation_engine/raw_data/car_prices_enriched_v3_carPriceIndex.csv')


In [6]:
last_price_df=prices_relevant_df[['car_code', 'car_last_price_in_brl']]

In [10]:
import pandas as pd

# Assuming you've already loaded the datasets into features_df and prices_df

# Step 1: Define the matching columns. Replace these with actual matching columns.
matching_columns = ['car_code']

# Step 2: Merge the datasets based on the defined matching columns.
# Note: This step assumes that both datasets contain the columns defined in `matching_columns`.
# You might need to adjust the columns in each dataset to match exactly.
merged_df = pd.merge(features_df, last_price_df, how='left', on='car_code')

# Step 3: Check the merged dataset
merged_df.columns

Index(['car_code', 'Deactivation of cylinders',
       'Frontal Driver Seat Extension', 'Driver s seat with memories', 'Power',
       'Weight/Torque', 'Driver s seat with heating',
       'Driver s seat with electrical adjustment', 'Weight',
       'Lumbar Support Adjustment for the Driver',
       ...
       'Headlights with height adjustment', 'Platform', 'Urban autonomyG',
       'Pneumatic suspension', 'Radio', 'Maximum speed electric mode',
       'Electronic noise cancellation', 'Facial recognition camera',
       'Track centralization assistant', 'car_last_price_in_brl'],
      dtype='object', length=283)

In [12]:
import os

# Define the file path to the 'Downloads' directory
downloads_directory = os.path.expanduser('~/Downloads')

# Specify the full file path for saving the CSV file
csv_file_path = os.path.join(downloads_directory, 'df_9.csv')

# Save the DataFrame to the CSV file
merged_df.to_csv(csv_file_path, index=False)

In [18]:
filtered_columns = merged_df.columns[merged_df.isna().sum() < 15000]
filtered_df = merged_df[filtered_columns]

In [19]:
filtered_df .shape

(23904, 93)