In [93]:
import unidecode
import pandas as pd
from pandasgui import show
import re
import csv
import requests


In [94]:
def clean_text_df(df):
    # Create a copy to avoid modifying the original
    df = df.copy()
    
    # Dictionary for character replacements
    replacements = {
        'á': 'a',  'Á': 'a',
        'é': 'e', 'É': 'e', 
        'í': 'i',  'Í': 'i', 
        'ó': 'o',   'Ó': 'o',
        'ú': 'u', 'Ú': 'u',
        'ñ': 'n', 'Ñ': 'n'
    }
    
    # Function to apply replacements
    def replace_chars(text):
        if isinstance(text, str):
            for old, new in replacements.items():
                text = text.replace(old, new)
            return text.lower()
        return text
    
    # Clean column names
    df.columns = [replace_chars(col) for col in df.columns]
    
    # Clean string values
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].apply(replace_chars)
    
    return df

### Get the products
get all the purchases, corresponding to the 3 FAs. 

In [101]:
# Read the CSV file
df17 = pd.read_csv("../raw_data\ConvenioMarco/vehiculos_2017\MaestraProd_cm_2239-4-lr17.csv", sep=';', encoding='cp1252')  
df21 = pd.read_csv("../raw_data\ConvenioMarco/vehiculos_2021\MaestraProd_cm_2239-5-lr21.csv", sep=';',  encoding='cp1252')
df23 = pd.read_csv('../raw_data\ConvenioMarco/vehiculos_2023/MaestraProd_cm_2239-8-lr23.csv', sep=';', encoding='utf-8-sig', encoding_errors='replace')

df17 = clean_text_df(df17)
df21 = clean_text_df(df21)
df23 = clean_text_df(df23)


1. standarize vars 
2. join dfs 

In [103]:
print(df17.columns, df21.columns, df23.columns)
#change var of 17 and 21 to match 23
rename_dict = {'nrolicitacionpublica': 'numero licitacion',
               'idproducto': 'id producto',
               'idproductocm': 'id producto', 
               'rut': 'rut proveedor',
               'idproveedor': 'id proveedor',
                }
df17.rename(columns=rename_dict, inplace=True)
df21.rename(columns=rename_dict, inplace=True)



# Print columns that differ between dataframes
cols_diff = (set(df17.columns) ^ set(df21.columns) ^ set(df23.columns))
print("Different columns:", cols_diff if cols_diff else "None - all columns match")

df17['year'] = 2017
df21['year'] = 2021
df23['year'] = 2023
df = pd.concat([df17, df21, df23], axis=0, ignore_index=True)


Index(['id convenio marco', 'nrolicitacionpublica', 'idproducto',
       'nombre producto', 'idtipoproducto', 'tipo producto', 'marca', 'modelo',
       'medida', 'descripcion', 'idproveedor', 'rut', 'proveedor', 'precio',
       'moneda', 'estado productoproveedorconvenio',
       'estado proveedor convenio', 'estado producto', 'fechaactualizacion'],
      dtype='object') Index(['nrolicitacionpublica', 'idproductocm', 'nombreproducto',
       'idtipoproducto', 'tipoproducto', 'marca', 'modelo', 'medida',
       'descripcion', 'rut', 'orgcode', 'razonsocialproveedor', 'moneda',
       'precio', 'tipo_precio', 'stock', 'fechaactualizacion'],
      dtype='object') Index(['convenio marco', 'numero licitacion', 'id convenio marco',
       'id proveedor', 'nombre proveedor', 'rut proveedor', 'id producto',
       'codigo onu', 'producto', 'id tipo producto', 'tipo producto', 'region',
       'marca', 'modelo', 'medida', 'stock', 'precio en tienda',
       'precio regular', 'precio oferta', 

In [104]:
# Get unique values from the "tipo producto" column
unique_tipos = df['tipo producto'].value_counts()
print(unique_tipos)

print(df['tipo producto'].unique())


tipo producto
suv                                                 5154
camioneta                                           4052
cargo                                               2752
sedan                                               1114
tolva                                                633
excavadora                                           506
furgon                                               485
minibus                                              393
cargador                                             358
aditamento para maquinaria                           298
mantencion preventiva vehiculo liviano y mediano     271
mantencion preventiva vehiculo pesado                250
hatchback                                            144
bus pesado                                           133
retroexcavadora                                      112
tractor                                               87
buses media y larga distancia                         81
tracto           

### select relevant categories and obtain their brand and model

In [105]:
# List of Tipo Producto we're interested in: needs to have more than 100 obs and be reasonable small (no trucks, buses, tolva, etc)
tipos_producto = ['camioneta', 'sedan', 'suv', 'minibus', 'furgon', 'hatchback']


# Create an empty list to store all results
all_results = []

# Iterate through each Tipo Producto
for tipo in tipos_producto:
    print(f"\nTipo Producto: {tipo}")
    
    # Filter the dataframe for the current Tipo Producto
    df_filtered = df[df['tipo producto'] == tipo]
    
    # Get unique combinations of Marca and Modelo
    unique_combinations = df_filtered[['marca', 'modelo']].drop_duplicates()
    
    # Sort the combinations by marca and Modelo
    unique_combinations = unique_combinations.sort_values(['marca', 'modelo'])
    
    # Add Tipo Producto column
    unique_combinations['tipo producto'] = tipo
    
    # Append to all_results
    all_results.append(unique_combinations)

# Combine all results into a single DataFrame
result_df = pd.concat(all_results, ignore_index=True)

# Save the result to a CSV file
output_file = 'csvs/unique_marca_modelo_by_tipo_producto.csv'
result_df.to_csv(output_file, index=False)
print(f"\nResults saved to {output_file}")


Tipo Producto: camioneta

Tipo Producto: sedan

Tipo Producto: suv

Tipo Producto: minibus

Tipo Producto: furgon

Tipo Producto: hatchback

Results saved to csvs/unique_marca_modelo_by_tipo_producto.csv


### Vemos que en el nombre del producto hay varias especificaciones aparte del nombre, como el motor, drive, transmisión y año. Todos son del 2024.

In [106]:
# Read the CSV data into a DataFrame
df = pd.read_csv('csvs/unique_marca_modelo_by_tipo_producto.csv')

def parse_modelo(modelo):
    parsed = {
        'Model Name': '',
        'Engine': '',
        'Drive': '',
        'Transmission': '',
        'Year': ''
    }
    tokens = modelo.strip().split()
    model_name_tokens = []
    started_fields = False

    for token in tokens:
        # Check for year (assuming it's a 4-digit number)
        if re.match(r'^\d{4}$', token):
            parsed['Year'] = token
            started_fields = True
        # Check for engine size with decimal or ending with 'L' or 'T'
        elif re.match(r'^\d+(\.\d+)?[LT]?T?$', token):
            if ('.' in token) or token.endswith(('L', 'T')):
                parsed['Engine'] = token
                started_fields = True
            else:
                if not started_fields:
                    model_name_tokens.append(token)
                else:
                    model_name_tokens.append(token)
        # Check for drive types
        elif token.upper() in ['4X2', '4X4', '2WD', '4WD', 'AWD', '2R', 'FWD', 'RWD']:
            parsed['Drive'] = token.upper()
            started_fields = True
        # Check for transmission types (including multi-character)
        elif re.match(r'^\d?(MT|AT|CVT|DCT|EAT6|EAT8)$', token.upper()):
            parsed['Transmission'] = token.upper()
            started_fields = True
        else:
            # Always add to Model Name
            model_name_tokens.append(token)

    parsed['Model Name'] = ' '.join(model_name_tokens).strip()
    return parsed



# Apply the parsing function to the DataFrame
parsed_data = df['modelo'].apply(parse_modelo)
parsed_df = pd.DataFrame(parsed_data.tolist())

# Combine the parsed data with the original DataFrame
result_df = pd.concat([df.drop(columns=['modelo']), parsed_df], axis=1)

# Save the result to a new CSV file
result_df.to_csv('csvs/parsed_output.csv', index=False)


OTRAS PÁGINAS WEB
https://www.car.info/
cars.com
edmunds.com

Changan...
https://db.carnewschina.com/changan/changan-hunter/2024

API de https://api-ninjas.com/api/cars

In [91]:
api_key = '9GyIlheuW+x4HWPLGzAqEQ==iTzy4qqM2x8bUybS'  # Replace with your actual API key

output_rows = []

with open('csvs/parsed_output.csv', mode='r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    fieldnames = reader.fieldnames + ['API_Response']
    for row in reader:
        model_name = row.get('Model Name', '').strip()
        if model_name:
            first_word = model_name.split()[0].lower()
            api_url = f'https://api.api-ninjas.com/v1/cars?model={first_word}&year=2024'
            response = requests.get(api_url, headers={'X-Api-Key': api_key})
            if response.status_code == requests.codes.ok:
                row['API_Response'] = response.text
            else:
                row['API_Response'] = f"Error: {response.status_code} {response.text}"
        else:
            row['API_Response'] = "No model name found"
        output_rows.append(row)

with open('output.csv', mode='w', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_rows)

### Ejemplo del get de los modelo Colorado, hay varios modelos pero no exactos. A nivel de la API hay match exactos, modelos parecidos y algunos que no están.

In [92]:
API_KEY="9GyIlheuW+x4HWPLGzAqEQ==iTzy4qqM2x8bUybS"

api_url = "https://api.api-ninjas.com/v1/cars?limit=50&model=colorado&year=2024&transmission=a"

headers = {'X-Api-Key': API_KEY}

response = requests.get(api_url, headers=headers)

print(response.json())

[{'city_mpg': 18, 'class': 'small pickup truck', 'combination_mpg': 20, 'cylinders': 4, 'displacement': 2.7, 'drive': '4wd', 'fuel_type': 'gas', 'highway_mpg': 23, 'make': 'chevrolet', 'model': 'colorado 4wd', 'transmission': 'a', 'year': 2024}, {'city_mpg': 19, 'class': 'small pickup truck', 'combination_mpg': 21, 'cylinders': 4, 'displacement': 2.7, 'drive': 'rwd', 'fuel_type': 'gas', 'highway_mpg': 24, 'make': 'chevrolet', 'model': 'colorado 2wd', 'transmission': 'a', 'year': 2024}, {'city_mpg': 20, 'class': 'small pickup truck', 'combination_mpg': 22, 'cylinders': 4, 'displacement': 2.7, 'drive': 'rwd', 'fuel_type': 'gas', 'highway_mpg': 24, 'make': 'chevrolet', 'model': 'colorado 2wd', 'transmission': 'a', 'year': 2024}, {'city_mpg': 16, 'class': 'small pickup truck', 'combination_mpg': 16, 'cylinders': 4, 'displacement': 2.7, 'drive': '4wd', 'fuel_type': 'gas', 'highway_mpg': 16, 'make': 'chevrolet', 'model': 'colorado zr2 bison 4wd', 'transmission': 'a', 'year': 2024}, {'city_mp

colorado
silverado
maverick
ranger
sentra
leaf
yaris
bolt
traverse
escape
commander
compass
montero
outlander
kicks
pathfinder
crosstrek
4runner
rav4

In [18]:
import requests
import csv

url = "https://api.perplexity.ai/chat/completions"

# Read your existing CSV file
with open('parsed_output.csv', mode='r', encoding='utf-8-sig') as infile:
    reader = csv.DictReader(infile)
    rows = list(reader)

# Prepare to write to a new CSV file with additional columns
fieldnames = reader.fieldnames + [
    "FuelType", "MilesPerGallon",
    "Power", "Torque",
    "NumSeats", "FuelTankCapacity", "CargoCapacity",
    "CountryOfManufacture"
]

fieldnames_search = reader.fieldnames + [
    "Fuel Type", "Miles per gallon",
    "Power", "Torque",
    "Number of seats", "Capacity of Fuel Tank", "Cargo Capacity",
    "Country Of Manufacture"
]


with open('output.csv', mode='w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()

    for row in rows:
        # Construct the prompt
        vehicle_data = ', '.join([f"{key}: {value}" for key, value in row.items() if value])
        additional_fields = ', '.join(fieldnames_search[len(reader.fieldnames):])
        prompt = f"Given the following vehicle data: {vehicle_data}. Please search and provide the following additional information in the same order, separated by commas: {additional_fields}. If you do not find the information fill with NA, do not put any additional text only the string separated with comas."

        payload = {
            "model": "llama-3.1-sonar-huge-128k-online",
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful assistant that outputs data in CSV format without any additional text. Only output the data in the specified order, separated by commas."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "max_tokens": 300,  # Adjusted for longer responses
            "temperature": 0.2,
            "top_p": 0.9,
            "return_citations": False,
            "return_images": False,
            "return_related_questions": False,
            "search_recency_filter": "month",
            "top_k": 0,
            "stream": False,
            "presence_penalty": 0,
            "frequency_penalty": 1
        }
        headers = {
            "Authorization": "Bearer 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",
            "Content-Type": "application/json"
        }

        response = requests.post(url, json=payload, headers=headers)

        # Check the response status code
        print(f"Status Code: {response.status_code}")

        if response.status_code == 200:
            try:
                data = response.json()
                # Extract the assistant's reply
                assistant_reply = data['choices'][0]['message']['content']
                print(assistant_reply)

                # Split the reply into a list of values
                new_values = assistant_reply.strip().split(',')
                print(new_values)

                # Create a dictionary of the new data
                new_data = dict(zip(fieldnames[len(reader.fieldnames):], [value.strip() for value in new_values]))
                print(new_data)

                # Combine the old and new data
                combined_row = {**row, **new_data}
                print(combined_row)

                # Write the combined data to the new CSV file
                writer.writerow(combined_row)

            except ValueError as e:
                print("Failed to parse JSON response.")
                print("Response Text:", response.text)
                continue  # Skip to the next iteration or handle the error accordingly
        else:
            print(f"Error: Received status code {response.status_code}")
            print("Response Text:", response.text)
            continue  # Skip to the next iteration or handle the error accordingly

        break  # Remove this break statement if you want to process all rows



Status Code: 200
Unleaded Petrol/Electric, NA, 114bhp, 141Nm, 5, NA, 390L, Japan
['Unleaded Petrol/Electric', ' NA', ' 114bhp', ' 141Nm', ' 5', ' NA', ' 390L', ' Japan']
{'FuelType': 'Unleaded Petrol/Electric', 'MilesPerGallon': 'NA', 'Power': '114bhp', 'Torque': '141Nm', 'NumSeats': '5', 'FuelTankCapacity': 'NA', 'CargoCapacity': '390L', 'CountryOfManufacture': 'Japan'}
{'MARCA': 'TOYOTA', 'TIPO PRODUCTO': 'SUV', 'Model Name': 'YARIS CROSS XI', 'Engine': '1.5', 'Drive': '', 'Transmission': 'AT', 'Year': '2024', 'FuelType': 'Unleaded Petrol/Electric', 'MilesPerGallon': 'NA', 'Power': '114bhp', 'Torque': '141Nm', 'NumSeats': '5', 'FuelTankCapacity': 'NA', 'CargoCapacity': '390L', 'CountryOfManufacture': 'Japan'}
