### Obtenemos los Productos

In [6]:
import pandas as pd

# Load the product master list (the export is semicolon-delimited)
df = pd.read_csv('raw_data/Vehiculos/MaestraProd_cm_2239-8-lr23.csv', sep=';')

# Distinct product types found in the "TIPO PRODUCTO" column
tipo_column = df.loc[:, 'TIPO PRODUCTO']
unique_tipos = tipo_column.unique()

# Header line followed by one product type per line
print("Unique TIPO PRODUCTO values:")
for tipo in unique_tipos:
    print(tipo)

Unique TIPO PRODUCTO values:
CAMIONETA
SEDÁN
SUV
MINIBUS
FURGÓN
CARGO
EXCAVADORA
MOTONIVELADORA
RETROEXCAVADORA
TOLVA
CARGADOR
BUS PESADO


### Para las categorías que nos importan obtenemos la marca y modelo de manera única

In [7]:
import pandas as pd

# Load the product master list (the export is semicolon-delimited)
df = pd.read_csv('raw_data/Vehiculos/MaestraProd_cm_2239-8-lr23.csv', sep=';')

# Product types we want to keep
tipos_producto = ['CAMIONETA', 'SEDÁN', 'SUV', 'MINIBUS', 'FURGÓN']

# One DataFrame of unique (MARCA, MODELO) pairs per product type
all_results = []

for tipo in tipos_producto:
    print(f"\nTipo Producto: {tipo}")

    # Select the rows of this type, keep each MARCA/MODELO pair once,
    # order them alphabetically and tag them with the product type.
    unique_combinations = (
        df.loc[df['TIPO PRODUCTO'] == tipo, ['MARCA', 'MODELO']]
        .drop_duplicates()
        .sort_values(['MARCA', 'MODELO'])
        .assign(**{'TIPO PRODUCTO': tipo})
    )

    all_results.append(unique_combinations)

# Stack the per-type frames into a single table with a fresh index
result_df = pd.concat(all_results, ignore_index=True)

# Persist the table for the following steps
output_file = 'unique_marca_modelo_by_tipo_producto.csv'
result_df.to_csv(output_file, index=False)
print(f"\nResults saved to {output_file}")


Tipo Producto: CAMIONETA
Marca: CHANGAN, Modelo: HUNTER COMFORT 2.0T 4X2 MT 2024
Marca: CHANGAN, Modelo: HUNTER COMFORT 2.0T 4X4 MT 2024
Marca: CHANGAN, Modelo: HUNTER ELITE SPORT 2.0T 4X4 MT 2024
Marca: CHANGAN, Modelo: HUNTER LUXURY 2.0T 4X2 MT 2024
Marca: CHANGAN, Modelo: MD 301 AC ESP MT 2024
Marca: CHANGAN, Modelo: MD201 MT 2024
Marca: CHANGAN, Modelo: MD201 XL 1.5L MT 2024
Marca: CHANGAN, Modelo: MS 201 MT 2024
Marca: CHANGAN, Modelo: MS 301 PLUS AC ESP 2024
Marca: CHEVROLET, Modelo: COLORADO HIGH COUNTRY 4X4 AT 2024
Marca: CHEVROLET, Modelo: COLORADO LS 4X2 MT 2024
Marca: CHEVROLET, Modelo: COLORADO LT 4X4 AT 2024
Marca: CHEVROLET, Modelo: COLORADO LT 4X4 MT 2024
Marca: CHEVROLET, Modelo: COLORADO LTZ 4X4 AT 2024
Marca: CHEVROLET, Modelo: COLORADO Z71 4X4 AT 2024
Marca: CHEVROLET, Modelo: MONTANA 1.2 T LT MT 2024
Marca: CHEVROLET, Modelo: MONTANA 1.2 T LTZ AT 2024
Marca: CHEVROLET, Modelo: MONTANA 1.2 T PREMIER AT 2024
Marca: CHEVROLET, Modelo: MONTANA 1.2 T PREMIER MT 2024
Mar

### Vemos que en el nombre del producto hay varias especificaciones además del nombre del modelo, como el motor, la tracción (drive), la transmisión y el año. Todos son del 2024.

In [20]:
import pandas as pd
import re

# Load the unique MARCA / MODELO / TIPO PRODUCTO combinations from the CSV
# written earlier in this notebook (default comma separator).
df = pd.read_csv('unique_marca_modelo_by_tipo_producto.csv')

def parse_modelo(modelo):
    """Split a free-text MODELO string into a model name and spec fields.

    The string is split on whitespace and every token is classified as one of:

    * Year: the LAST token that looks like a calendar year (19xx or 20xx).
      Earlier 4-digit tokens are kept in the model name, so numeric model
      names such as "3008" or "1500" are no longer swallowed as a year.
    * Engine: a number with a decimal part and/or an ``L``/``T`` suffix
      (e.g. ``1.5``, ``1.5L``, ``2.0T``).
    * Drive: one of 4X2, 4X4, 2WD, 4WD, AWD, 2R, FWD, RWD (case-insensitive).
    * Transmission: MT, AT, CVT, DCT, EAT6 or EAT8, optionally prefixed by a
      single digit (e.g. ``6AT``), case-insensitive.
    * Anything else is part of the model name.

    If a field occurs more than once, the last occurrence wins.

    Args:
        modelo: The raw MODELO value. Non-string values (for example the NaN
            pandas produces for an empty cell) yield an all-empty result.

    Returns:
        dict with the keys 'Model Name', 'Engine', 'Drive', 'Transmission'
        and 'Year'; fields that were not found are empty strings. Drive and
        Transmission are upper-cased; the other fields keep their original
        casing.
    """
    parsed = {
        'Model Name': '',
        'Engine': '',
        'Drive': '',
        'Transmission': '',
        'Year': ''
    }

    # Empty CSV cells arrive as float NaN, which has no .strip()
    if not isinstance(modelo, str):
        return parsed

    tokens = modelo.strip().split()

    # Locate the year first: only the last year-like token counts, so that
    # a numeric model name appearing before it stays in the model name.
    year_index = None
    for index, token in enumerate(tokens):
        if re.match(r'^(19|20)\d{2}$', token):
            year_index = index

    drive_types = ('4X2', '4X4', '2WD', '4WD', 'AWD', '2R', 'FWD', 'RWD')
    model_name_tokens = []

    for index, token in enumerate(tokens):
        upper_token = token.upper()

        if index == year_index:
            parsed['Year'] = token
        # Engine size: needs a decimal point or an L/T suffix; a bare integer
        # (e.g. "301") is part of the model name and falls through to `else`.
        elif (re.match(r'^\d+(\.\d+)?[LT]?T?$', token)
              and ('.' in token or token.endswith(('L', 'T')))):
            parsed['Engine'] = token
        elif upper_token in drive_types:
            parsed['Drive'] = upper_token
        elif re.match(r'^\d?(MT|AT|CVT|DCT|EAT6|EAT8)$', upper_token):
            parsed['Transmission'] = upper_token
        else:
            model_name_tokens.append(token)

    parsed['Model Name'] = ' '.join(model_name_tokens).strip()
    return parsed



# Parse every MODELO string; each element of the result is a dict of fields
parsed_data = df['MODELO'].apply(parse_modelo)

# Expand the dicts into one column per parsed field
parsed_df = pd.DataFrame(list(parsed_data))

# Replace the raw MODELO column with the parsed columns, side by side
remaining_columns = df.drop(columns='MODELO')
result_df = pd.concat([remaining_columns, parsed_df], axis='columns')

# Persist the parsed table for the enrichment steps
result_df.to_csv('parsed_output.csv', index=False)


OTRAS PÁGINAS WEB
https://www.car.info/
cars.com
edmunds.com

Changan...
https://db.carnewschina.com/changan/changan-hunter/2024

API de https://api-ninjas.com/api/cars

In [25]:
import csv
import os

import requests

# The API-Ninjas key is read from the environment instead of being stored in
# the notebook. The previous literal also contained the text `API_KEY="..."`
# inside the string itself, so the X-Api-Key header carried an invalid value.
# NOTE(review): the key that was committed here should be treated as leaked
# and rotated.
api_key = os.environ['API_NINJAS_KEY']

cars_endpoint = 'https://api.api-ninjas.com/v1/cars'
request_timeout = 30  # seconds; avoids hanging forever on a stalled request

output_rows = []

# Successful responses keyed by the model word that was queried, so that rows
# sharing the same first word (e.g. every COLORADO trim) trigger one request.
responses_by_model = {}

with open('parsed_output.csv', mode='r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    fieldnames = list(reader.fieldnames or []) + ['API_Response']
    for row in reader:
        # A short row yields None for missing columns; treat it as empty
        model_name = (row.get('Model Name') or '').strip()
        if model_name:
            # The API is queried with the first word of the model name only
            first_word = model_name.split()[0].lower()
            if first_word in responses_by_model:
                row['API_Response'] = responses_by_model[first_word]
            else:
                try:
                    # `params` lets requests URL-encode the model word
                    response = requests.get(
                        cars_endpoint,
                        params={'model': first_word, 'year': 2024},
                        headers={'X-Api-Key': api_key},
                        timeout=request_timeout,
                    )
                except requests.RequestException as exc:
                    # Record the failure for this row and keep going
                    row['API_Response'] = f"Error: {exc}"
                else:
                    if response.status_code == requests.codes.ok:
                        responses_by_model[first_word] = response.text
                        row['API_Response'] = response.text
                    else:
                        row['API_Response'] = f"Error: {response.status_code} {response.text}"
        else:
            row['API_Response'] = "No model name found"
        output_rows.append(row)

# NOTE: the Perplexity cell in this notebook writes to 'output.csv' as well,
# so running both cells overwrites this file.
with open('output.csv', mode='w', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_rows)

### Ejemplo del GET para el modelo Colorado: la API devuelve varios modelos, pero ninguno coincide exactamente. A nivel de la API hay coincidencias exactas, modelos parecidos y algunos que no están.

In [23]:
import os

import requests

# Read the API-Ninjas key from the environment instead of hardcoding it.
# NOTE(review): the key that was committed here should be treated as leaked
# and rotated.
API_KEY = os.environ['API_NINJAS_KEY']

# Example query: up to 50 automatic-transmission 2024 "colorado" entries
api_url = "https://api.api-ninjas.com/v1/cars?limit=50&model=colorado&year=2024&transmission=a"

headers = {'X-Api-Key': API_KEY}

# The timeout keeps the cell from hanging indefinitely on a stalled request
response = requests.get(api_url, headers=headers, timeout=30)

print(response.json())

[{'city_mpg': 18, 'class': 'small pickup truck', 'combination_mpg': 20, 'cylinders': 4, 'displacement': 2.7, 'drive': '4wd', 'fuel_type': 'gas', 'highway_mpg': 23, 'make': 'chevrolet', 'model': 'colorado 4wd', 'transmission': 'a', 'year': 2024}, {'city_mpg': 19, 'class': 'small pickup truck', 'combination_mpg': 21, 'cylinders': 4, 'displacement': 2.7, 'drive': 'rwd', 'fuel_type': 'gas', 'highway_mpg': 24, 'make': 'chevrolet', 'model': 'colorado 2wd', 'transmission': 'a', 'year': 2024}, {'city_mpg': 20, 'class': 'small pickup truck', 'combination_mpg': 22, 'cylinders': 4, 'displacement': 2.7, 'drive': 'rwd', 'fuel_type': 'gas', 'highway_mpg': 24, 'make': 'chevrolet', 'model': 'colorado 2wd', 'transmission': 'a', 'year': 2024}, {'city_mpg': 16, 'class': 'small pickup truck', 'combination_mpg': 16, 'cylinders': 4, 'displacement': 2.7, 'drive': '4wd', 'fuel_type': 'gas', 'highway_mpg': 16, 'make': 'chevrolet', 'model': 'colorado zr2 bison 4wd', 'transmission': 'a', 'year': 2024}, {'city_mp

colorado
silverado
maverick
ranger
sentra
leaf
yaris
bolt
traverse
escape
commander
compass
montero
outlander
kicks
pathfinder
crosstrek
4runner
rav4

In [18]:
import csv
import os

import requests

url = "https://api.perplexity.ai/chat/completions"

# The bearer token is read from the environment instead of being stored in
# the notebook.
# NOTE(review): the token that was committed here should be treated as leaked
# and rotated.
PERPLEXITY_API_KEY = os.environ['PERPLEXITY_API_KEY']

# Stop after this many rows have been written successfully.
# Set to None to process every row.
MAX_ROWS = 1

REQUEST_TIMEOUT = 60  # seconds; avoids hanging forever on a stalled request

# Read your existing CSV file
with open('parsed_output.csv', mode='r', encoding='utf-8-sig') as infile:
    reader = csv.DictReader(infile)
    rows = list(reader)
    base_fieldnames = list(reader.fieldnames or [])

# Output column names for the extra fields ...
new_columns = [
    "FuelType", "MilesPerGallon",
    "Power", "Torque",
    "NumSeats", "FuelTankCapacity", "CargoCapacity",
    "CountryOfManufacture"
]

# ... and the human-readable labels used in the prompt, in the same order
search_labels = [
    "Fuel Type", "Miles per gallon",
    "Power", "Torque",
    "Number of seats", "Capacity of Fuel Tank", "Cargo Capacity",
    "Country Of Manufacture"
]

fieldnames = base_fieldnames + new_columns
fieldnames_search = base_fieldnames + search_labels
additional_fields = ', '.join(search_labels)

# Request headers do not change between rows, so build them once
headers = {
    "Authorization": f"Bearer {PERPLEXITY_API_KEY}",
    "Content-Type": "application/json"
}


def _build_payload(prompt):
    """Return the chat-completions request body for one user prompt."""
    return {
        "model": "llama-3.1-sonar-huge-128k-online",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant that outputs data in CSV format without any additional text. Only output the data in the specified order, separated by commas."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "max_tokens": 300,  # Adjusted for longer responses
        "temperature": 0.2,
        "top_p": 0.9,
        "return_citations": False,
        "return_images": False,
        "return_related_questions": False,
        "search_recency_filter": "month",
        "top_k": 0,
        "stream": False,
        "presence_penalty": 0,
        "frequency_penalty": 1
    }


rows_written = 0

# NOTE: the API-Ninjas cell in this notebook writes to 'output.csv' as well,
# so running both cells overwrites this file.
with open('output.csv', mode='w', newline='', encoding='utf-8') as outfile:
    # extrasaction='ignore': a malformed input row with surplus cells must not
    # abort the whole run when it is written back out
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()

    for row in rows:
        # Construct the prompt from the non-empty fields of this row
        vehicle_data = ', '.join([f"{key}: {value}" for key, value in row.items() if value])
        prompt = f"Given the following vehicle data: {vehicle_data}. Please search and provide the following additional information in the same order, separated by commas: {additional_fields}. If you do not find the information fill with NA, do not put any additional text only the string separated with comas."

        try:
            response = requests.post(
                url,
                json=_build_payload(prompt),
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            # Network failure or timeout: report it and move on to the next row
            print(f"Error: request failed: {exc}")
            continue

        # Check the response status code
        print(f"Status Code: {response.status_code}")

        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            print("Response Text:", response.text)
            continue

        try:
            data = response.json()
            # Extract the assistant's reply; an unexpected payload shape is
            # handled the same way as invalid JSON
            assistant_reply = data['choices'][0]['message']['content']
        except (ValueError, KeyError, IndexError, TypeError):
            print("Failed to parse JSON response.")
            print("Response Text:", response.text)
            continue

        print(assistant_reply)

        # Split the reply into a list of values
        new_values = assistant_reply.strip().split(',')
        print(new_values)

        # zip() would silently truncate on a mismatch, so make it visible
        if len(new_values) != len(new_columns):
            print(f"Warning: expected {len(new_columns)} values but got {len(new_values)}")

        # Create a dictionary of the new data
        new_data = dict(zip(new_columns, [value.strip() for value in new_values]))
        print(new_data)

        # Combine the old and new data
        combined_row = {**row, **new_data}
        print(combined_row)

        # Write the combined data to the new CSV file
        writer.writerow(combined_row)
        rows_written += 1

        # Only successfully written rows count towards the limit
        if MAX_ROWS is not None and rows_written >= MAX_ROWS:
            break



Status Code: 200
Unleaded Petrol/Electric, NA, 114bhp, 141Nm, 5, NA, 390L, Japan
['Unleaded Petrol/Electric', ' NA', ' 114bhp', ' 141Nm', ' 5', ' NA', ' 390L', ' Japan']
{'FuelType': 'Unleaded Petrol/Electric', 'MilesPerGallon': 'NA', 'Power': '114bhp', 'Torque': '141Nm', 'NumSeats': '5', 'FuelTankCapacity': 'NA', 'CargoCapacity': '390L', 'CountryOfManufacture': 'Japan'}
{'MARCA': 'TOYOTA', 'TIPO PRODUCTO': 'SUV', 'Model Name': 'YARIS CROSS XI', 'Engine': '1.5', 'Drive': '', 'Transmission': 'AT', 'Year': '2024', 'FuelType': 'Unleaded Petrol/Electric', 'MilesPerGallon': 'NA', 'Power': '114bhp', 'Torque': '141Nm', 'NumSeats': '5', 'FuelTankCapacity': 'NA', 'CargoCapacity': '390L', 'CountryOfManufacture': 'Japan'}
