This file useses the purchases of the four FAs that we could find (2239-5-LR21, 2239-8-LR23, 2239-4-LR17 and ID 2239-20-LP13 ) and estimates a demand model. 

    'IDProductoCM' -> j 
    To create product characteristics: 'Tipo de Producto', 'Marca', 'Nombre Producto ONU'
    'Precio Unitario'-> net not including taxes, just changes the scale of the price parameter 
    'Rut Unidad de Compra' -> i 
    To create the groups (k):  'Región Unidad de Compra', 'Sector'
     'year' -> t 

     'Modelo'-> to do the match with the product characteristics. 
        
Variables not directly paired but which could be useful: 
    'Nro Licitación Pública', 'Id Convenio Marco', 'Convenio Marco', 'CodigoOC', 'Fecha Envío OC', 'Cantidad', 'Rut Proveedor', 
    'Nombre Proveedor Sucursal', 'Orgcode_Comprador', 'Entcode_Comprador', 


In [1]:
%reset -f

In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from pandasgui import show
import socket
import numpy as np
from unidecode import unidecode
import unicodedata
import re
from difflib import SequenceMatcher
import pyblp


Functions

In [3]:
def calculate_market_shares_with_outside_good(df_transac, share_adjustment = True):
    """
    Calculate market shares using market-specific N_i from the DataFrame
    """
    # Group by product (j) and market (t) to count purchases
    product_counts = df_transac.groupby(['product_ids', 'market_ids']).size().reset_index(name='purchases')
    
    # Get N_i for each market (taking first occurrence since it's constant within market)
    market_populations = df_transac.groupby('market_ids')['N_i'].first().reset_index()
    
    # Merge N_i with product counts
    product_counts = product_counts.merge(market_populations, on='market_ids')
    
    # Calculate market shares
    if share_adjustment: 
        product_counts['shares'] = np.minimum(5*product_counts['purchases'] / product_counts['N_i'], 0.5)
    else: 
        product_counts['shares'] = product_counts['purchases'] / product_counts['N_i']
    
    product_counts['shares_real'] =product_counts['purchases'] / product_counts['N_i']
                                               
    x_vars = [col for col in df_transac.columns if col.startswith('x')] #list of variables starting with x 
    
    # Get product characteristics
    product_chars = df_transac.groupby(['product_ids', 'market_ids']).agg(
        {var: 'first' for var in x_vars} | {'prices': 'first'}
    ).reset_index()
    
    # Merge market shares with product characteristics
    final_df = product_chars.merge(product_counts[['product_ids', 'market_ids', 'shares', 'shares_real']], on=['product_ids', 'market_ids'])
    
    # Calculate outside good share for each market
    market_shares = final_df.groupby('market_ids')['shares'].sum()
    outside_shares = 1 - market_shares
    market_shares_real = final_df.groupby('market_ids')['shares_real'].sum()
    outside_shares_real = 1 - market_shares_real
    
    
    # Create outside good rows
    markets = df_transac['market_ids'].unique()
    outside_goods = pd.DataFrame()
    outside_goods['market_ids'] = markets
    outside_goods['product_ids'] = 0

    for var in x_vars:
        if pd.api.types.is_numeric_dtype(df_transac[var]):
            outside_goods[var] = 0  # Set numeric variables to 0
        else:
            outside_goods[var] = '-'  # Set categorical variables to '-'
    
    outside_goods['prices'] = 0
    outside_goods['shares'] = [outside_shares[t] for t in markets]
    outside_goods['shares_real'] = [outside_shares_real[t] for t in markets]

    # Combine outside goods with other products
    final_df = pd.concat([final_df, outside_goods])
    
    # Sort by market and product ID
    final_df = final_df.sort_values(['market_ids', 'product_ids'])
    
    return final_df

def normalize_shares(group):
    """
    Function to normalize shares for each market
    """
    # Fix shares that are not negative
    total_shares = group[group['shares'] > 0.01]['shares'].sum()
    scaling_factor = 0.99 - group[group['shares'] == 0.01]['shares'].sum()
    if total_shares > 0:  # Avoid division by zero
        group.loc[group['shares'] > 0.01, 'shares'] *= scaling_factor / total_shares
    return group

In [20]:
car_path = os.path.join('..', 'car_data', 'final_matched_data.csv')
print(os.path.abspath(car_path))
car_df_original = pd.read_csv(car_path, encoding = 'latin1')
car_df = car_df_original.copy()

c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\car_data\final_matched_data.csv


## data cleaning

1. select vars that could be relevant
2. rename prod. char as x1, x2.. and do other renaming 
3.  to reduce fixed effects if a variable is categorical and has more than 5 values we remove it. 

In [21]:
#### 1. variable selection 
char_list = [ 'Motor - Cilindrada', 'Medidas y capacidades - Ancho sin espejos', 'Medidas y capacidades - Largo', 
        'Medidas y capacidades - Alto', 'Performance - Rendimiento en ciudad', 'TransmisiÃ³n y chasis - Motor - tracciÃ³n', 
        'TransmisiÃ³n y chasis - SuspensiÃ³n trasera', 'Confort - Aire acondicionado', 'Confort - Tapizados',
        'Confort - Cierre de puertas', 'Confort - Vidrios (del. - tras.)', 'Confort - Espejos exteriores',
        'Confort - Faros delanteros', 'Confort - Faros antiniebla', 'Confort - Computadora de a bordo', 
        'Confort - DirecciÃ³n asistida', 'Confort - Llantas', 'Seguridad - Airbags', 
        'Seguridad - Alarma e inmovilizador de motor', 'Confort - Sensores de estacionamiento', 'Tipo de Producto']   
            
model_vars = ['Cargos Adicionales OC', 'Monto Total OC', 'Rut Unidad de Compra', 
        'RegiÃÂ³n Unidad de Compra', 'Sector', 'CodigoOC' , 'Marca', 'Precio Unitario',
        'IDProductoCM',  'Modelo']

extra_vars = ['Nro LicitaciÃÂ³n PÃÂºblica', 'Fecha EnvÃÂ­o OC', 'Cantidad', 'Rut Proveedor', 'Nombre Proveedor Sucursal']

car_df = car_df[model_vars + extra_vars + char_list]

### 2. rename vars 
for n, col in enumerate(char_list, start=1):  # Start numbering from 1
    car_df.rename(columns={col: f'x{n}'}, inplace=True)

car_df = car_df.rename(columns={
    'Fecha EnvÃÂ­o OC': 'Fecha',
    'RegiÃÂ³n Unidad de Compra': 'Region', 
    'IDProductoCM': 'product_ids',
})

### 3. drop vars to reduce number of parameters to estimate
for n, col in enumerate(char_list, start=1):  # Start numbering from 1
    print(f"number of unique values in {f'x{n}'}: {car_df[f'x{n}'].nunique()}")


drop_list = ['x7', 'x9', 'x11', 'x12', 'x13', 'x18']
car_df = car_df.drop(drop_list, axis=1)


number of unique values in x1: 25
number of unique values in x2: 24
number of unique values in x3: 29
number of unique values in x4: 28
number of unique values in x5: 18
number of unique values in x6: 3
number of unique values in x7: 18
number of unique values in x8: 5
number of unique values in x9: 5
number of unique values in x10: 3
number of unique values in x11: 4
number of unique values in x12: 10
number of unique values in x13: 19
number of unique values in x14: 4
number of unique values in x15: 1
number of unique values in x16: 3
number of unique values in x17: 3
number of unique values in x18: 8
number of unique values in x19: 5
number of unique values in x20: 3
number of unique values in x21: 2


1. Create month-year 
2. create markets and number of consumers
3. create consumer group

In [None]:
#1. create month-year 
car_df['Fecha'] = pd.to_datetime(car_df['Fecha'], errors='coerce')
car_df['month'] = car_df['Fecha'].dt.month
car_df['year'] = car_df['Fecha'].dt.year
car_df.drop('Fecha', axis=1, inplace=True)

#2. define the market and the number of consumers in the market 
car_df['semester'] = ((car_df['month'] - 1) // 6) + 1
car_df['market_ids'] = car_df.groupby(['year', 'semester', 'Region']).ngroup()
print(car_df['market_ids'].nunique())

region_counts = car_df.groupby('Region')['market_ids'].nunique()
car_df['N_i'] = car_df['Region'].map(region_counts)

## 3. create consumer groups based on zone-sector
#print(car_df['Region'].unique())
Norte = ['Arica y Parinacota', 'TarapacÃ\x83Â\x83Ã\x82Â¡', 'Antofagasta', 'Atacama', 'Coquimbo']
Centro = ["Lib. Gral. Bdo. O'Higgins", 'Maule', 'Metropolitana', 'ValparaÃ\x83Â\x83Ã\x82Â\xadso' ]
Sur = ['BÃ\x83Â\x83Ã\x82Â\xado-BÃ\x83Â\x83Ã\x82Â\xado', 'Los RÃ\x83Â\x83Ã\x82Â\xados', 'Ã\x83Â\x83Ã\x82Â\x91uble',
    'Los Lagos', 'AraucanÃ\x83Â\x83Ã\x82Â\xada', 'Magallanes y AntÃ\x83Â\x83Ã\x82Â¡rtica', 'AysÃ\x83Â\x83Ã\x82Â©n']


def get_zone(region):
    if region in Norte:
        return 'Norte'
    elif region in Centro:
        return 'Centro'
    elif region in Sur:
        return 'Sur'
    else:
        return 'Other'  # For any regions not in the lists
car_df['Zona'] = car_df['Region'].apply(get_zone)
car_df['k'] = car_df.groupby(['Sector', 'Zona']).ngroup()
car_df['k2'] = car_df.groupby('Sector').ngroup()

74


1. convert to numeric values and drop chars with NaNs 
2. drop columns with NaNs since they can not be used.


In [109]:
# 1. convert to numeric values and drop chars with NaNs 
def extract_number(value):
    if pd.isna(value) or value == 'N/D':
        return np.nan
    # Extract numbers using string methods
    if isinstance(value, str):
        # Find first sequence of numbers (including decimals)
        numbers = ''.join(c for c in value if c.isdigit() or c == '.')
        return float(numbers) if numbers else value
    return value

for column in car_df.columns:
    try:
        car_df[column] = car_df[column].apply(extract_number)
    except:
        continue # If conversion fails, keep the original values


## 2. drop rows with NaNs
x_columns = [col for col in car_df.columns if col.startswith('x')]
x_columns_with_missing = [col for col in x_columns if car_df[col].isna().any()] #cols start with x and have NaNs
non_x_columns = [col for col in car_df.columns if not col.startswith('x')] # Get non-x columns 
car_df = car_df.drop(columns=x_columns_with_missing) # Drop x-columns with missing values

remaining_x_columns = [col for col in car_df.columns if col.startswith('x')]
new_x_names = [f'x{i+1}' for i in range(len(remaining_x_columns))] #rename x cols so that they dont skip numbers 
rename_dict = dict(zip(remaining_x_columns, new_x_names))
car_df = car_df.rename(columns=rename_dict) # Rename only the x-columns

In [24]:
car_df['Monto Total OC'] = car_df['Monto Total OC'].replace(r',.*', '', regex=True) #remove commas from numbers

columns_to_convert = ['Monto Total OC'] 

# Convert the specified columns to numeric
car_df[columns_to_convert] = car_df[columns_to_convert].apply(pd.to_numeric, errors='coerce')
car_df['prices'] = car_df['Monto Total OC'] / car_df['Cantidad']

In [25]:
product_data = calculate_market_shares_with_outside_good(car_df, share_adjustment=False)

In [None]:
show(product_data)

check that shares 1. are positive and 2. sum upp to less than 1. 

In [26]:
###1. Check for negative shares
# percentage of neg shares. (if share is negative replace by 2%)
negative_shares_percentage = (product_data['shares'] < 0).mean() * 100
print(f"Percentage of negative shares: {negative_shares_percentage:.2f}%")
product_data['shares'] = product_data['shares'].apply(lambda x: 0.02 if x < 0.02 else x)

#shares have to sum up to 1%, normalize  by market_ids
product_data['shares_sum'] = product_data.groupby('market_ids')['shares'].transform('sum')
product_data['shares'] = product_data['shares'] / (product_data['shares_sum']+0.05) # add 0.05 since shares have to sum up to less than 1. 

# 3. check that outside shares are not too high and that there are no negative shares
outside_good_shares = product_data[product_data['product_ids'] == 0]['shares'].describe()
print(outside_good_shares)
negative_shares_percentage = (product_data['shares'] < 0).mean() * 100
print(f"Percentage of negative shares: {negative_shares_percentage:.2f}%")

# 4. check that no share sum up to more than 1 
sum_market_shares = product_data.groupby('market_ids')['shares'].sum()
markets_with_high_shares = sum_market_shares[(sum_market_shares > 1) ]
print(markets_with_high_shares)



Percentage of negative shares: 6.53%
count    74.000000
mean      0.293486
std       0.316287
min       0.000875
25%       0.007099
50%       0.088711
75%       0.634921
max       0.793651
Name: shares, dtype: float64
Percentage of negative shares: 0.00%
Series([], Name: shares, dtype: float64)


## PyBLP without micro-moments

I tried to estimate the model with less variables but I do not understand why the number of moments and the # of parameters decrease, and I do not get the negative price coefficient. Hence for now this estimation is OK. 

In [81]:
formulation_string = '1 + ' + ' + '.join(new_x_names) + ' + prices'
X1_formulation = pyblp.Formulation(formulation_string) # non random coefficient formulation
X2_formulation = pyblp.Formulation('1 + prices') # variables with random coefficients. 
product_formulations = (X1_formulation, X2_formulation)
product_formulations


(1 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x9 + x10 + prices, 1 + prices)

In [82]:

mc_integration = pyblp.Integration('monte_carlo', size=50, specification_options={'seed': 0})
mc_integration

mc_problem = pyblp.Problem(product_formulations, product_data, integration=mc_integration)
mc_problem



pr_integration = pyblp.Integration('product', size=5)
pr_integration

pr_problem = pyblp.Problem(product_formulations, product_data, integration=pr_integration)
pr_problem

bfgs = pyblp.Optimization('bfgs', {'gtol': 1e-4})

Initializing the problem ...
Initialized the problem after 00:00:00.

Dimensions:
 T    N    I     K1    K2    MD 
---  ---  ----  ----  ----  ----
74   521  3700   24    2     23 

Formulations:
       Column Indices:          0     1      2    3                4                           5                           6                      7                       8                                     9                                     10                   11               12                              13                           14              15                     16                     17              18            19                20                     21            22     23  
-----------------------------  ---  ------  ---  ---  ---------------------------  --------------------------  -------------------------  ------------------  -------------------------  -----------------------------------------------  --------------------------  ------------  ------------------  ----

Detected collinearity issues with [x5['manual'], x6['manual'], x7['traseros'], x9['aleaciÃ\x83Â³n']] and at least one other column in X1. To disable collinearity checks, set options.collinear_atol = options.collinear_rtol = 0.
Detected collinearity issues with [x5['manual'], x6['manual'], x7['traseros'], x9['aleaciÃ\x83Â³n']] and at least one other column in ZD. To disable collinearity checks, set options.collinear_atol = options.collinear_rtol = 0.
Detected collinearity issues with [x5['manual'], x6['manual'], x7['traseros'], x9['aleaciÃ\x83Â³n']] and at least one other column in X1. To disable collinearity checks, set options.collinear_atol = options.collinear_rtol = 0.
Detected collinearity issues with [x5['manual'], x6['manual'], x7['traseros'], x9['aleaciÃ\x83Â³n']] and at least one other column in ZD. To disable collinearity checks, set options.collinear_atol = options.collinear_rtol = 0.


In [83]:
results1 = mc_problem.solve(sigma=np.ones((2, 2)), optimization=bfgs)


Solving the problem ...

Nonlinear Coefficient Initial Values:
Sigma:        1           prices      |  Sigma Squared:        1           prices    
------  -------------  -------------  |  --------------  -------------  -------------
  1     +1.000000E+00                 |        1         +1.000000E+00  +1.000000E+00
prices  +1.000000E+00  +1.000000E+00  |      prices      +1.000000E+00  +2.000000E+00
Starting optimization ...



The model may be under-identified. The total number of unfixed parameters is 27, which is more than the total number of moments, 23. Consider checking whether instruments were properly specified when initializing the problem, and whether parameters were properly configured when solving the problem.
Detected that the 2SLS weighting matrix is nearly singular with condition number +3.372674E+24. To disable singularity checks, set options.singular_tol = numpy.inf.



At least one error was encountered. As long as the optimization routine does not get stuck at values of theta that give rise to errors, this is not necessarily a problem. If the errors persist or seem to be impacting the optimization results, consider setting an error punishment or following any of the other suggestions below:
The fixed point computation of delta failed to converge. This problem can sometimes be mitigated by increasing the maximum number of fixed point iterations, increasing the fixed point tolerance, choosing more reasonable initial parameter values, setting more conservative parameter or share bounds, or using different iteration or optimization configurations.

GMM   Computation  Optimization   Objective   Fixed Point  Contraction  Clipped    Objective      Objective      Gradient                                                
Step     Time       Iterations   Evaluations  Iterations   Evaluations  Shares       Value       Improvement       Norm                    

Detected that the estimated covariance matrix of aggregate GMM moments is nearly singular with condition number +4.944639E+23. To disable singularity checks, set options.singular_tol = numpy.inf.



The fixed point computation of delta failed to converge. This problem can sometimes be mitigated by increasing the maximum number of fixed point iterations, increasing the fixed point tolerance, choosing more reasonable initial parameter values, setting more conservative parameter or share bounds, or using different iteration or optimization configurations.

Computed results after 00:00:22.

Problem Results Summary:
GMM     Objective      Gradient         Hessian         Hessian     Clipped  Weighting Matrix
Step      Value          Norm       Min Eigenvalue  Max Eigenvalue  Shares   Condition Number
----  -------------  -------------  --------------  --------------  -------  ----------------
 1    -3.685946E+44  +9.883039E+19  -8.152938E+19   +0.000000E+00     431     +7.058805E+18  

Starting optimization ...


At least one error was encountered. As long as the optimization routine does not get stuck at values of theta that give rise to errors, this is not necessarily a problem. If 

Detected that the estimated covariance matrix of aggregate GMM moments is nearly singular with condition number +1.223445E+24. To disable singularity checks, set options.singular_tol = numpy.inf.
Detected that the estimated covariance matrix of aggregate GMM moments is nearly singular with condition number +7.454853E+24. To disable singularity checks, set options.singular_tol = numpy.inf.


In [17]:
product_formulations = ( pyblp.Formulation('1 + x1 + x2 + x3 +  x6 + x10 + prices'), #X1:  non random coefficient formulation, 
                         pyblp.Formulation('1 + prices') #X2: variables with random coefficients. 
        )   

mc_problem = pyblp.Problem(product_formulations, product_data, integration=mc_integration)
results2 = mc_problem.solve(sigma=np.ones((2, 2)), optimization=bfgs)



The model may be under-identified. The total number of unfixed parameters is 12, which is more than the total number of moments, 8. Consider checking whether instruments were properly specified when initializing the problem, and whether parameters were properly configured when solving the problem.


Initializing the problem ...
Initialized the problem after 00:00:00.

Dimensions:
 T    N    I     K1    K2    MD 
---  ---  ----  ----  ----  ----
74   521  3700   9     2     8  

Formulations:
       Column Indices:          0     1      2    3           4                               5                            6         7     8   
-----------------------------  ---  ------  ---  ---  ------------------  ------------------------------------------  ------------  ---  ------
 X1: Linear Characteristics     1     x1    x2   x3   x6['centralizado']  x6['centralizado con comando a distancia']  x6['manual']  x10  prices
X2: Nonlinear Characteristics   1   prices                                                                                                     
Solving the problem ...

Nonlinear Coefficient Initial Values:
Sigma:        1           prices      |  Sigma Squared:        1           prices    
------  -------------  -------------  |  --------------  -------------  -------

## Py-BLP with micro-moments. 

product_data is the data with the prices/shares/prod char,  in the documentation called product data
agent_data is the data with the demographics, in our case it has the distribution of demographics. 

\begin{align*}
    &E[\text{i purchases new vehicle} \mid \{\bar{y}_i < \bar{y}_1\}], \\
    &E[\text{i purchases new vehicle} \mid \{\bar{y}_1 \leq \bar{y}_i < \bar{y}_2\}], \\
    &E[\text{i purchases new vehicle} \mid \{\bar{y}_i \geq \bar{y}_2\}];
\end{align*}

\begin{align*}
    &E[f_{si} \mid \{\text{i purchases a minivan}\}], \\
    &E[f_{si} \mid \{\text{i purchases a station wagon}\}], \\
    &E[f_{si} \mid \{\text{i purchases a sport-utility}\}], \\
    &E[f_{si} \mid \{\text{i purchases a full-size van}\}].
\end{align*}


For each market create one obs. for each local agency in the market. 

### Loading data

In [27]:
product_data_store = product_data.copy() # to have a copy of the original data

product_data['x11'] = 1* (product_data['x10'] == 'SUV')
product_data['x10'] =  1*(product_data['x10'] == 'CAMIONETA')
 
#product_data['clustering_ids'] = product_data['Region'] #cluster at the market level

In [29]:
# Step 1: Map each market_id to its corresponding Region
market_region_df = car_df[['market_ids', 'Region']].drop_duplicates()

# Step 2: Count the number of observations for each region-sector pair
region_sector_counts = car_df.groupby(['Region', 'Sector', 'k']).size().reset_index(name='count')

# Step 3: Merge the counts into a combined dataset for each market_id and its region
agent_data = (
    market_region_df
    .merge(region_sector_counts, on='Region', how='left')
)

agent_data = agent_data.loc[agent_data.index.repeat(agent_data['count'])]
agent_data = agent_data.reset_index(drop=True)


unique_k = sorted(agent_data['k'].unique()) #list of different k values

dummy_vars = []
for i, k_value in enumerate(unique_k, 1):
    
    var_name = f'a{i}'
    agent_data[var_name] = (agent_data['k'] == k_value).astype(int)
    dummy_vars.append(var_name)
    
# Create the formula string
dummy_vars
agent_formulation_string =' + '.join(dummy_vars)
agent_data.drop(['Region', 'Sector'], axis=1, inplace=True)

In [30]:
agent_data['N'] = agent_data['market_ids'].value_counts()[agent_data['market_ids']].values
agent_data = agent_data.drop('count', axis=1)
agent_data['weights'] = 1 / agent_data['N']
m = 5 
for i in range(0, m ): #important: nodes have to start from 0
    agent_data[f'nodes{i}'] = np.random.chisquare(df=3, size=len(agent_data))

### setting up the problem

In [132]:
formulation_vars = new_x_names.copy() 
formulation_vars.append('x11')
formulation_vars.remove('x7')
formulation_vars.remove('x8')
formulation_vars.remove('x9')
formulation_string2 = '1 + ' + ' + '.join(formulation_vars) + ' + prices'
product_formulations = (
    pyblp.Formulation(formulation_string2), # can replcae the string by 'formulation_string'
    pyblp.Formulation('1 + x10 + x11 +  prices'),
)
product_formulations

(1 + x1 + x2 + x3 + x4 + x5 + x6 + x10 + x11 + prices, 1 + x10 + x11 + prices)

In [133]:
agent_formulation = pyblp.Formulation('1 +' + agent_formulation_string)
agent_formulation


1 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + a10 + a11 + a12 + a13 + a14 + a15 + a16 + a17 + a18 + a19

In [134]:
problem = pyblp.Problem(product_formulations, product_data, agent_formulation, agent_data, costs_type='log')
problem


Initializing the problem ...
Initialized the problem after 00:00:00.

Dimensions:
 T    N    I     K1    K2    D    MD 
---  ---  ----  ----  ----  ---  ----
74   521  3885   18    4    20    17 

Formulations:
       Column Indices:          0    1    2     3                  4                           5                           6                      7                       8                                     9                                     10                   11               12                              13                           14        15    16     17     18    19 
-----------------------------  ---  ---  ---  ------  ---------------------------  --------------------------  -------------------------  ------------------  -------------------------  -----------------------------------------------  --------------------------  ------------  ------------------  ------------------------------------------  ------------  ----  ----  ------  ----  ----
 X1: Linear Charact

Detected collinearity issues with [x5['manual'], x6['manual'], x11] and at least one other column in X1. To disable collinearity checks, set options.collinear_atol = options.collinear_rtol = 0.
Detected collinearity issues with [x5['manual'], x6['manual'], x11] and at least one other column in ZD. To disable collinearity checks, set options.collinear_atol = options.collinear_rtol = 0.


Dimensions:
 T    N    I     K1    K2    D    MD 
---  ---  ----  ----  ----  ---  ----
74   521  3885   18    4    20    17 

Formulations:
       Column Indices:          0    1    2     3                  4                           5                           6                      7                       8                                     9                                     10                   11               12                              13                           14        15    16     17     18    19 
-----------------------------  ---  ---  ---  ------  ---------------------------  --------------------------  -------------------------  ------------------  -------------------------  -----------------------------------------------  --------------------------  ------------  ------------------  ------------------------------------------  ------------  ----  ----  ------  ----  ----
 X1: Linear Characteristics     1   x1   x2     x3    x4['delantero - delantera']  x4['de

In [146]:
initial_sigma = np.diag([1,1, 1,1]) # square matrix with the dimension of X2, in our case a constant and prices. 
initial_pi = np.array(np.ones((4, 1+len(unique_k)))) # dim(x2) x dim(agent formulation)


### Setting up micro moments

In [98]:
micro_dataset = pyblp.MicroDataset(
    name="transactions",
    observations= len(agent_data),
    compute_weights=lambda t, p, a: np.ones((a.size, 1 + p.size)), #define a function that computes the weights and will be used later 
)
micro_dataset

transactions: 3885 Observations in All Markets

In [180]:
avg_outside_share = product_data[product_data['product_ids'] == 0]['shares'].mean() #average share of outside good

micro_statistics = car_df.groupby(['k', 'x10']).size().reset_index(name='count') #count of each k, x12 combination

# Add N column which is the count of k values
micro_statistics['N'] = car_df['k'].value_counts()[micro_statistics['k']].values
micro_statistics['share'] = micro_statistics['count'] / micro_statistics['N']
micro_statistics['share'] = micro_statistics['share']/(1 + avg_outside_share) #normalize by the outside share

for i, k_value in enumerate(unique_k, 1):    
    micro_statistics[f'a{i}'] = (micro_statistics['k'] == k_value).astype(int)

micro_statistics['x11'] = 1* (micro_statistics['x10'] == 'SUV')
micro_statistics['x10'] =  1*(micro_statistics['x10'] == 'CAMIONETA')


In [181]:
# Group by 'k' and calculate mean of 'precio unitario'
avg_prices = car_df.groupby('k')['Precio Unitario'].mean()
print(avg_prices)


for k, price in avg_prices.items():
    micro_statistics.loc[micro_statistics['k'] == k, 'prices'] = price
    micro_statistics = pd.concat([micro_statistics, pd.DataFrame({'k': [k], 'prices': [price]})], ignore_index=True)
 

k
0     2.257999e+07
1     1.964237e+07
2     2.280369e+07
3     2.087746e+07
4     2.555011e+07
5     2.395788e+07
6     1.659000e+07
7     1.836054e+07
8     2.295087e+07
9     2.254438e+07
10    1.955776e+07
11    2.393215e+07
12    2.268234e+07
13    1.875585e+07
14    1.842240e+07
15    2.257909e+07
16    1.729412e+07
17    2.689500e+07
18    2.427745e+07
Name: Precio Unitario, dtype: float64


let's start with only one momnet the probability of suv given a1 = 1

In [100]:

#in our case both x12 and k are categorical variables. hence we should follow the same procedure used to form the 
# moments that are in the form of E(new/income) where income is binned into categories, hence is similar to our setting. 

micro_parts_dict = {}

for _, row in micro_statistics.iterrows():
    part_name = f'E[{row["x10"]}|k={row["k"]}'  
    
    micro_parts_dict[part_name] = pyblp.MicroPart(
        name= part_name,
        dataset=micro_dataset,
        compute_values=lambda t, p, a: np.outer(a.demographics[:, row['k'] + 1], np.r_[0, p.X1[:, 7]])  # Adjust indices as needed
    )
    



In [101]:
inside_suv_a1_part = pyblp.MicroPart( #copying inside_mid_part 
    name="E[{x12 =1}  * {a1=1}]",
    dataset=micro_dataset,
    compute_values=lambda t, p, a: np.outer(a.demographics[:, 2], np.r_[0, p.X1[:, 13]]), #the 0 before X2 is because the outside option has value 0 
)

suv_a1_part= pyblp.MicroPart( #copying mid_part
    name="{a1=1}]",
    dataset=micro_dataset,
    compute_values=lambda t, p, a: np.outer(a.demographics[:, 3], np.r_[1, p.X1[:, 13]]), #note the 1 in X2 
)

In [102]:
compute_ratio = lambda v: v[0] / v[1]
compute_ratio_gradient = lambda v: [1 / v[1], -v[0] / v[1]**2]

In [103]:
micro_moments = [
    pyblp.MicroMoment(
        name="E[age_i | mi_j]",
        value=0.783, 
        parts=[inside_suv_a1_part, suv_a1_part], 
        compute_value=compute_ratio, 
        compute_gradient=compute_ratio_gradient,
    ),]

In [105]:
results = problem.solve(
    sigma=initial_sigma,
    pi=initial_pi,
    optimization=pyblp.Optimization('bfgs', {'gtol': 1e-4}),
    iteration=pyblp.Iteration('squarem', {'atol': 1e-13}),
    #se_type='clustered',
    #W_type='clustered',
    micro_moments=micro_moments,
)
results

Solving the problem ...

Micro Moments:
  Observed         Moment               Part             Dataset     Observations  Markets
-------------  ---------------  ---------------------  ------------  ------------  -------
+7.830000E-01  E[age_i | mi_j]  E[{x12 =1}  * {a1=1}]  transactions      3885        All  
                                       {a1=1}]         transactions      3885        All  

Nonlinear Coefficient Initial Values:
Sigma:        1           prices      |   Pi:          1             a1             a2             a3             a4             a5             a6             a7             a8             a9             a10            a11            a12            a13            a14            a15            a16            a17            a18            a19     
------  -------------  -------------  |  ------  -------------  -------------  -------------  -------------  -------------  -------------  -------------  -------------  -------------  -------------  ----------

The model may be under-identified. The total number of unfixed parameters is 67, which is more than the total number of moments, 25. Consider checking whether instruments were properly specified when initializing the problem, and whether parameters were properly configured when solving the problem.
Detected that the 2SLS weighting matrix is nearly singular with condition number +5.881668E+24. To disable singularity checks, set options.singular_tol = numpy.inf.
Detected that the estimated covariance matrix of aggregate GMM moments is nearly singular with condition number +6.703106E+23. To disable singularity checks, set options.singular_tol = numpy.inf.


Computed results after 00:00:02.

Starting optimization ...

GMM   Computation  Optimization   Objective   Fixed Point  Contraction  Clipped    Objective      Objective      Gradient                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
Step     Time       Iterations   Evaluations  Iterations   Evaluations  Shares       Value       Improvement       Norm                                                                 

Detected that the estimated covariance matrix of aggregate GMM moments is nearly singular with condition number +4.961012E+24. To disable singularity checks, set options.singular_tol = numpy.inf.


Computed results after 00:01:35.

Problem Results Summary:
GMM     Objective      Gradient         Hessian         Hessian     Clipped  Weighting Matrix
Step      Value          Norm       Min Eigenvalue  Max Eigenvalue  Shares   Condition Number
----  -------------  -------------  --------------  --------------  -------  ----------------
 1    +5.504648E+01  +0.000000E+00  +0.000000E+00   +0.000000E+00     443         +INF       

Starting optimization ...

GMM   Computation  Optimization   Objective   Fixed Point  Contraction  Clipped    Objective      Objective      Gradient                                                                                                                                                                                                                                                                                                                                                                                                                                

Detected that the estimated covariance matrix of aggregate GMM moments is nearly singular with condition number +4.109831E+23. To disable singularity checks, set options.singular_tol = numpy.inf.
Detected that the estimated covariance matrix of aggregate GMM moments is nearly singular with condition number +2.591848E+25. To disable singularity checks, set options.singular_tol = numpy.inf.


Problem Results Summary:
GMM     Objective      Gradient         Hessian         Hessian     Clipped  Weighting Matrix  Covariance Matrix
Step      Value          Norm       Min Eigenvalue  Max Eigenvalue  Shares   Condition Number  Condition Number 
----  -------------  -------------  --------------  --------------  -------  ----------------  -----------------
 2    +5.778962E+01  +0.000000E+00  +0.000000E+00   +0.000000E+00     443         +INF               +INF       

Cumulative Statistics:
Computation  Optimizer  Optimization   Objective   Fixed Point  Contraction
   Time      Converged   Iterations   Evaluations  Iterations   Evaluations
-----------  ---------  ------------  -----------  -----------  -----------
 00:03:10       Yes          0             5          2147         6737    

Nonlinear Coefficient Estimates (Robust SEs in Parentheses):
Sigma:         1             prices       |   Pi:           1               a1               a2               a3               a4    

### setting up micro moments 2: using the probability of buying each type of car

In [194]:
x_list = [f'x{i}' for i in range(10, 12)]
print(x_list)

a_list = [f'a{i}' for i in range(1, 20)]
print(a_list)

['x10', 'x11']
['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'a10', 'a11', 'a12', 'a13', 'a14', 'a15', 'a16', 'a17', 'a18', 'a19']


In [195]:
inside_micro_parts = {}
micro_parts = {}

for num, j in enumerate(a_list, 1): 
    micro_parts[f'{j}=1'] = pyblp.MicroPart( #copying mid_part
        name = f'E[{j}=1]',
        dataset=micro_dataset,
        compute_values=lambda t, p, a: np.outer(a.demographics[:, num], np.r_[0, p.X2[:, 0]]), 
        #in mid_part they use X2, 
    )

    for num2, i in enumerate(x_list, 1): 
        inside_micro_parts[f'E[{j}=1*{i}=1]'] = pyblp.MicroPart( #copying inside_mid_part 
            name=f'E[{j}=1*{i}=1]',
            dataset=micro_dataset,
            compute_values=lambda t, p, a: np.outer(a.demographics[:, num], np.r_[1, p.X2[:, num2]]), 
        )

In [None]:
compute_ratio = lambda v: v[0] / v[1]
compute_ratio_gradient = lambda v: [1 / v[1], -v[0] / v[1]**2]

micro_moments = []

for num, j  in enumerate(a_list, 1): 
    for num2, i in enumerate(x_list, 1): 
        try: 
            value = micro_statistics.loc[(micro_statistics[j] == 1) & (micro_statistics[i] == 1), 'share'].iloc[0]
            micro_moments.append(
                pyblp.MicroMoment(
                    name=f'E[{i}=1|{j}=1]',
                    value=value, 
                    parts= [inside_micro_parts[f'E[{j}=1*{i}=1]'], micro_parts[f'{j}=1'] ], 
                    compute_value=compute_ratio,
                    compute_gradient=compute_ratio_gradient,
                )
            )
            print('works', j, i)
        except: 
            continue

works a1 x10
works a1 x11
works a2 x10
works a3 x10
works a4 x10
works a4 x11
works a5 x10
works a5 x11
works a6 x10
works a6 x11
works a7 x11
works a8 x10
works a8 x11
works a9 x10
works a9 x11
works a10 x10
works a10 x11
works a11 x10
works a11 x11
works a12 x10
works a12 x11
works a13 x10
works a13 x11
works a14 x10
works a15 x10
works a16 x10
works a17 x10
works a18 x10
works a19 x10
works a19 x11


In [None]:
results = problem.solve(
    sigma=initial_sigma,
    pi=initial_pi,
    optimization=pyblp.Optimization('bfgs', {'gtol': 1e-4}),
    iteration=pyblp.Iteration('squarem', {'atol': 1e-13}),
    #se_type='clustered',
    #W_type='clustered',
    micro_moments=micro_moments,
)
results

Solving the problem ...

Micro Moments:
  Observed         Moment           Part         Dataset     Observations  Markets
-------------  --------------  --------------  ------------  ------------  -------
+6.832086E-01  E[x10=1|a1=1]   E[a1=1*x10=1]   transactions      3885        All  
                                  E[a1=1]      transactions      3885        All  
+8.989586E-02  E[x11=1|a1=1]   E[a1=1*x11=1]   transactions      3885        All  
                                  E[a1=1]      transactions      3885        All  
+7.731044E-01  E[x10=1|a2=1]   E[a2=1*x10=1]   transactions      3885        All  
                                  E[a2=1]      transactions      3885        All  
+7.731044E-01  E[x10=1|a3=1]   E[a3=1*x10=1]   transactions      3885        All  
                                  E[a3=1]      transactions      3885        All  
+5.911975E-01  E[x10=1|a4=1]   E[a4=1*x10=1]   transactions      3885        All  
                                  E[a4=1]      

The model may be under-identified. The total number of unfixed parameters is 102, which is more than the total number of moments, 47. Consider checking whether instruments were properly specified when initializing the problem, and whether parameters were properly configured when solving the problem.
Detected that the 2SLS weighting matrix is nearly singular with condition number +9.328032E+25. To disable singularity checks, set options.singular_tol = numpy.inf.
Detected that the estimated covariance matrix of micro moments is nearly singular with condition number +5.961714E+146. To disable singularity checks, set options.singular_tol = numpy.inf.
Detected that the estimated covariance matrix of aggregate GMM moments is nearly singular with condition number +4.415895E+23. To disable singularity checks, set options.singular_tol = numpy.inf.


Computed results after 00:00:12.

Starting optimization ...

GMM   Computation  Optimization   Objective   Fixed Point  Contraction  Clipped    Objective      Objective      Gradient                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

Detected that the estimated covariance matrix of micro moments is nearly singular with condition number +5.961714E+146. To disable singularity checks, set options.singular_tol = numpy.inf.
Detected that the estimated covariance matrix of aggregate GMM moments is nearly singular with condition number +1.235280E+24. To disable singularity checks, set options.singular_tol = numpy.inf.


Computed results after 00:17:41.

Problem Results Summary:
GMM     Objective      Gradient         Hessian         Hessian     Clipped  Weighting Matrix
Step      Value          Norm       Min Eigenvalue  Max Eigenvalue  Shares   Condition Number
----  -------------  -------------  --------------  --------------  -------  ----------------
 1    +5.664170E+01  +0.000000E+00  +0.000000E+00   +0.000000E+00     444         +INF       

Starting optimization ...

GMM   Computation  Optimization   Objective   Fixed Point  Contraction  Clipped    Objective      Objective      Gradient                                                                                                                                                                                                                                                                                                                                                                                                                                

Detected that the estimated covariance matrix of micro moments is nearly singular with condition number +5.961714E+146. To disable singularity checks, set options.singular_tol = numpy.inf.


### setting up micro moments 3: using the probability of buying each type of car and the average price paid. 

In [178]:
inside_micro_parts = {}
inside_micro_price = {} # dictionary to store the numerator of the price moments
micro_parts = {}

for num, j in enumerate(a_list, 1): 
    micro_parts[f'{j}=1'] = pyblp.MicroPart( #copying mid_part
        name = f'E[{j}=1]',
        dataset=micro_dataset,
        compute_values=lambda t, p, a: np.outer(a.demographics[:, num], np.r_[0, p.X2[:, 0]]), 
        #in mid_part they use X2, 
    )

    inside_micro_price[f'E[price*{j}=1]'] = pyblp.MicroPart( #copying inside_mid_part 
        name=f'E[{j}=1*price]',
        dataset=micro_dataset,
        compute_values=lambda t, p, a: np.outer(a.demographics[:, num], np.r_[1, p.X2[:, 3]]), 
        )
    
    for num2, i in enumerate(x_list, 1): 
        inside_micro_parts[f'E[{j}=1*{i}=1]'] = pyblp.MicroPart( #copying inside_mid_part 
            name=f'E[{j}=1*{i}=1]',
            dataset=micro_dataset,
            compute_values=lambda t, p, a: np.outer(a.demographics[:, num], np.r_[1, p.X2[:, num2]]), 
        )

In [182]:
compute_ratio = lambda v: v[0] / v[1]
compute_ratio_gradient = lambda v: [1 / v[1], -v[0] / v[1]**2]

micro_moments = []

for num, j  in enumerate(a_list, 1): 
    try: 
        value = micro_statistics.loc[ (micro_statistics[i] == 1), 'prices'].iloc[0]
        micro_moments.append(
            pyblp.MicroMoment(
                name=f'E[prices|{j}=1]',
                value = value,
                
        )

        
    except: 
        continue 



    for num2, i in enumerate(x_list, 1): 
        try: 
            value = micro_statistics.loc[(micro_statistics[j] == 1) & (micro_statistics[i] == 1), 'share'].iloc[0]
            micro_moments.append(
                pyblp.MicroMoment(
                    name=f'E[{i}=1|{j}=1]',
                    value=value, 
                    parts= [inside_micro_parts[f'E[{j}=1*{i}=1]'], micro_parts[f'{j}=1'] ], 
                    compute_value=compute_ratio,
                    compute_gradient=compute_ratio_gradient,
                )
            )
            print('works', j, i)
        except: 
            continue

IndentationError: expected an indented block after 'try' statement on line 7 (520269623.py, line 9)

In [None]:
results = problem.solve(
    sigma=initial_sigma,
    pi=initial_pi,
    optimization=pyblp.Optimization('bfgs', {'gtol': 1e-4}),
    iteration=pyblp.Iteration('squarem', {'atol': 1e-13}),
    #se_type='clustered',
    #W_type='clustered',
    micro_moments=micro_moments,
)
results

# display

In [None]:
print(car_df.columns)


In [None]:
show(product_data_store)

PandasGUI INFO — pandasgui.gui — Opening PandasGUI


<pandasgui.gui.PandasGui at 0x1ea9e402290>

In [55]:
show(micro_dataset)

PandasGUI INFO — pandasgui.gui — Opening PandasGUI


<pandasgui.gui.PandasGui at 0x24ec06932e0>

In [51]:
show(micro_statistics)

PandasGUI INFO — pandasgui.gui — Opening PandasGUI


<pandasgui.gui.PandasGui at 0x24ec0150550>

In [28]:
show(product_data)

PandasGUI INFO — pandasgui.gui — Opening PandasGUI


<pandasgui.gui.PandasGui at 0x24eadc4a9e0>

In [None]:
show(car_df[['Rut Unidad de Compra','market_ids', 'product_ids', 'k'] ])

In [48]:
print(micro_parts_dict)

{'E[CAMIONETA|k=0': E[CAMIONETA|k=0 on transactions: 3885 Observations in All Markets, 'E[SUV|k=0': E[SUV|k=0 on transactions: 3885 Observations in All Markets, 'E[CAMIONETA|k=1': E[CAMIONETA|k=1 on transactions: 3885 Observations in All Markets, 'E[CAMIONETA|k=2': E[CAMIONETA|k=2 on transactions: 3885 Observations in All Markets, 'E[CAMIONETA|k=3': E[CAMIONETA|k=3 on transactions: 3885 Observations in All Markets, 'E[SUV|k=3': E[SUV|k=3 on transactions: 3885 Observations in All Markets, 'E[CAMIONETA|k=4': E[CAMIONETA|k=4 on transactions: 3885 Observations in All Markets, 'E[SUV|k=4': E[SUV|k=4 on transactions: 3885 Observations in All Markets, 'E[CAMIONETA|k=5': E[CAMIONETA|k=5 on transactions: 3885 Observations in All Markets, 'E[SUV|k=5': E[SUV|k=5 on transactions: 3885 Observations in All Markets, 'E[SUV|k=6': E[SUV|k=6 on transactions: 3885 Observations in All Markets, 'E[CAMIONETA|k=7': E[CAMIONETA|k=7 on transactions: 3885 Observations in All Markets, 'E[SUV|k=7': E[SUV|k=7 on t