In [45]:
import pandas as pd
from pandasgui import show
import os

# create the root path 
current_dir = os.getcwd()
base_path = os.path.join('raw_data', 'OrdenesCompra')

#joint the files for a single year. 
def join_data(base_path, year): 
    # Create an empty list to store DataFrames
    dfs = []

    for year in range(year, year+1 ):
        for month in range(1, 13):
            # Construct the file path
            file_path = os.path.join(base_path, str(year), f"{year}-{month}.csv")
            
            # Check if the file exists before trying to read it
            if os.path.exists(file_path):
                try:
                    df = pd.read_csv(file_path, encoding='ISO-8859-1', sep=';')
                    
                    # Add year and month columns for reference
                    df['Year'] = year
                    df['Month'] = month
                    
                    # Append the DataFrame to the list
                    dfs.append(df)
                    print(f"Successfully read file: {file_path}")
                except Exception as e:
                    print(f"Error reading file {file_path}: {str(e)}")
            else:
                print(f"File not found: {file_path}")

    # Concatenate all DataFrames in the list
    combined_df = pd.concat(dfs, ignore_index=True)

    # Display basic information about the combined DataFrame
    print(combined_df.info())
    return combined_df

#describe the data
def descriptive_stats(df): 
    print('This is the descriptive statistics function')
    print(df.columns) 

    # Create a pivot table to tabulate the data by year and month
    pivot_table = df.pivot_table(index='Year', columns='Month', aggfunc='size', fill_value=0)

    # Display the pivot table
    print(pivot_table)

    df['ProcedenciaOC'].unique()
    df['precioNeto'] = pd.to_numeric(df['precioNeto'], errors='coerce')
    df = df.dropna(subset=['precioNeto'])
    total_weight = df['precioNeto'].sum()

    convenio_marco_weight = df[df['ProcedenciaOC'] == 'Proveniente de convenio marco']['precioNeto'].sum()
    share_convenio_marco = convenio_marco_weight / total_weight
    print('share of total expenditure in FAs:', share_convenio_marco)

#drop purchases that do not come from FAs or that are not from vehicles. 
def keep_FA_vehicles(df): 
    #remove purchases that are not trough FA
    df = df[df['ProcedenciaOC'] == 'Proveniente de convenio marco']


    ## see which variables are give you information of the type of product being bougth 
    varlist = [ 'Categoria',  'NombreroductoGenerico', 'RubroN1', 'RubroN2', 'RubroN3']
    #for var in varlist:
        #print('The var is: ', var)
        #print(df[var].unique())


    # create a df with the rows that could car purchases 
    columns_to_check = ['Categoria', 'NombreroductoGenerico', 'RubroN1', 'RubroN2', 'RubroN3']
    mask = df[columns_to_check].apply(lambda x: x.str.contains('Vehículos|vehículos', case=False, na=False)).any(axis=1)
    df = df[mask]

    return df

# show the columns that have relevant information 
def show_df(df): 

    # List of columns we care about 
    columns_to_check = ['ID', 'Codigo', 'Link', 'Nombre', 'Descripcion/Obervaciones', 'MontoTotalOC', 'MontoTotalOC_PesosChilenos', 'Impuestos','TotalNetoOC', 'CodigoProveedor',   'RegionProveedor',  'IDItem', 'codigoCategoria', 'Categoria', 'codigoProductoONU', 'NombreroductoGenerico', 'RubroN1', 'RubroN2', 'RubroN3', 'EspecificacionComprador', 'EspecificacionProveedor', 'cantidad',  'precioNeto', 'totalCargos','totalDescuentos', 'totalImpuestos', 'totalLineaNeto']

    df_show = df[columns_to_check]
    show(df_show)

#save the dataframe, otherwise one would have to use the join_data function each time
def save_df(df_light, year):    
    try:
        os.makedirs('interm_data', exist_ok=True)
        save_path = os.path.join('interm_data', 'search_CM', f'OC_{year}.csv')
        
        # If successful, now try to save your DataFrame
        df_light.to_csv(save_path, index=False)
        print(f"Successfully saved DataFrame to: {save_path}")

    except PermissionError as e:
        print(f"Permission error: {e}")
        print("You don't have permission to write to this location.")
    except FileNotFoundError as e:
        print(f"File not found error: {e}")
        print("The specified path doesn't exist and couldn't be created.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# summarize the amounts of the purchases and transforms amount variables to int, input: dictionary with dataframes
def summarize_amounts(df_dict): 
    for year, df in df_dict.items():
        print(f"Year: {year}")
        
        #get sense of price and its dispersion 
        df['MontoTotalOC'] = pd.to_numeric(df['MontoTotalOC'], errors='coerce')
        summary = df['MontoTotalOC'].describe()
        print(summary)

        df['precioNeto'] = pd.to_numeric(df['precioNeto'], errors='coerce')
        summary = df['precioNeto'].describe()
        print(summary)

        df_dict[year] = df
    return df_dict

# Find the values of 'Codigo' that are repeated more than N times
def repeated_codes(df_dict, N = 3): 
    for year, df in df_dict.items():
        print(f"Year: {year}")
        repeated_codigos = df['Codigo'].value_counts()
        repeated_codigos = repeated_codigos[repeated_codigos > N]
        print(repeated_codigos)
       
    return 


years = range(2016, 2024)
df_dict = {}  # Initialize an empty dictionary



for year in years:
    exists = False 

    combined_file_path = os.path.join(current_dir,'interm_data', 'search_CM', f'OC_{year}.csv')
    if os.path.exists(combined_file_path):
        print(f"Combined file for {year} already exists: {combined_file_path}")
        exists = True 

        df_light = pd.read_csv(combined_file_path)
        print(f'read df {year}, path {combined_file_path}')
    
    if exists == False: 

        df = join_data(base_path, year)

        descriptive_stats(df)
        df_light = keep_FA_vehicles(df)
        print(df_light.info())

        # Save the dataframe
        save_df(df_light, year)
    
    # Store the dataframe in the dictionary with the year as the key
    df_dict[year] = df_light

Combined file for 2016 already exists: c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2016.csv


  df_light = pd.read_csv(combined_file_path)


read df 2016, path c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2016.csv
Combined file for 2017 already exists: c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2017.csv
read df 2017, path c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2017.csv
Combined file for 2018 already exists: c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2018.csv


  df_light = pd.read_csv(combined_file_path)


read df 2018, path c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2018.csv
Combined file for 2019 already exists: c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2019.csv


  df_light = pd.read_csv(combined_file_path)


read df 2019, path c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2019.csv
Combined file for 2020 already exists: c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2020.csv
read df 2020, path c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2020.csv
Combined file for 2021 already exists: c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2021.csv
read df 2021, path c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2021.csv
Combined file for 2022 already exists: c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2022.csv
read df 2022, path c:\Users\lucas\OneDrive - Yale University\Documents\GitHub\2nd-year-paper\interm_data\search_CM\OC_2022.csv
Combined file for 2023 already exists: c:\Users\luc

  df_light = pd.read_csv(combined_file_path)


In [46]:
## see which variables give you information of the type of product being bougth 
varlist = [ 'Categoria',  'NombreroductoGenerico', 'RubroN1', 'RubroN2', 'RubroN3']
    
for year, df in df_dict.items():
    print(f"Year: {year}")
    
    for var in varlist:
        print('The var is: ', var)
        print(df_light[var].unique())

    df = df[(df['RubroN2'] == 'Vehículos motorizados')& (df['NombreroductoGenerico'] != 'Camiones de transporte')]
    print(df.info())

    df_dict[year] = df

Year: 2016
The var is:  Categoria
['Vehículos y equipamiento en general / Vehículos motorizados / Vehículos para pasajeros'
 'Vehículos y equipamiento en general / Vehículos motorizados / Vehículos de emergencia'
 'Vehículos y equipamiento en general / Vehículos motorizados / Vehículos para turismo'
 'Vehículos y equipamiento en general / Vehículos motorizados / Vehículos especiales y de recreación'
 'Vehículos y equipamiento en general / Vehículos motorizados / Bicicletas con motor']
The var is:  NombreroductoGenerico
['SUV o todo terrenos'
 'Camiones ligeros o vehículos de deporte de utilida' 'Ambulancias'
 'Automóviles' 'Trineos motorizados o motos de nieve'
 'Escuters o motos pequeñas']
The var is:  RubroN1
['Vehículos y equipamiento en general']
The var is:  RubroN2
['Vehículos motorizados']
The var is:  RubroN3
['Vehículos para pasajeros' 'Vehículos de emergencia'
 'Vehículos para turismo' 'Vehículos especiales y de recreación'
 'Bicicletas con motor']
<class 'pandas.core.frame.D

In [47]:
df_dict = summarize_amounts(df_dict)

Year: 2016
count    1.550000e+02
mean     1.002289e+08
std      1.553469e+08
min      1.249500e+04
25%      5.715570e+05
50%      5.399000e+07
75%      1.406164e+08
max      9.930134e+08
Name: MontoTotalOC, dtype: float64
count    8.830000e+02
mean     1.525781e+07
std      1.293838e+07
min      2.883000e+03
25%      7.146430e+06
50%      1.405134e+07
75%      1.802861e+07
max      9.431842e+07
Name: precioNeto, dtype: float64
Year: 2017
count    2.270000e+02
mean     1.085645e+08
std      1.711052e+08
min      7.000000e+03
25%      3.302190e+06
50%      3.705017e+07
75%      1.344164e+08
max      1.057077e+09
Name: MontoTotalOC, dtype: float64
count    1.114000e+03
mean     1.420325e+07
std      1.138950e+07
min      2.479000e+03
25%      8.649521e+06
50%      1.419249e+07
75%      1.789000e+07
max      1.182000e+08
Name: precioNeto, dtype: float64
Year: 2018
count    5.560000e+02
mean     4.325683e+07
std      1.129131e+08
min      4.498200e+04
25%      1.549772e+07
50%      2.027397

In [48]:
repeated_codes(df_dict, N = 3)

Year: 2016
Codigo
3959-7-SE16       23
586-497-CM16       7
612-2317-CM16      4
4728-1743-CM16     4
586-1423-CM16      4
3959-15-SE16       4
Name: count, dtype: int64
Year: 2017
Codigo
918434-343-CM17    15
701-1622-CM17      12
612-14-CM17        11
904268-119-SE17     9
587-1843-CM17       7
1596-275-CM17       4
612-2817-SE17       4
1591-380-CM17       4
Name: count, dtype: int64
Year: 2018
Codigo
612-72-CM18       27
587-2780-CM18      6
4629-1271-CM18     6
5186-285-CM18      5
731-1559-CM18      4
5693-267-SE18      4
708-1011-CM18      4
Name: count, dtype: int64
Year: 2019
Codigo
612-153-SE19       26
4629-1204-CM19     12
585-211-CM19        7
587-1745-CM19       6
1080094-7-SE19      5
587-2528-CM19       5
593951-549-CM19     4
3686-3-CM19         4
1080094-61-SE19     4
1080095-10-SE19     4
587-2950-CM19       4
3422-109-SE19       4
731-1265-CM19       4
2084-453-SE19       4
1591-319-CM19       4
Name: count, dtype: int64
Year: 2020
Codigo
593951-231-CM20     8
10987