In [47]:
import pandas as pd
import numpy as np
import string
import re
import os
from collections import defaultdict

In [29]:
# Load the five per-model CSVs (aic, bic, cv, press, r2a), one DataFrame each.
df_aic, df_bic, df_cv, df_press, df_r2a = map(
    pd.read_csv,
    ('aic.csv', 'bic.csv', 'cv.csv', 'press.csv', 'r2a.csv'),
)

In [30]:
# Inspect the table loaded from aic.csv.
df_aic

Unnamed: 0.1,Unnamed: 0,0
0,Intercept,-3.034394e+06
1,C(Manufacturer)[T.AUDI],3.756034e+03
2,C(Manufacturer)[T.BMW],1.310446e+04
3,C(Manufacturer)[T.BUICK],1.145679e+03
4,C(Manufacturer)[T.CADILLAC],7.885624e+03
...,...,...
65,C(Drive_wheels)[T.Rear],2.747306e+03
66,Prod_year,1.512949e+03
67,Engine_Volume,1.700510e+03
68,Cylinders,1.296979e+03


In [31]:
# Inspect the table loaded from bic.csv.
df_bic

Unnamed: 0.1,Unnamed: 0,0
0,Intercept,-2725383.0
1,C(Fuel_type)[T.Diesel],2358.172
2,C(Fuel_type)[T.Hybrid],-8030.168
3,C(Fuel_type)[T.LPG],-7495.083
4,C(Fuel_type)[T.Petrol],-4127.678
5,C(Fuel_type)[T.Plug-in Hybrid],2597.03
6,C(HasTurbo)[T.1],7469.564
7,C(Gear_box_type)[T.Manual],5468.235
8,C(Gear_box_type)[T.Tiptronic],11384.13
9,C(Gear_box_type)[T.Variator],6809.069


In [None]:
# Problem 1: every df has an identical first label, 'Intercept', so the intercepts
# would be indistinguishable once the dfs are combined. Give each one a unique name.
# A single .loc[row, column] assignment is used instead of chained indexing
# (df[col][row] = ...): pandas warns that the chained form may write to a temporary
# copy, and under Copy-on-Write it does not modify the frame at all.
df_aic.loc[0, 'Unnamed: 0'] = 'Intercept_aic'
df_bic.loc[0, 'Unnamed: 0'] = 'Intercept_bic'
df_cv.loc[0, 'Unnamed: 0'] = 'Intercept_cv'
df_press.loc[0, 'Unnamed: 0'] = 'Intercept_press'
df_r2a.loc[0, 'Unnamed: 0'] = 'Intercept_r2a'

In [42]:
# Re-display df_aic to confirm its first label now reads 'Intercept_aic'.
df_aic

Unnamed: 0.1,Unnamed: 0,0
0,Intercept_aic,-3.034394e+06
1,C(Manufacturer)[T.AUDI],3.756034e+03
2,C(Manufacturer)[T.BMW],1.310446e+04
3,C(Manufacturer)[T.BUICK],1.145679e+03
4,C(Manufacturer)[T.CADILLAC],7.885624e+03
...,...,...
65,C(Drive_wheels)[T.Rear],2.747306e+03
66,Prod_year,1.512949e+03
67,Engine_Volume,1.700510e+03
68,Cylinders,1.296979e+03


In [43]:
# Display df_r2a to confirm its first label now reads 'Intercept_r2a'.
df_r2a

Unnamed: 0.1,Unnamed: 0,0
0,Intercept_r2a,-3.034394e+06
1,C(Manufacturer)[T.AUDI],3.756034e+03
2,C(Manufacturer)[T.BMW],1.310446e+04
3,C(Manufacturer)[T.BUICK],1.145679e+03
4,C(Manufacturer)[T.CADILLAC],7.885624e+03
...,...,...
65,C(Drive_wheels)[T.Rear],2.747306e+03
66,Prod_year,1.512949e+03
67,Engine_Volume,1.700510e+03
68,Cylinders,1.296979e+03


In [45]:
# Joining CSV-backed dfs that share the same columns: stack them row-wise.
# ignore_index=True renumbers the combined rows from 0.
df_aic_bic = pd.concat(objs=(df_aic, df_bic), axis=0, ignore_index=True)
df_aic_bic

Unnamed: 0.1,Unnamed: 0,0
0,Intercept_aic,-3.034394e+06
1,C(Manufacturer)[T.AUDI],3.756034e+03
2,C(Manufacturer)[T.BMW],1.310446e+04
3,C(Manufacturer)[T.BUICK],1.145679e+03
4,C(Manufacturer)[T.CADILLAC],7.885624e+03
...,...,...
78,C(Gear_box_type)[T.Tiptronic],1.138413e+04
79,C(Gear_box_type)[T.Variator],6.809069e+03
80,Prod_year,1.363534e+03
81,Engine_Volume,2.881329e+03


In [46]:
# Stack the tables from all five models into one frame, renumbering rows from 0.
df_all = pd.concat(
    objs=(df_aic, df_bic, df_cv, df_press, df_r2a),
    axis=0,
    ignore_index=True,
)
df_all

Unnamed: 0.1,Unnamed: 0,0
0,Intercept_aic,-3.034394e+06
1,C(Manufacturer)[T.AUDI],3.756034e+03
2,C(Manufacturer)[T.BMW],1.310446e+04
3,C(Manufacturer)[T.BUICK],1.145679e+03
4,C(Manufacturer)[T.CADILLAC],7.885624e+03
...,...,...
290,C(Drive_wheels)[T.Rear],2.747306e+03
291,Prod_year,1.512949e+03
292,Engine_Volume,1.700510e+03
293,Cylinders,1.296979e+03


In [56]:
# Two helper functions for re-encoding the first column of our dfs.

# map_dict builds a map from each unique value in the first column of the input df to a unique integer
def map_dict(df):
    """
    Build an integer code for every distinct value in the first column of ``df``.

    Codes are handed out in order of first appearance, starting at 0.

    :param df: DataFrame whose first column (by position) supplies the keys
    :return: Plain dict of {value: integer code}
    """
    codes = {}
    for label in df.iloc[:, 0].unique():
        # setdefault only assigns the next free integer to labels not seen yet
        codes.setdefault(label, len(codes))
    return codes


# mapped_df applies such a map to the first column of a df, replacing each value with its integer
def mapped_df(df, map_dict):
    """
    Replace the values in a DataFrame's first column with their integer codes.

    :param df: Either a DataFrame, whose first column (by position) is mapped on a
        copy so the input is left untouched, or a string. A string is used as the
        column name of a new one-column DataFrame built from the keys of ``map_dict``.
    :param map_dict: Dictionary mapping column values to integers
    :return: New DataFrame whose first column holds the mapped integers; values
        with no entry in ``map_dict`` become NaN
    :raises ValueError: If ``df`` is neither a DataFrame nor a string
    """
    if isinstance(df, pd.DataFrame):
        # Work on a copy so the caller's frame keeps its original labels
        df_copy = df.copy()
        column_name = df_copy.columns[0]
    elif isinstance(df, str):
        column_name = df
        # Materialise the keys as a list: a dict_keys view is set-like rather than
        # a sequence, and older pandas releases may reject it as unordered input.
        df_copy = pd.DataFrame({column_name: list(map_dict)})
    else:
        raise ValueError("Input must be a DataFrame or a column name (string)")

    # Apply mapping
    df_copy[column_name] = df_copy[column_name].map(map_dict)

    # Anything missing from map_dict shows up as NaN after .map(); report the count
    unmapped = df_copy[column_name].isna().sum()
    if unmapped > 0:
        print(f"Warning: {unmapped} values could not be mapped.")

    return df_copy

In [51]:
# Build one comprehensive map from df_all covering ALL of the predictors in our 5 models, so a
# given predictor label gets the same integer whichever df it appears in.
# There are 93 unique predictors in our best models, counting the intercepts.

complete_map = map_dict(df_all)
complete_map

{'Intercept_aic': 0,
 'C(Manufacturer)[T.AUDI]': 1,
 'C(Manufacturer)[T.BMW]': 2,
 'C(Manufacturer)[T.BUICK]': 3,
 'C(Manufacturer)[T.CADILLAC]': 4,
 'C(Manufacturer)[T.CHEVROLET]': 5,
 'C(Manufacturer)[T.CHRYSLER]': 6,
 'C(Manufacturer)[T.CITROEN]': 7,
 'C(Manufacturer)[T.DAEWOO]': 8,
 'C(Manufacturer)[T.DAIHATSU]': 9,
 'C(Manufacturer)[T.DODGE]': 10,
 'C(Manufacturer)[T.FIAT]': 11,
 'C(Manufacturer)[T.FORD]': 12,
 'C(Manufacturer)[T.GAZ]': 13,
 'C(Manufacturer)[T.GMC]': 14,
 'C(Manufacturer)[T.HONDA]': 15,
 'C(Manufacturer)[T.HUMMER]': 16,
 'C(Manufacturer)[T.HYUNDAI]': 17,
 'C(Manufacturer)[T.INFINITI]': 18,
 'C(Manufacturer)[T.JAGUAR]': 19,
 'C(Manufacturer)[T.JEEP]': 20,
 'C(Manufacturer)[T.KIA]': 21,
 'C(Manufacturer)[T.LANCIA]': 22,
 'C(Manufacturer)[T.LAND ROVER]': 23,
 'C(Manufacturer)[T.LEXUS]': 24,
 'C(Manufacturer)[T.LINCOLN]': 25,
 'C(Manufacturer)[T.MAZDA]': 26,
 'C(Manufacturer)[T.MERCEDES-BENZ]': 27,
 'C(Manufacturer)[T.MINI]': 28,
 'C(Manufacturer)[T.MITSUBISHI]': 29,


In [57]:
# Re-encode the first column of each of our 5 dfs with the shared map.

df_aic_mapped, df_bic_mapped, df_cv_mapped, df_press_mapped, df_r2a_mapped = (
    mapped_df(model_df, complete_map)
    for model_df in (df_aic, df_bic, df_cv, df_press, df_r2a)
)

In [68]:
# I don't like permanently changing global settings, but I want to see all rows in each df. Here are functions that change the display options temporarily and then restore them.

from contextlib import contextmanager

@contextmanager
def expanded_display(max_rows=None, max_columns=None, width=None):
    """
    Context manager for temporarily overriding pandas display options.

    On exit the three options are reset to the values they had on entry, whether
    the body finished normally, raised, or one of the new values was rejected.

    :param max_rows: Value for 'display.max_rows' inside the block
    :param max_columns: Value for 'display.max_columns' inside the block
    :param width: Value for 'display.width' inside the block
    """
    requested = {
        'display.max_rows': max_rows,
        'display.max_columns': max_columns,
        'display.width': width,
    }
    # Snapshot before touching anything so the finally block can always restore
    originals = {option: pd.get_option(option) for option in requested}

    try:
        # Setting the options inside the try means that if a later value is
        # rejected, the options already changed are still rolled back.
        for option, value in requested.items():
            pd.set_option(option, value)
        yield
    finally:
        for option, value in originals.items():
            pd.set_option(option, value)

def print_full_df(df):
    """
    Print ``df`` in full, with the row and column display limits raised to its size.
    """
    n_rows = len(df)
    n_cols = len(df.columns)
    with expanded_display(max_rows=n_rows, max_columns=n_cols, width=None):
        print(df)

In [69]:
# Print every row of the mapped AIC table (first column now holds integer codes).
print_full_df(df_aic_mapped)

    Unnamed: 0             0
0            0 -3.034394e+06
1            1  3.756034e+03
2            2  1.310446e+04
3            3  1.145679e+03
4            4  7.885624e+03
5            5  6.307939e+03
6            6  1.307239e+04
7            7  6.524802e+03
8            8  7.969194e+03
9            9  6.475181e+01
10          10  2.616541e+03
11          11  5.411470e+03
12          12  1.342678e+04
13          13  1.284810e+05
14          14  6.605445e+03
15          15  1.115127e+04
16          16 -5.351336e+02
17          17  1.515882e+04
18          18  1.339396e+04
19          19 -3.026176e+03
20          20  1.184756e+04
21          21  6.838654e+03
22          22  1.151056e+04
23          23  3.366417e+04
24          24  8.589899e+03
25          25  1.643995e+04
26          26  7.827457e+03
27          27  1.501927e+04
28          28  1.291048e+04
29          29  1.199342e+04
30          30  3.113720e+03
31          31  1.484101e+04
32          32  7.132070e+03
33          33

In [70]:
# Print every row of the mapped BIC table (first column now holds integer codes).
print_full_df(df_bic_mapped)

    Unnamed: 0             0
0           70 -2.725383e+06
1           55  2.358172e+03
2           56 -8.030168e+03
3           57 -7.495083e+03
4           58 -4.127678e+03
5           59  2.597030e+03
6           60  7.469564e+03
7           61  5.468235e+03
8           62  1.138413e+04
9           63  6.809069e+03
10          66  1.363534e+03
11          67  2.881329e+03
12          69 -8.528417e+02


In [71]:
# Print every row of the mapped CV table (first column now holds integer codes).
print_full_df(df_cv_mapped)

    Unnamed: 0             0
0           71 -2.999863e+06
1            1  3.045043e+03
2            2  1.224378e+04
3            3 -2.878595e+02
4            4  7.504806e+03
5            5  5.283607e+03
6            6  1.314586e+04
7            7  5.643932e+03
8            8  6.825441e+03
9            9 -1.619051e+03
10          10  2.798014e+03
11          11  4.265517e+03
12          12  1.301816e+04
13          13  1.273609e+05
14          14  6.395655e+03
15          15  1.032078e+04
16          16 -4.363651e+02
17          17  1.434927e+04
18          18  1.384954e+04
19          19 -3.154828e+03
20          20  1.198328e+04
21          21  6.283683e+03
22          22  8.952539e+03
23          23  3.256335e+04
24          24  8.761023e+03
25          25  1.652941e+04
26          26  7.736710e+03
27          27  1.448707e+04
28          28  1.212801e+04
29          29  1.132975e+04
30          30  2.151168e+03
31          31  1.401399e+04
32          32  6.452049e+03
33          33

In [74]:
# Print every row of the mapped PRESS table (first column now holds integer codes).
print_full_df(df_press_mapped)

    Unnamed: 0             0
0           73 -3.048711e+06
1            1 -3.016432e+03
2            2  8.802297e+03
3            3 -2.612100e+02
4            4  2.329693e+03
5            5  3.013890e+03
6            6  9.253226e+03
7            7  1.661671e+03
8            8  3.090126e+03
9            9  2.528370e+03
10          10  1.138852e+03
11          11 -1.859085e+02
12          12  8.041370e+03
13          13  1.218125e+05
14          14  4.932300e+02
15          15  6.071674e+03
16          16 -2.012088e+03
17          17  1.260139e+04
18          18  9.350454e+03
19          19 -8.751005e+03
20          20  1.248911e+04
21          21  2.342367e+03
22          22  1.213735e+04
23          23  3.201047e+04
24          24  4.421354e+03
25          25  1.702413e+04
26          26  6.847804e+03
27          27  1.116714e+04
28          28  1.001128e+04
29          29  1.124578e+04
30          30  5.404496e+02
31          31  1.033733e+04
32          32  6.953319e+03
33          33

In [75]:
# Print every row of the mapped R2A table (first column now holds integer codes).
print_full_df(df_r2a_mapped)

    Unnamed: 0             0
0           92 -3.034394e+06
1            1  3.756034e+03
2            2  1.310446e+04
3            3  1.145679e+03
4            4  7.885624e+03
5            5  6.307939e+03
6            6  1.307239e+04
7            7  6.524802e+03
8            8  7.969194e+03
9            9  6.475181e+01
10          10  2.616541e+03
11          11  5.411470e+03
12          12  1.342678e+04
13          13  1.284810e+05
14          14  6.605445e+03
15          15  1.115127e+04
16          16 -5.351336e+02
17          17  1.515882e+04
18          18  1.339396e+04
19          19 -3.026176e+03
20          20  1.184756e+04
21          21  6.838654e+03
22          22  1.151056e+04
23          23  3.366417e+04
24          24  8.589899e+03
25          25  1.643995e+04
26          26  7.827457e+03
27          27  1.501927e+04
28          28  1.291048e+04
29          29  1.199342e+04
30          30  3.113720e+03
31          31  1.484101e+04
32          32  7.132070e+03
33          33

In [78]:
# Save all mapped df
import inspect

def save_df_to_csv(df, default_name='dataframe'):
    """
    Save ``df`` to a CSV file named after the caller's variable that holds it.

    The caller's namespace is searched for a name bound to this exact object. A
    leading "df_" is dropped and ".csv" appended, so ``df_aic_mapped`` is written
    to "aic_mapped.csv". If several names refer to the object, the first one found
    is used.

    :param df: DataFrame to save
    :param default_name: Name to use when no variable in the caller refers to
        ``df`` (for example when an expression is passed directly)
    :return: None; the file is written under a relative filename, without the index
    """
    caller = inspect.currentframe().f_back
    try:
        variable_names = [name for name, value in caller.f_locals.items() if value is df]
    finally:
        # Drop the frame reference promptly to avoid keeping a reference cycle alive
        del caller

    # Fall back to default_name instead of failing when the object has no name
    df_name = variable_names[0] if variable_names else default_name

    # Remove the "df_" prefix and create the filename
    if df_name.startswith("df_"):
        filename = f"{df_name[3:]}.csv"
    else:
        filename = f"{df_name}.csv"

    # Save the DataFrame to CSV
    df.to_csv(filename, index=False)

# Each frame is passed by its own variable name on purpose: save_df_to_csv derives
# the output filename from that name (e.g. df_aic_mapped -> aic_mapped.csv).
save_df_to_csv(df_aic_mapped)
save_df_to_csv(df_bic_mapped)
save_df_to_csv(df_cv_mapped)
save_df_to_csv(df_press_mapped)
save_df_to_csv(df_r2a_mapped)