# Excel2LaTeXViaPython

This code reads in an Excel file and writes a LaTeX file containing a tabular environment that attempts to match the styling of the Excel table as closely as possible.

## Preamble


In [1]:
import openpyxl # Package for reading excel files (.xlsx) into Python
openpyxl.__version__ # Check version number

'2.3.2'

In [2]:
# Get and/or set the working directory
import re
import os
os.getcwd() # Get current working directory
#os.chdir() # Set current working directory

'C:\\Users\\Kirker\\Documents\\Git Repositories\\excel2latexviapython'

## User input
In the Jupyter cell below, enter the name of the input Excel file and the name of the .tex file in which to save the generated LaTeX code.

In [28]:
# --- Input workbook (shared by the single-sheet and the all-sheets runs) ---
# Alternative input kept for reference:
#input_excel_filename = 'C:/Users/Kirker/Desktop/tables.xlsx'
input_excel_filename = 'C:/Users/Kirker/Dropbox/My research/Productivity Spillovers in NZ/LaTeX/Slides - Internal Tsy Overview/tables/regression_tables.xlsx'

# --- Single-sheet run: which sheet to convert and where to write it ---
sheet_name = 'initial_reg'
output_tex_filename = 'C:/Users/Kirker/Dropbox/My research/Productivity Spillovers in NZ/LaTeX/Slides - Internal Tsy Overview/tables/tbl_initial_reg.tex'

# --- All-sheets run: folder that receives one <sheet name>.tex per sheet ---
output_dir = 'C:/Users/Kirker/Dropbox/My research/Productivity Spillovers in NZ/LaTeX/Slides - Internal Tsy Overview/tables/'

## Load data
Load in the Excel Workbook, and select the appropriate Work Sheet that contains the table to copy.

In [21]:
# read in excel workbook file
workbook = openpyxl.load_workbook(filename = input_excel_filename)
type(workbook)

openpyxl.workbook.workbook.Workbook

In [22]:
# Get the list of sheets in the workbook
workbook.get_sheet_names()

['Sheet1', 'Sheet2', 'initial_reg', 'Sheet4']

In [23]:
# Read in the worksheet from the workbook that contains the table to replicate
sheet = workbook[sheet_name]
sheet

<Worksheet "initial_reg">

In [16]:
# Examine the table to get an idea of its dimensions.
sheet.rows # See all cells in sheet 

((<Cell Sheet1.A1>, <Cell Sheet1.B1>, <Cell Sheet1.C1>, <Cell Sheet1.D1>),
 (<Cell Sheet1.A2>, <Cell Sheet1.B2>, <Cell Sheet1.C2>, <Cell Sheet1.D2>),
 (<Cell Sheet1.A3>, <Cell Sheet1.B3>, <Cell Sheet1.C3>, <Cell Sheet1.D3>),
 (<Cell Sheet1.A4>, <Cell Sheet1.B4>, <Cell Sheet1.C4>, <Cell Sheet1.D4>),
 (<Cell Sheet1.A5>, <Cell Sheet1.B5>, <Cell Sheet1.C5>, <Cell Sheet1.D5>),
 (<Cell Sheet1.A6>, <Cell Sheet1.B6>, <Cell Sheet1.C6>, <Cell Sheet1.D6>))

In [8]:
# Check dimensions of the table
print("number of columns = " + str(sheet.max_column))
print("number of rows = " + str(sheet.max_row))

number of columns = 4
number of rows = 31


## Define functions
Helper functions needed to process the Excel file

In [51]:
def is_number(s):
    """Return True if ``s`` can be converted to a float, otherwise False.

    Args:
        s: Value to test (for example the ``value`` of a spreadsheet cell).

    Returns:
        True when ``float(s)`` succeeds, False when the conversion fails.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: a string that does not parse as a number (e.g. '-0.3*').
        # TypeError: a type float() does not accept at all (e.g. None).
        return False
    return True


def tupple2latexstring(row_tup):
    """Convert a tuple of openpyxl cells into one row of a LaTeX table.

    Each cell is turned into text (see ``_cell2latexstring``), the cells are
    joined with the LaTeX column separator ``&`` and the row is terminated
    with ``\\\\`` followed by a newline.

    Args:
        row_tup: [tuple] the openpyxl cells for a single row of the table.
            Each cell must provide ``value``, ``number_format`` and ``font``.

    Returns:
        A string containing the row formatted as LaTeX code.
    """
    cell_strings = [_cell2latexstring(cell) for cell in row_tup]
    return " \t & \t ".join(cell_strings) + " \\\\ \n"


def _decimal_places(number_format):
    """Return the decimal places requested by an Excel number format, or None.

    Only the first ``;``-separated section of the format is inspected, and
    only the run of digit placeholders (``0``, ``#``, ``?``) directly after
    the decimal point is counted, so ``'0.00%'`` and ``'0.00_);(0.00)'`` both
    give 2. None is returned when the format has no decimal part
    (e.g. ``'0'`` or ``'0%'``).
    """
    match = re.search(r'\.([0#?]+)', number_format.split(';')[0])
    return len(match.group(1)) if match else None


def _cell2latexstring(cell):
    """Return the LaTeX text for a single cell, including bold/italic markup."""
    value = cell.value

    if value is None:
        text = " "  # Cell is empty of value
    elif isinstance(value, (int, float)) and cell.number_format != 'General':
        # The user has chosen a display format in Excel; apply the same
        # rounding. Only real numbers are rounded: text that merely looks
        # numeric cannot be passed to round().
        num_dps = _decimal_places(cell.number_format)
        if num_dps is None:
            # No decimal part could be read from the format: keep the raw value.
            text = str(value)
        else:
            # NOTE: str(round(...)) does not pad trailing zeros (1.10 -> '1.1').
            text = str(round(value, num_dps))
    else:
        # No specific rounding applies, so take the value as it is.
        text = str(value)

    # Apply the LaTeX version of the cell's font styling. Bold is applied
    # first, so a bold italic cell becomes \textit{\textbf{...}}.
    if cell.font.b:
        text = "\\textbf{" + text + "}"
    if cell.font.i:
        text = "\\textit{" + text + "}"

    return text

In [53]:
# Test the above function
tupple2latexstring(sheet.rows[2])



'  \t & \t (0.0206) \t & \t (0.0166) \t & \t (0.0965) \\\\ \n'

In [45]:
is_number('-0.3*')

False

In [17]:
def check_for_vline(col_tup,loc):
    """ Check whether a border runs down one side of an entire column

    Input:
    ------
        * col_tup = tuple with one cell for every row of the column
        * loc = name of the border side to inspect (e.g. 'left' or 'right')


    Output:
    -------
        * True if every cell has a border style set on that side (this is
          also the result for an empty column), otherwise False

    """
    # Collect the cells that do have a border on the requested side ...
    bordered_cells = [
        cell for cell in col_tup
        if cell.border.__dict__[loc].border_style is not None
    ]

    # ... a vertical line is only reported when no cell is missing it
    return len(bordered_cells) == len(col_tup)
    
out = check_for_vline(sheet.columns[2],'left')    
print(out)

False


In [12]:
sheet.columns[2][2]

<Cell initial_reg.C3>

In [13]:
def check_for_hline(row_tup,loc):
    """ Check whether a border runs along one side of an entire row

    Input:
    ------
        * row_tup = tuple with one cell for every column of the row
        * loc = name of the border side to inspect (e.g. 'top' or 'bottom')

    Output:
    -------
        * True if every cell has a border style set on that side (this is
          also the result for an empty row), otherwise False

    """
    # For now we dont allow different horizontal line styles: any style counts
    num_bordered = sum(
        1 for cell in row_tup
        if cell.border.__dict__[loc].border_style is not None
    )

    return num_bordered == len(row_tup)

out=check_for_hline(sheet.rows[0],'bottom')
print(out)

True


In [14]:
def pick_col_text_alignment(col_tup):
    '''
    Pick the LaTeX column alignment ('l', 'c' or 'r') used by most cells of a column.

    Cells with no horizontal alignment set count as left aligned; cells with
    any other alignment value are ignored. Ties are resolved in the order
    left, center, right, so an empty column gives 'l'.
    '''
    # Map the cell's horizontal alignment onto the LaTeX column specifier
    latex_code = {None: 'l', 'left': 'l', 'center': 'c', 'right': 'r'}
    tallies = {'l': 0, 'c': 0, 'r': 0}

    for cell in col_tup:
        code = latex_code.get(cell.alignment.__dict__['horizontal'])
        if code is not None:
            tallies[code] += 1

    most_common = max(tallies.values())

    # Checking in this fixed order implements the tie-break
    for code in ('l', 'c', 'r'):
        if tallies[code] == most_common:
            return code
    

In [15]:
pick_col_text_alignment(sheet.columns[1])



'l'

## Create LaTeX File
Write the table to a .tex file

In [24]:
# Write the table on `sheet` to `output_tex_filename` as a LaTeX tabular.

# Materialise the rows/columns once. `sheet.rows` and `sheet.columns` are
# tuples in openpyxl 2.3 (the version this notebook was run with) but
# generators in later releases, which cannot be indexed.
table_rows = tuple(sheet.rows)
table_columns = tuple(sheet.columns)


## Create Tabular Header
# One alignment letter per column, with '|' wherever a border runs down the
# whole column.

col_align_str = "" # Preallocate string

for column in table_columns:

    if check_for_vline(column, 'left'):
        col_align_str += '|'

    col_align_str += pick_col_text_alignment(column)

    if check_for_vline(column, 'right'):
        col_align_str += '|'


begin_str = "\\begin{tabular}{" + col_align_str + "} \n"


# The `with` block closes the file even if writing a row fails part-way.
with open(output_tex_filename, 'w') as file:

    file.write(begin_str)

    for row in table_rows:
        # A border across the top of the whole row becomes a rule above it
        if check_for_hline(row, 'top'):
            file.write("\\hline \n")

        file.write(tupple2latexstring(row))

        # ... and a border across the bottom becomes a rule below it
        if check_for_hline(row, 'bottom'):
            file.write("\\hline \n")

    # Close table environment
    file.write("\\end{tabular}")

In [76]:
# Looping over all sheets in workbook: each worksheet is written to
# <output_dir>/<sheet title>.tex
workbook = openpyxl.load_workbook(filename = input_excel_filename)

include_tabular_envir = True # Include \begin{tabular}...\end{tabular} wrapper to table


# `workbook.worksheets` replaces the deprecated `get_sheet_names()` lookup.
# NOTE(review): presumably this skips chart sheets, which have no cell grid to
# convert - confirm against the installed openpyxl version.
for sheet in workbook.worksheets:

    sheet_name = sheet.title

    # Materialise the rows/columns once. `sheet.rows` and `sheet.columns` are
    # tuples in openpyxl 2.3 but generators in later releases, which cannot
    # be indexed.
    table_rows = tuple(sheet.rows)
    table_columns = tuple(sheet.columns)

    # The `with` block closes the file even if writing a row fails part-way.
    with open(os.path.join(output_dir, sheet_name + '.tex'), 'w') as file:

        if include_tabular_envir:
            ## Create Tabular Header
            # One alignment letter per column, with '|' wherever a border
            # runs down the whole column.

            col_align_str = "" # Preallocate string

            for column in table_columns:

                if check_for_vline(column, 'left'):
                    col_align_str += '|'

                col_align_str += pick_col_text_alignment(column)

                if check_for_vline(column, 'right'):
                    col_align_str += '|'

            begin_str = "\\begin{tabular}{" + col_align_str + "} \n"

            file.write(begin_str)

        for row in table_rows:
            # A border across the top of the whole row becomes a rule above it
            if check_for_hline(row, 'top'):
                file.write("\\hline \n")

            file.write(tupple2latexstring(row))

            # ... and a border across the bottom becomes a rule below it
            if check_for_hline(row, 'bottom'):
                file.write("\\hline \n")

        if include_tabular_envir:
            # Close table environment
            file.write("\\end{tabular}")

In [73]:
sheet=workbook['tenure']
sheet.rows
check_for_hline(sheet.rows[0],'top')
print(sheet.rows[0][3].border.__dict__['top'].border_style)

None


## Rough workings
Cells below are just my rough workings

In [32]:
#type(sheet.cell(row=1, column=2).border.__dict__['bottom'])


#if sheet.cell(row=1, column=2).border.__dict__['bottom'].border_style:
#    print(1)
#else:
#    print(0)

1


In [33]:
#sheet.rows[1]


#row_tup = sheet.rows[0]

#len(row_tup)
#sheet.max_column

    
    
#print(sheet.cell(row=2, column=1).border.__dict__['bottom'].border_style == None)

True


In [34]:
#inspect.getmembers(sheet.cell(row=1, column=3).font)

#sheet.cell(row=1, column=2).font.__dict__

{'_key': 5269060958284101713,
 'b': False,
 'charset': None,
 'color': Color(rgb=Values must be of type <class 'str'>, indexed=Values must be of type <class 'int'>, auto=Values must be of type <class 'bool'>, theme=1, type='theme'),
 'condense': False,
 'extend': False,
 'family': 2.0,
 'i': True,
 'name': 'Calibri',
 'outline': False,
 'scheme': 'minor',
 'shadow': False,
 'strike': False,
 'sz': 11.0,
 'u': None,
 'vertAlign': None}

In [35]:
#sheet.cell(row=1, column=2).border.__dict__

{'_key': 43183173055487177,
 'bottom': Side(style='thin', color=Color(rgb=Values must be of type <class 'str'>, indexed=64, auto=Values must be of type <class 'bool'>, theme=Values must be of type <class 'int'>, type='indexed')),
 'diagonal': ,
 'diagonalDown': False,
 'diagonalUp': False,
 'diagonal_direction': None,
 'end': None,
 'horizontal': None,
 'left': ,
 'outline': True,
 'right': ,
 'start': None,
 'top': ,
 'vertical': None}

In [36]:
#sheet.cell(row=2, column=2).font.__dict__

{'_key': 1468294903339714438,
 'b': False,
 'charset': None,
 'color': Color(rgb=Values must be of type <class 'str'>, indexed=Values must be of type <class 'int'>, auto=Values must be of type <class 'bool'>, theme=1, type='theme'),
 'condense': False,
 'extend': False,
 'family': 2.0,
 'i': False,
 'name': 'Calibri',
 'outline': False,
 'scheme': 'minor',
 'shadow': False,
 'strike': False,
 'sz': 11.0,
 'u': None,
 'vertAlign': None}

In [37]:

#sheet.cell(row=2, column=2).number_format == 'General'

False

In [38]:
#sheet.cell(row=3, column=2).number_format

'General'