In [4]:
import tabula 
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
import numpy as np
import csv

pd.set_option("display.max_rows", 150)
pd.set_option("display.max_columns", 150)
plt.rcParams.update({'figure.max_open_warning': 0})

In [5]:

from io import StringIO
# pip install pdfminer.six
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

def convert_pdf_to_string(file_path):
    output_string = StringIO()
    with open(file_path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
    return(output_string.getvalue())

In [6]:
pdf_str = convert_pdf_to_string("raw-from-source/2018generalofficialresults.pdf")

In [7]:
pdf_pages = pdf_str.split("\n\n\x0c")

# remove blank pages 
pdf_pages = [p for p in pdf_pages if len(p) >= 30]

print(len(pdf_pages))

211


In [8]:
positions = set(["Senator", "Governor", "Treasurer", "Secretary", "General", "Auditor", "Congress"])
positions = set([p.upper() for p in positions])

county_page_idxs = []
for idx, page in enumerate(pdf_pages):
    if "County" in page and len(positions.difference(set(page.split()))) < len(positions):
        county_page_idxs.append(idx)


idx2county = {}
for idx in county_page_idxs:
    page = pdf_pages[idx]
    county = page[:page.index("County")].split()[-1]
    idx2county[idx] = county
    
county2idx_list = [(val, key) for key, val in idx2county.items()]
county2idx_list.append(("End File", 212))
print(county2idx_list)

[('Addison', 4), ('Bennington', 8), ('Caledonia', 10), ('Chittenden', 12), ('Essex', 14), ('Franklin', 16), ('Isle', 18), ('Lamoille', 20), ('Orange', 22), ('Orleans', 24), ('Rutland', 28), ('Washington', 32), ('Windham', 36), ('Windsor', 40), ('Addison', 45), ('Bennington', 47), ('Caledonia', 48), ('Chittenden', 49), ('Essex', 50), ('Franklin', 51), ('Isle', 52), ('Lamoille', 53), ('Orange', 54), ('Orleans', 55), ('Rutland', 57), ('Washington', 59), ('Windham', 61), ('Windsor', 63), ('Addison', 66), ('Bennington', 70), ('Caledonia', 72), ('Chittenden', 74), ('Essex', 76), ('Franklin', 78), ('Isle', 80), ('Lamoille', 82), ('Orange', 84), ('Orleans', 86), ('Rutland', 90), ('Washington', 94), ('Windham', 98), ('Windsor', 102), ('Addison', 107), ('Bennington', 109), ('Caledonia', 110), ('Chittenden', 111), ('Essex', 112), ('Franklin', 113), ('Isle', 114), ('Lamoille', 115), ('Orange', 116), ('Orleans', 117), ('Rutland', 119), ('Washington', 121), ('Windham', 123), ('Windsor', 125), ('Addi

In [9]:
file = "raw-from-source/2018generalofficialresults.pdf"
tables = tabula.read_pdf(file, pages = "all", multiple_tables = True)

In [10]:
print(len(tables))

211


In [11]:
tables[10]

Unnamed: 0.1,Unnamed: 0,Bernie Sanders,Brad J. Peacock,Bruce Busa,Edward S. Gilber,Folasade Adeluo,Jon Svitavsky,Lawrence Zupan,Reid Kane,Russell Beste
0,BARNET,479 58.92%,11 1.35%,2 0.25%,10 1.23%,5 0.62%,1 0.12%,279 34.32%,3 0.37%,9 1.11%
1,BURKE,431 60.88%,5 0.71%,0 0.00%,6 0.85%,7 0.99%,1 0.14%,239 33.76%,3 0.42%,4 0.56%
2,DANVILLE,652 56.35%,8 0.69%,4 0.35%,15 1.30%,6 0.52%,1 0.09%,416 35.96%,5 0.43%,10 0.86%
3,GROTON,203 49.15%,3 0.73%,0 0.00%,17 4.12%,0 0.00%,1 0.24%,176 42.62%,2 0.48%,4 0.97%
4,HARDWICK,727 62.56%,11 0.95%,5 0.43%,23 1.98%,1 0.09%,5 0.43%,358 30.81%,7 0.60%,6 0.52%
5,KIRBY,139 56.28%,3 1.21%,0 0.00%,3 1.21%,3 1.21%,9 3.64%,86 34.82%,0 0.00%,0 0.00%
6,LYNDON,876 51.20%,16 0.94%,7 0.41%,18 1.05%,12 0.70%,6 0.35%,721 42.14%,6 0.35%,15 0.88%
7,NEWARK,141 54.86%,5 1.95%,0 0.00%,1 0.39%,2 0.78%,1 0.39%,100 38.91%,2 0.78%,1 0.39%
8,PEACHAM,289 66.74%,4 0.92%,0 0.00%,8 1.85%,2 0.46%,1 0.23%,113 26.10%,2 0.46%,1 0.23%
9,RYEGATE,249 56.85%,1 0.23%,1 0.23%,5 1.14%,7 1.60%,2 0.46%,166 37.90%,1 0.23%,3 0.68%


In [9]:
county2df = defaultdict(list)
for idx, (county, page_num) in enumerate(county2idx_list[:-1]):
    
    next_county_page_num = county2idx_list[idx + 1][1]
    
    county_len = next_county_page_num - page_num
    
    # there is always a blank page after Windsor
    if county == "Windsor":
        county_len -= 1
    
    # Table small enough to be stored on one page of pdf.
    if county_len == 1:
        df = tables[page_num]

    # Table stored on 2 pages in pdf, 2 different scenarios.
    if county_len == 2:

        df1 = tables[page_num]
        df2 = tables[page_num + 1]
        
        df2 = df2.loc[~df2.index.duplicated()]
        
        # Scenario 1, too many columns to fit on one page, concat horizontally.
        if len(df1) == len(df2):
            df = pd.concat([df1, df2], axis=1)
        # Scenario 2, too many rows to fit on one page, append vertically.
        else:
            df = df1.append(df2)

    # Table has too many rows and cols to fit on one page, only one configuration:
    # page 1 if left-top, page 2 is right-top, page 3 is left-bottom and page 4 
    # is right-bottom of the combined table.
    
    if county_len == 4:

        df1 = tables[page_num]
        
        df2 = tables[page_num + 1]
        if "Unnamed: 0" in df2:
            df2.drop(columns="Unnamed: 0", inplace=True)

        df3 = tables[page_num + 2]
        df4 = tables[page_num + 3]
        
        if "Unnamed: 0" in df4:
            df4.drop(columns="Unnamed: 0", inplace=True)

        first_rows = pd.concat([df1, df2], axis=1)
        second_rows = pd.concat([df3, df4], axis=1)

        df = first_rows.append(second_rows)

    county2df[county].append(df)

        

In [10]:
# check that dims are the same between counties
for county, dfs in county2df.items():
    rows = dfs[0].index
    for df in dfs:
        if len(df.index) != len(rows):
            print("wrong length", county, len(df.index), len(rows))
            continue
        
        if (df.index != rows).any():
            print("whoops")


In [11]:
county2final_df = {key: pd.concat(val, axis=1) for key, val in county2df.items()}

county_columns = county2final_df["Addison"].keys()

In [3]:
cols = set()
size2county = defaultdict(list)
for key, df in county2final_df.items():
    df_cols = tuple(df.columns.to_list())
    cols.add(df_cols)
    
    size2county[len(df_cols)].append(key)
    
    
# let's see if dfs with same number of columns at least have the same columns
for key, counties in size2county.items():
    print(counties)
    example_cols = tuple(county2final_df[counties[0]].columns.to_list())
    for county in counties:
        compare_cols = tuple(county2final_df[county].columns.to_list())
        if compare_cols != example_cols:
            print(f"Mistake with county: {county}")



NameError: name 'defaultdict' is not defined

In [14]:
rep_counties = ["Addison", "Bennington", "Isle"]
rep_columns = [county2final_df[c].columns.to_list() for c in rep_counties]
orig_columns = rep_columns[0]
for i in [0, 1, 2]:
    rep_columns[i][0] = "NAME"
    
# great! rename the first, remove all unnamed

rep_columns = [[val for val in lst if "Unnamed" not in val] for lst in rep_columns]
    
for a, b, c in zip(*rep_columns):
    print(a, b, c)

NAME NAME NAME
Bernie Sanders Bernie Sanders Bernie Sanders
Brad J. Peacock Brad J. Peacock Brad J. Peacock
Bruce Busa Bruce Busa Bruce Busa
Edward S. Gilber Edward S. Gilber Edward S. Gilber
Folasade Adeluo Folasade Adeluo Folasade Adeluo
Jon Svitavsky Jon Svitavsky Jon Svitavsky
Lawrence Zupan Lawrence Zupan Lawrence Zupan
Reid Kane Reid Kane Reid Kane
Russell Beste Russell Beste Russell Beste
write-in write-in write-in
overvotes overvotes overvotes
blank votes blank votes blank votes
TownTotal TownTotal TownTotal
Anya Tynio Anya Tynio Anya Tynio
Cris Ericson Cris Ericson Cris Ericson
Laura S. Potter Laura S. Potter Laura S. Potter
Peter Welch Peter Welch Peter Welch
write-in write-in write-in
overvotes overvotes overvotes
blank votes blank votes blank votes
TownTotal TownTotal TownTotal
Charles Laramie Charles Laramie Charles Laramie
Christine Hallqu Christine Hallqu Christine Hallqu
Cris Ericson Cris Ericson Cris Ericson
Emily "Em" Peyt Emily "Em" Peyt Emily "Em" Peyt
Phil Scott Ph

In [16]:
vest2can = {l.split(',')[0] : l.split(',')[1].strip() for l in open("raw-from-source/utils/vest2candidate.txt", "r").readlines()}
can2vest = {val : key for key, val in vest2can.items()}

raw2vest = {}
for raw in orig_columns:
    raw = raw.strip()
    for can in can2vest:
        if raw in can:
            vest = can2vest[can]
            raw2vest[raw] = vest
            break

In [75]:
def assign_raw2vest(name):
    if name in raw2vest:
        return raw2vest[name]
    
    for opt in can2vest:
        if name in opt:
            vest = can2vest[opt]

            raw2vest[name] = vest
            return vest
        
    return name

final_col_names = rep_columns[0]
final_col_names = [assign_raw2vest(val) for val in final_col_names]

# Ericson was an over-achiever and ran for two offices
final_col_names[15] = "G18HALIERI"

for idx, col in enumerate(final_col_names):
    if col == "write-in":
        final_col_names[idx] = final_col_names[idx - 1][:-4] + "OWRI"
        continue
        
    if col == "NAME" or col in vest2can:
        continue
        
    if col == 'Unnamed: 0':
        final_col_names[idx] == "Blank Col"
        continue
        
    final_col_names[idx] = "DROP"

final_col_names = [col for col in final_col_names if col != "DROP"]

# now we just need a way to drop all unnamed from each df, AFTER renaming the first column
final_dfs = []

for key, df in county2final_df.items():
    
    old_cols = df.columns.to_list()
    old_cols[0] = "NAME"
    
    for idx, col in enumerate(old_cols):
        if col == "write-in":
            old_cols[idx] = "write-in" + str(idx)
        if col == "Cris Ericson":
            old_cols[idx] = "Cris Ericson" + str(idx)
    
    df.columns = old_cols
    
    keep_cols = [col for col in old_cols if "Unnamed" not in col \
                 and "blank" not in col and "overvotes" not in col and "TownTotal" not in col]
    
    # cris ericson is doubled 
    
    df = df[keep_cols]
    df.columns = final_col_names
    df["County"] = key
    df["ID"] = key + " " + df["NAME"]
    df.set_index(["ID"], inplace=True)
    
    # remove the percentages from each cell
    cols2reset = [col for col in df.columns if col.startswith("G18")]

    for col in cols2reset:
        df[col] = df[col].astype('str').str.split().str[0]
    
    df.reset_index(inplace=True)
    final_dfs.append(df)

final_dfs[2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["County"] = key
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ID"] = key + " " + df["NAME"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('str').str.split().str[0]


Unnamed: 0,ID,NAME,G18USSISAN,G18USSIPEA,G18USSIBUS,G18USSIGIL,G18USSIADE,G18USSISVI,G18USSRZUP,G18USSOKAN,G18USSIBES,G18USSOWRI,G18HALRTYN,G18HALIERI,G18HALOPOT,G18HALDWEL,G18HALOWRI,G18GOVILAR,G18GOVDHAL,G18GOVIERI,G18GOVOPEY,G18GOVRSCO,G18GOVOMAR,G18GOVIBAR,G18GOVOWRI,G18LTGPZUC,G18LTGRTUR,G18LTGONGO,G18LTGOWRI,G18TREDPEA,G18TRERMOR,G18TREOWRI,G18SOSRPAI,G18SOSDCON,G18SOSOHEB,G18SOSOWRI,G18AUDDHOF,G18AUDOBRO,G18AUDRKEN,G18AUDOWRI,G18ATGRWIL,G18ATGOJAC,G18ATGDDON,G18ATGOWRI,County
0,Caledonia BARNET,BARNET,479,11,2,10,5,1,279,3,9,0,263,17,5,513,0,6,303,2,6,473,7,4,3,435,352,9,0,465,311,1,294,475,20,0,435,28,314,0,297,20,473,0,Caledonia
1,Caledonia BURKE,BURKE,431,5,0,6,7,1,239,3,4,2,243,22,8,421,1,2,238,6,2,429,5,17,2,374,310,8,0,389,287,2,261,399,21,0,350,41,277,0,279,21,384,1,Caledonia
2,Caledonia DANVILLE,DANVILLE,652,8,4,15,6,1,416,5,10,0,371,36,14,711,0,7,351,4,5,752,7,10,1,553,559,12,1,681,418,0,404,690,18,0,632,27,431,0,412,22,669,1,Caledonia
3,Caledonia GROTON,GROTON,203,3,0,17,0,1,176,2,4,0,170,20,9,209,0,5,96,3,5,289,2,7,2,160,239,5,0,200,188,1,201,182,16,0,161,18,207,0,184,14,197,0,Caledonia
4,Caledonia HARDWICK,HARDWICK,727,11,5,23,1,5,358,7,6,4,343,44,13,742,3,12,415,9,2,680,6,14,6,593,532,16,0,722,390,0,365,730,34,0,622,56,412,1,363,36,728,0,Caledonia
5,Caledonia KIRBY,KIRBY,139,3,0,3,3,9,86,0,0,0,84,4,2,152,0,1,92,0,2,147,1,2,2,122,119,1,0,141,96,1,90,141,7,0,133,9,93,0,90,5,145,0,Caledonia
6,Caledonia LYNDON,LYNDON,876,16,7,18,12,6,721,6,15,0,657,56,27,929,2,10,439,13,13,1172,10,22,11,726,933,21,0,883,758,1,726,861,57,0,752,74,788,1,754,36,866,1,Caledonia
7,Caledonia NEWARK,NEWARK,141,5,0,1,2,1,100,2,1,1,97,10,5,139,2,2,88,1,2,151,2,3,4,116,127,4,0,130,114,0,111,120,15,2,112,13,111,0,112,9,124,0,Caledonia
8,Caledonia PEACHAM,PEACHAM,289,4,0,8,2,1,113,2,1,0,97,13,7,307,0,3,199,1,4,209,3,6,3,272,145,5,0,306,109,0,105,305,8,1,270,15,117,0,106,11,300,0,Caledonia
9,Caledonia RYEGATE,RYEGATE,249,1,1,5,7,2,166,1,3,0,164,8,2,258,0,1,109,1,1,311,4,3,1,201,225,5,0,258,168,0,181,237,9,0,211,10,201,0,169,9,246,0,Caledonia


In [76]:
final_df = pd.concat(final_dfs, axis=0)
final_df["ID"] = final_df["ID"].astype('str').str.replace("\r", " ")
final_df["NAME"] = final_df["NAME"].astype('str').str.replace("\r", " ")

final_df["ID"] = final_df["ID"].astype('str').str.replace("\n", " ")
final_df["NAME"] = final_df["NAME"].astype('str').str.replace("\n", " ")

In [77]:
final_df.head()

Unnamed: 0,ID,NAME,G18USSISAN,G18USSIPEA,G18USSIBUS,G18USSIGIL,G18USSIADE,G18USSISVI,G18USSRZUP,G18USSOKAN,G18USSIBES,G18USSOWRI,G18HALRTYN,G18HALIERI,G18HALOPOT,G18HALDWEL,G18HALOWRI,G18GOVILAR,G18GOVDHAL,G18GOVIERI,G18GOVOPEY,G18GOVRSCO,G18GOVOMAR,G18GOVIBAR,G18GOVOWRI,G18LTGPZUC,G18LTGRTUR,G18LTGONGO,G18LTGOWRI,G18TREDPEA,G18TRERMOR,G18TREOWRI,G18SOSRPAI,G18SOSDCON,G18SOSOHEB,G18SOSOWRI,G18AUDDHOF,G18AUDOBRO,G18AUDRKEN,G18AUDOWRI,G18ATGRWIL,G18ATGOJAC,G18ATGDDON,G18ATGOWRI,County
0,Addison ADDISON,ADDISON,330,4,0,4,4,21,266,2,5,0,253,23,6,353,0,5,154,7,5,465,1,4,3,245,386,5,0,306,318,0,282,324,16,0,267,19,319,0,245,8,370,0,Addison
1,Addison BRIDPORT,BRIDPORT,307,5,3,2,6,16,230,0,6,0,200,30,4,331,0,2,139,3,7,429,1,2,1,247,327,3,0,291,270,0,233,313,14,2,257,21,270,0,206,15,344,0,Addison
2,Addison BRISTOL,BRISTOL,1172,13,4,7,14,27,445,7,18,0,438,66,21,1182,0,9,694,12,9,949,5,29,8,1002,674,27,1,1131,539,0,482,1137,60,0,1017,63,548,2,416,55,1201,0,Addison
3,Addison CORNWALL,CORNWALL,543,4,1,0,5,6,135,1,1,2,126,18,2,549,1,3,336,0,4,346,6,4,1,488,200,6,0,515,152,0,144,523,16,0,479,14,160,1,123,16,543,0,Addison
4,Addison FERRISBURGH,FERRISBURGH,913,11,3,7,15,14,442,2,9,3,421,31,16,945,0,4,496,10,4,888,7,9,2,759,647,9,1,901,492,0,466,905,27,0,819,44,494,0,402,22,977,1,Addison


#### So this is exceptionally weird, run this cell **TWICE** to save the correct .csv file

In [78]:
# Need to pass in these special arguments to take care of newline characters present in the dataframe
final_df.to_csv("raw-from-source/recreated/vest_from_pdf_dummy.csv", index=False, encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)

test = pd.read_csv("raw-from-source/recreated/vest_from_pdf.csv")


test = test.astype('str')
final_df = final_df.astype('str')

test.reset_index(inplace=True)
final_df.reset_index(inplace=True)
print(final_df.equals(test))
# so some corruptions are occuring as this file is being saved to .csv, let's see what is going on. 

False


In [37]:
final_df.head()

Unnamed: 0,index,ID,NAME,G18USSISAN,G18USSIPEA,G18USSIBUS,G18USSIGIL,G18USSIADE,G18USSISVI,G18USSRZUP,G18USSOKAN,G18USSIBES,G18USSOWRI,G18HALRTYN,G18HALIERI,G18HALOPOT,G18HALDWEL,G18HALOWRI,G18GOVILAR,G18GOVDHAL,G18GOVIERI,G18GOVOPEY,G18GOVRSCO,G18GOVOMAR,G18GOVIBAR,G18GOVOWRI,G18LTGPZUC,G18LTGRTUR,G18LTGONGO,G18LTGOWRI,G18TREDPEA,G18TRERMOR,G18TREOWRI,G18SOSRPAI,G18SOSDCON,G18SOSOHEB,G18SOSOWRI,G18AUDDHOF,G18AUDOBRO,G18AUDRKEN,G18AUDOWRI,G18ATGRWIL,G18ATGOJAC,G18ATGDDON,G18ATGOWRI
0,0,Addison ADDISON,ADDISON,330,4,0,4,4,21,266,2,5,0,253,23,6,353,0,5,154,7,5,465,1,4,3,245,386,5,0,306,318,0,282,324,16,0,267,19,319,0,245,8,370,0
1,1,Addison BRIDPORT,BRIDPORT,307,5,3,2,6,16,230,0,6,0,200,30,4,331,0,2,139,3,7,429,1,2,1,247,327,3,0,291,270,0,233,313,14,2,257,21,270,0,206,15,344,0
2,2,Addison BRISTOL,BRISTOL,1172,13,4,7,14,27,445,7,18,0,438,66,21,1182,0,9,694,12,9,949,5,29,8,1002,674,27,1,1131,539,0,482,1137,60,0,1017,63,548,2,416,55,1201,0
3,3,Addison CORNWALL,CORNWALL,543,4,1,0,5,6,135,1,1,2,126,18,2,549,1,3,336,0,4,346,6,4,1,488,200,6,0,515,152,0,144,523,16,0,479,14,160,1,123,16,543,0
4,4,Addison FERRISBURGH,FERRISBURGH,913,11,3,7,15,14,442,2,9,3,421,31,16,945,0,4,496,10,4,888,7,9,2,759,647,9,1,901,492,0,466,905,27,0,819,44,494,0,402,22,977,1


In [114]:
test.head()

Unnamed: 0,ID,NAME,G18USSISAN,G18USSIPEA,G18USSIBUS,G18USSIGIL,G18USSIADE,G18USSISVI,G18USSRZUP,G18USSOKAN,G18USSIBES,G18USSOWRI,G18HALRTYN,G18HALIERI,G18HALOPOT,G18HALDWEL,G18HALOWRI,G18GOVILAR,G18GOVDHAL,G18GOVIERI,G18GOVOPEY,G18GOVRSCO,G18GOVOMAR,G18GOVIBAR,G18GOVOWRI,G18LTGPZUC,G18LTGRTUR,G18LTGONGO,G18LTGOWRI,G18TREDPEA,G18TRERMOR,G18TREOWRI,G18SOSRPAI,G18SOSDCON,G18SOSOHEB,G18SOSOWRI,G18AUDDHOF,G18AUDOBRO,G18AUDRKEN,G18AUDOWRI,G18ATGRWIL,G18ATGOJAC,G18ATGDDON,G18ATGOWRI
0,Addison ADDISON,ADDISON,330,4,0,4,4,21,266,2,5,0,253,23,6,353,0,5,154,7,5,465,1,4,3,245,386,5,0,306,318,0,282,324,16,0,267,19,319,0,245,8,370,0
1,Addison BRIDPORT,BRIDPORT,307,5,3,2,6,16,230,0,6,0,200,30,4,331,0,2,139,3,7,429,1,2,1,247,327,3,0,291,270,0,233,313,14,2,257,21,270,0,206,15,344,0
2,Addison BRISTOL,BRISTOL,1172,13,4,7,14,27,445,7,18,0,438,66,21,1182,0,9,694,12,9,949,5,29,8,1002,674,27,1,1131,539,0,482,1137,60,0,1017,63,548,2,416,55,1201,0
3,Addison CORNWALL,CORNWALL,543,4,1,0,5,6,135,1,1,2,126,18,2,549,1,3,336,0,4,346,6,4,1,488,200,6,0,515,152,0,144,523,16,0,479,14,160,1,123,16,543,0
4,Addison FERRISBURGH,FERRISBURGH,913,11,3,7,15,14,442,2,9,3,421,31,16,945,0,4,496,10,4,888,7,9,2,759,647,9,1,901,492,0,466,905,27,0,819,44,494,0,402,22,977,1


In [125]:
for col in final_df:
    f = final_df[col].astype('str')
    t = test[col].astype('str')
    
    diff = f.ne(t).to_list()
    print(type(diff))
    print(f[diff])
    print(t[diff])
    
    npf = final_df[col].astype('str').to_numpy()
    npt = test[col].astype('str').to_numpy()
    
    if not np.array_equal(npf, npt):
        print("Np not equal")
        
        print(np.where(npt != npf))

<class 'list'>


IndexError: Boolean index has wrong length: 491 instead of 260