In [1]:
import pandas as pd
import numpy as np

# Read the puzzle input. The " |" delimiter is removed so that every line is
# just a run of whitespace-separated tokens.
# NOTE(review): presumably each line holds ten signal patterns followed by four
# output patterns (14 tokens) - confirm against the input file.
with open('inputs/input_8.txt') as the_data:
    the_lines = the_data.read().replace(" |","").strip('\n').split('\n')

# One list of raw tokens per input line.
the_lines = [l.split() for l in the_lines]

# Sort the letters inside every token so that the same set of letters always
# yields the same string, whatever order it was written in. This is done while
# building the rows instead of with DataFrame.applymap, which is deprecated
# since pandas 2.1.
# The columns are labelled 10..23, which leaves the labels 0..9 free for the
# decoded digit patterns that later cells add to this frame.
line_df = pd.DataFrame(
    [["".join(sorted(token)) for token in tokens] for tokens in the_lines],
    columns=list(range(10, 24)),
)

# Part 1 - How many strings of length 2, 3, 4, or 7 appear in the last 4 columns?

In [2]:
# Count the output patterns whose length is 2, 3, 4 or 7.
# The four output columns are selected by label (20..23) rather than with the
# positional slice `line_df.columns[-4:]`: later cells append more columns to
# `line_df`, so a positional slice would count the wrong columns if this cell
# were ever re-run after them.
output_cols = [20, 21, 22, 23]
unique_lengths = [2, 3, 4, 7]

num_values = 0
for i in output_cols:
    num_values += line_df[i].str.len().isin(unique_lengths).sum()

In [3]:
# Show the Part 1 count accumulated in `num_values`.
num_values

342

# Part 2 - Identifying the digits

Let's start easy: The numbers 1, 7, 4, and 8 will be represented by 2, 3, 4 and 7 characters respectively. They are the only numbers represented by strings of these lengths.

In [4]:
def get_entry_with_length(row,length):
    """Return the first string in `row` that has exactly `length` characters.

    Parameters
    ----------
    row : pandas.Series of strings
    length : int
        Required number of characters.

    Raises
    ------
    IndexError
        If no entry of `row` has that length.
    """
    entry_lengths = row.str.len()
    matching = row[entry_lengths.eq(length)]
    return matching.iloc[0]

In [5]:
# Digits identified by pattern length alone, as {digit: pattern length}.
# Each pass stores the matching pattern in a new column labelled with the digit.
for digit, n_segments in {1: 2, 7: 3, 4: 4, 8: 7}.items():
    line_df[digit] = line_df.apply(
        get_entry_with_length, axis=1, args=[n_segments]
    )

Next we use these to isolate other values. 

* Notice that 9 is the only digit represented by 6 characters that includes all 4 characters used to represent the number 4.
* Similarly, 3 is the only 5-character number that contains all the characters that represent 1.

In [6]:
def get_entry_with_length_and_substring(row,length,substring_col):
    """Return the first entry of `row` with `length` characters that contains
    every character of `row[substring_col]`.

    Parameters
    ----------
    row : pandas.Series of strings
    length : int
        Required number of characters of the returned entry.
    substring_col : label
        Label of the entry in `row` whose characters must all be present.

    Raises
    ------
    IndexError
        If no entry satisfies both conditions.
    """
    # Build the reference character set once rather than once per candidate.
    required = set(row[substring_col])
    candidates = row.loc[row.str.len() == length]
    contains_required = candidates.apply(lambda pattern: required.issubset(pattern))
    return candidates.loc[contains_required].iloc[0]

def get_entry_with_length_and_superstring(row,length,superstring_col):
    """Return the first entry of `row` with `length` characters whose
    characters all appear in `row[superstring_col]`.

    Parameters
    ----------
    row : pandas.Series of strings
    length : int
        Required number of characters of the returned entry.
    superstring_col : label
        Label of the entry in `row` that must contain the returned entry.

    Raises
    ------
    IndexError
        If no entry satisfies both conditions.
    """
    same_length = row.loc[row.str.len() == length]
    container = set(row[superstring_col])
    fits_inside = same_length.apply(lambda pattern: set(pattern) <= container)
    return same_length.loc[fits_inside].iloc[0]

In [7]:
# (digit, pattern length, digit whose pattern must be contained):
#   9 -> six characters that include every character of 4
#   3 -> five characters that include every character of 1
for digit, n_segments, contained_digit in ((9, 6, 4), (3, 5, 1)):
    line_df[digit] = line_df.apply(
        get_entry_with_length_and_substring,
        axis=1,
        args=[n_segments, contained_digit],
    )

We see that 6 is the only value represented by 6 characters that does not contain all of the characters of 1. So let's write a variant of our substring function that negates the superset check.
* The integer 5 is the only 5-character number that is contained entirely in the representation of 6.

In [8]:
def get_entry_with_length_and_non_substring(row,length,substring_col):
    """Return the first entry of `row` with `length` characters that does NOT
    contain every character of `row[substring_col]`.

    Parameters
    ----------
    row : pandas.Series of strings
    length : int
        Required number of characters of the returned entry.
    substring_col : label
        Label of the entry in `row` whose characters must not all be present.

    Raises
    ------
    IndexError
        If no entry satisfies both conditions.
    """
    # Build the reference character set once rather than once per candidate.
    reference = set(row[substring_col])
    candidates = row.loc[row.str.len() == length]
    contains_reference = candidates.apply(lambda pattern: reference.issubset(pattern))
    return candidates.loc[~contains_reference].iloc[0]

In [9]:
# 6: the six-character pattern that does not contain every character of 1.
line_df[6] = line_df.apply(
    get_entry_with_length_and_non_substring, axis=1, args=(6, 1)
)
# 5: the five-character pattern made only of characters found in 6
# (so column 6 has to be filled in first).
line_df[5] = line_df.apply(
    get_entry_with_length_and_superstring, axis=1, args=(5, 6)
)

What remains is 2 and 0, so let's just define a function that isolates whatever is still unidentified, based on its length.

In [10]:
def get_entry_with_length_and_is_unknown(row, known_cols,length):
    """Return the first entry of `row` with `length` characters whose value
    differs from every value stored under the labels in `known_cols`.

    Parameters
    ----------
    row : pandas.Series of strings
    known_cols : list of labels
        Labels of the entries of `row` whose values are already identified.
    length : int
        Required number of characters of the returned entry.
    """
    already_identified = [row[col] for col in known_cols]
    is_identified = row.isin(already_identified)
    still_unknown = row[~is_identified]
    return get_entry_with_length(still_unknown, length)

In [11]:

# Digits whose patterns already have a column in `line_df`.
decoded_digits = [1, 7, 4, 8, 9, 3, 5, 6]

# 2: the only five-character pattern not matched to a digit yet.
line_df[2] = line_df.apply(
    get_entry_with_length_and_is_unknown, axis=1, args=(decoded_digits, 5)
)
# 0: the only six-character pattern left once 2 is known as well.
line_df[0] = line_df.apply(
    get_entry_with_length_and_is_unknown, axis=1, args=(decoded_digits + [2], 6)
)

Now we know exactly what string represents each digit.

In [12]:
# Keep only the ten decoded-pattern columns, ordered by digit 0..9.
digit_cols = list(range(10))
correct_vals = line_df[digit_cols]

In [13]:
# Show the pattern found for each digit (columns 0..9), one row per input line.
correct_vals

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,abcdef,bd,acdeg,bcdeg,bdfg,bcefg,abcefg,bde,abcdefg,bcdefg
1,acdefg,ac,bcefg,abceg,abcd,abdeg,abdefg,acg,abcdefg,abcdeg
2,abcdeg,de,bcefg,bdefg,adef,abdfg,abcdfg,deg,abcdefg,abdefg
3,abcdeg,ae,abdfg,adefg,acef,cdefg,bcdefg,ade,abcdefg,acdefg
4,abcefg,ef,acdfg,cdefg,bdef,bcdeg,abcdeg,cef,abcdefg,bcdefg
...,...,...,...,...,...,...,...,...,...,...
195,abdefg,ad,bcdeg,abcdg,acdf,abcfg,abcefg,adg,abcdefg,abcdfg
196,acdefg,cf,abdfg,abcdf,bcef,abcde,abcdeg,cdf,abcdefg,abcdef
197,abcefg,eg,bcdef,cdefg,adeg,acdfg,abcdfg,efg,abcdefg,acdefg
198,abcdef,ae,abdfg,abefg,aceg,bcefg,bcdefg,abe,abcdefg,abcefg


Let's go ahead and find the displayed value for each row.

In [14]:
def find_value(row,display_cols,known_cols):
    """Decode the displayed number of one row.

    Each value under `display_cols` is looked up among the values under
    `known_cols`; the label of the first match is taken as that position's
    digit, and the digits are concatenated and converted to an int.

    Parameters
    ----------
    row : pandas.Series
    display_cols : list of labels
        Labels of the entries to decode, most significant digit first.
    known_cols : list of labels
        Labels whose values are the known patterns; the label itself is used
        as the digit.

    Raises
    ------
    IndexError
        If a displayed value matches none of the known patterns.
    """
    decoder = row[known_cols]
    digits = []
    for pattern in row[display_cols]:
        is_match = (decoder == pattern).to_numpy()
        digits.append(str(decoder.index[is_match][0]))
    return int(''.join(digits))
# Decode the four output columns (20..23) of every row, using the digit
# columns 0..9 as the lookup key.
line_df['displayed'] = line_df.apply(
    find_value,
    axis=1,
    args=([20, 21, 22, 23], list(range(10))),
)

In [15]:
# Part 2 answer: the sum of the decoded values of all rows.
line_df['displayed'].sum()

1068933

In [16]:
# Show the decoded value of each row.
line_df['displayed']

0      2304
1      2211
2      9335
3      8816
4      1583
       ... 
195    5013
196    3807
197    3729
198    5117
199    1309
Name: displayed, Length: 200, dtype: int64