In [1]:
import pandas as pd
import numpy as np
import csv

# Some useful snippets for copying and pasting

1. Reading and Writing to `csv`
1. Use the `logging` module
1. Load all `csv` files in a directory into a dataframe
1. Exclusive deduplication

## Reading and Writing to csv

Other encodings include `utf-8`, `utf-16`, `cp1252` and many others. `latin_1` seems to work well on Windows, especially if you have an Excel sheet that you saved as csv.

In [2]:
# Load the csv into a DataFrame, decoding the file's bytes as latin_1.
df = pd.read_csv('data/file1.csv', encoding='latin_1')

# Preview only the first three rows rather than the whole frame.
df.head(3)

Unnamed: 0,a,b,c
0,-0.403026,0.559875,0.755273
1,-1.795062,-0.195291,-0.550756
2,-2.151132,-0.915356,0.848399


In [3]:
# Write the frame without its index column; csv.QUOTE_ALL wraps every field
# (header and values alike) in double quotes.
df.to_csv('data/file2.csv', index=False, quoting=csv.QUOTE_ALL)

In [4]:
# Shell escape: print the first four lines of the file just written
# (the header plus three data rows).
!head -n 4 data/file2.csv

"a","b","c"
"-0.4030264624911308","0.5598745496930929","0.7552732765837198"
"-1.7950623985555558","-0.19529096820893035","-0.5507564378674887"
"-2.151131926398676","-0.9153555453197798","0.8483986080862593"


## Use `logging`

In [5]:
import logging

# Called without a name, getLogger() returns the root logger.
logger = logging.getLogger()

try:
    1/0
except Exception:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt
    # and SystemExit still propagate instead of being logged and swallowed.
    # logger.exception logs at ERROR level and appends the active traceback.
    logger.exception('Something failed.')

if __name__ == '__main__':
    # NOTE: this runs after the message above was already emitted, so that
    # message is not formatted by this configuration (if nothing configured
    # logging earlier, it went through logging's last-resort stderr handler).
    # Only records logged after this point use the DEBUG-level root handler.
    logging.basicConfig(level=logging.DEBUG)

Something failed.
Traceback (most recent call last):
  File "<ipython-input-5-ff92aff4b25d>", line 5, in <module>
    1/0
ZeroDivisionError: division by zero


## Load all `csv` files in a directory

In [6]:
import sys
import os

class data_loader:
    """Load every csv file in a directory into one DataFrame.

    Attributes:
        path: Directory scanned by ``read_files`` when no path is passed.
        lookupid_col: Name of the id column normalised by ``ten_digit_ids``.
        data: Result of the most recent ``get`` call (``None`` until then).
    """

    def __init__(self):
        self.path = 'data/'
        self.lookupid_col = 'CONSTITUENTLOOKUPID'
        self.data = None

    def read_files(self, path=None):
        """Read all ``.csv`` files found directly inside ``path``.

        Args:
            path: Directory to scan; falls back to ``self.path`` when falsy.

        Returns:
            The files' rows concatenated along axis 0. Each file keeps its own
            row index, so index labels may repeat across files.

        Raises:
            ValueError: If the directory contains no csv files.
        """
        if not path:
            path = self.path

        # Only regular files with a .csv extension are read; other entries
        # (sub-directories, notebook checkpoints, stray text files) would make
        # read_csv fail or pollute the result. Sorting makes the row order
        # reproducible, since os.listdir returns entries in arbitrary order.
        candidates = (
            os.path.join(path, name)
            for name in sorted(os.listdir(path))
            if name.lower().endswith('.csv')
        )
        csv_paths = [p for p in candidates if os.path.isfile(p)]

        if not csv_paths:
            # pd.concat would raise a ValueError here anyway; say why.
            raise ValueError(f'No csv files found in {path!r}')

        data_frames = [pd.read_csv(p, encoding='latin_1') for p in csv_paths]

        return pd.concat(data_frames, axis=0)

    def ten_digit_ids(self, df):
        """Return a copy of ``df`` whose lookup-id column holds 10-char strings.

        Each value is converted with ``str`` and left-padded with zeros to ten
        characters; values longer than ten characters keep only their last
        ten. The input frame is not modified.

        Args:
            df: DataFrame containing the ``self.lookupid_col`` column.

        Returns:
            A new DataFrame with the normalised id column.
        """
        def zerominator(x):
            # Same result as ('0000000000' + str(x))[-10:].
            return str(x).rjust(10, '0')[-10:]

        padded = df[self.lookupid_col].apply(zerominator)

        # assign() builds a new frame instead of writing into the caller's.
        return df.assign(**{self.lookupid_col: padded})

    def get(self):
        """Read the files under ``self.path``, normalise the ids, and cache
        the resulting frame on ``self.data`` before returning it."""
        self.data = (
            self.read_files()
            .pipe(self.ten_digit_ids)
        )
        return self.data

## Exclusive deduplication

Sometimes I need to compare two lists of IDs and keep only the IDs that appear in exactly one of them. Normal deduplication functions keep one copy of each duplicated value. I want no copies of the duplicated values. I just want the values that appear only a single time in the set of values that includes both lists.

Behold. In Python, the answer is always a list comprehension.

In [7]:
def dedupe_exclusive(x, y):
    """Return the items that occur in only one of ``x`` and ``y``.

    The result lists the items of ``x`` that are absent from ``y`` followed by
    the items of ``y`` that are absent from ``x``, each in original order.

    Args:
        x: First iterable of values.
        y: Second iterable of values.

    Returns:
        A list of the values found in exactly one of the two inputs.
    """
    # Materialise both inputs so one-shot iterables (generators, iterators)
    # can be scanned twice: once to iterate and once for membership tests.
    x, y = list(x), list(y)

    try:
        # Set lookups keep the whole operation linear in len(x) + len(y).
        in_x, in_y = set(x), set(y)
    except TypeError:
        # Unhashable items (e.g. nested lists): fall back to list scans.
        in_x, in_y = x, y

    xs = [i for i in x if i not in in_y]
    ys = [i for i in y if i not in in_x]
    return xs + ys

In [8]:
# First sample input: the integers 1 through 8 (the stop value 9 is excluded).
x = np.arange(1, 9)
x

array([1, 2, 3, 4, 5, 6, 7, 8])

In [9]:
# Second sample input: the integers 3 through 10, overlapping x on 3..8.
y = np.arange(3, 11)
y

array([ 3,  4,  5,  6,  7,  8,  9, 10])

In [10]:
# 3..8 occur in both arrays, so only 1, 2, 9 and 10 remain.
dedupe_exclusive(x, y)

[1, 2, 9, 10]