# File Manipulation and Basic Operations
- Original Notebook by Lauren Kahre [lekahre@gmail.com](mailto:lekahre@gmail.com)
- Pandas module documentation: https://pandas.pydata.org/pandas-docs/stable/
- Numpy module documentation: https://numpy.org/doc/stable/
- Annotated- use ctrl+F to find needed keywords, or just click on a section heading in Contents below.

## Contents
- [File input](#file_input)
    - [Pandas](#pandas_input)
    - [Numpy](#numpy_input)
    - [Basic Python Lists](#lists_input)
- [Looping](#looping)
    - [For Loops](#for_loops)
    - [While Loops](#while_loops)
    - [Loop Control](#loop_control) 
    
- [File output](#file_output)

<a id="file_input"></a>
### File Input
<a id="pandas_input"></a>
#### Pandas

In [33]:
#Pandas is easy. It's possible to use other methods, but hard with mixed datatypes.
import pandas as pd

#Raise the column display limit first so the preview below shows every column
#(150 is only an example- any value works)
pd.options.display.max_columns = 150

#Load the CSV into a DataFrame (read_csv can also take separators other than commas)
filename = 'felon_disenfranchisement.csv'
data_pandas = pd.read_csv(filename)

#Preview the first rows to confirm the file input worked; drop '.head()' to see all rows
display(data_pandas.head())

Unnamed: 0,state,prison,parole,felony_probation,jail,post_sentence,total,vap,percentage_disenfranchised
0,Alabama,30585.0,6580.0,15626.0,1578.0,231896.0,286266,3755483,0.0762
1,Alaska,5497.0,2035.0,6900.0,7.0,,14439,552166,0.0261
2,Arizona,44509.0,7241.0,51362.0,1341.0,116717.0,221170,5205215,0.0425
3,Arkansas,19224.0,21811.0,24695.0,975.0,,66705,2272904,0.0293
4,California,136302.0,86254.0,,,,222557,30023902,0.0074


<a id="numpy_input"></a>
#### Numpy

In [26]:
#Using Numpy if for some godforsaken reason you can't use Pandas
import numpy as np

#Read the same CSV with NumPy instead of pandas.
filename = 'felon_disenfranchisement.csv'
data_numpy = np.genfromtxt(filename, delimiter=',', names=True, dtype=None)  
    #names=True: the first line is a header; its entries become the field names
    #dtype=None: let genfromtxt pick a type for each column, so mixed datatypes work
    #The result is a structured array- access a column with data_numpy['colname']
    #NOTE: in the sample output below, text comes back as bytes (b'Alabama') and
    #blank integer cells show up as -1

#Show the first five rows to check that the file input worked
display(data_numpy[:5])

array([ (b'Alabama',  30585,  6580, 15626, 1578, 231896, 286266,  3755483,  0.0762),
       (b'Alaska',   5497,  2035,  6900,    7,     -1,  14439,   552166,  0.0261),
       (b'Arizona',  44509,  7241, 51362, 1341, 116717, 221170,  5205215,  0.0425),
       (b'Arkansas',  19224, 21811, 24695,  975,     -1,  66705,  2272904,  0.0293),
       (b'California', 136302, 86254,    -1,   -1,     -1, 222557, 30023902,  0.0074)],
      dtype=[('state', 'S14'), ('prison', '<i8'), ('parole', '<i8'), ('felony_probation', '<i8'), ('jail', '<i8'), ('post_sentence', '<i8'), ('total', '<i8'), ('vap', '<i8'), ('percentage_disenfranchised', '<f8')])

<a id="lists_input"></a>
#### Basic Python Lists

In [46]:
#Basic Python file I/O if you're getting desperate
import csv

filename = 'felon_disenfranchisement.csv'

#Read every row (header included) into a list of lists; every value stays a string.
#The with-block closes the file even if reading fails partway, and newline='' is what
#the csv module asks for so it can handle line endings itself.
with open(filename, newline='') as f:
    data_basic = list(csv.reader(f))

#Check file input to make sure it worked
print(data_basic[:5])

[['state', 'prison', 'parole', 'felony_probation', 'jail', 'post_sentence', 'total', 'vap', 'percentage_disenfranchised'], ['Alabama', '30585', '6580', '15626', '1578', '231896', '286266', '3755483', '0.0762'], ['Alaska', '5497', '2035', '6900', '7', '', '14439', '552166', '0.0261'], ['Arizona', '44509', '7241', '51362', '1341', '116717', '221170', '5205215', '0.0425'], ['Arkansas', '19224', '21811', '24695', '975', '', '66705', '2272904', '0.0293']]


<a id="looping"></a>
### Looping
<a id="for_loops"></a>
#### For Loops
<br>

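- Can loop using index numbers or patterns (iterating directly over the items).
- Python supports single-line loops through comprehensions. Row-by-row operations on a DataFrame are also commonly written in one line by passing a "lambda" (anonymous) function to `apply`.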


In [30]:
#Pandas:
#Option 1 - iterrows: each pass yields an (index label, row) pair
for row_label, record in data_pandas.iterrows():
    #Loop body goes here- nothing for now; an example print is left as a comment
    #print (record['state'], record['jail'])
    pass

#Option 2 - itertuples: each pass yields one row as a named tuple called 'Pandas'
for record in data_pandas.itertuples(name='Pandas', index=True):
    #Loop body goes here- nothing for now; an example print is left as a comment
    #print (record.state, record.jail)
    pass

#Single-line apply changes with lambda function
def replace_blanks(x):
    """Return 0 in place of a missing (NaN) value; return any other value unchanged.

    Parameters:
        x: a single numeric cell value (anything math.isnan accepts).

    Returns:
        0 if x is NaN, otherwise x itself.
    """
    import math    #local import keeps the function self-contained
    if math.isnan(x):
        return 0
    #Must hand the value back- falling through would return None and erase real data
    return x

#axis=1 makes apply call the lambda once per row; the lambda pulls out that row's
#'jail' entry and passes it through replace_blanks. The results replace the column.
jail_cleaned = data_pandas.apply(lambda record: replace_blanks(record['jail']), axis=1)
data_pandas['jail'] = jail_cleaned

In [78]:
#Numpy:
num_rows = data_numpy.shape[0]    #shape[0] is the length of the first axis- here, the number of rows
for i in range(num_rows):    #Loop through using index numbers and range()
    #Loop body goes here. It does nothing for now (pass); an example print is left as a comment
    #print (data_numpy[i])    #to print a specific column instead, access it via
                                #data_numpy['colname'][i]
    pass
#This is set up for the structured array, which is accessed row-by-row. A basic 2d array
#is indexed row first: array[rownum, colnum].

In [77]:
#Basic lists:
for record in data_basic:    #Pattern loop: walks the list directly, one row per pass
    #Loop body goes here- nothing for now; an example print is left as a comment
    #print (record)
    pass

<a id="while_loops"></a>
#### While Loops

In [43]:
#Should be similar with all data types- careful, it's possible to build an infinite loop.
#Slower than for-loops. Use only if you can't reframe it into a for-loop without break
#statement shenanigans. The following example is a bad example for syntax reference only-
#this iteration should be a for-loop.

n = len(data_basic)
i = 0
end = i >= n    #Start out finished if there are no rows: testing 'i == n' only after the
                #increment is never true for an empty list, so the loop would never stop

while not end:
    #do things- gonna tell it to do nothing for now, but print statement in comment
    #print(data_basic[i])
    i += 1
    end = i >= n    #Flag flips to True once every row has been visited


<a id="loop_control"></a>
#### Loop Control

<br>

- Break statements terminate the loop that contains them. In a nested structure, only the innermost loop is terminated.
- Continue statements skip the remaining commands in the current iteration, then proceed to the next iteration.
- Pass statements do nothing. They can be used when syntax requires a statement, but the program requires no action.


In [73]:
for record in data_basic:
    field = record[3]
    if not field:    #Empty string: no value in that column for this row, so end the whole loop
        break
    print(field)
        

felony_probation
15626
6900
51362
24695


In [75]:
for record in data_basic:
    field = record[3]
    if not field:    #Empty string: no value in that column for this row, so skip to the next row
        continue
    print(field)

felony_probation
15626
6900
51362
24695
4074
86886
170194
9863
12365
3426
27323
37761
43215
28463
38870
40867
2952
8097
58123
13352
26475
21464
4114
52654
216033
56908
25164
4109
22101
3148
1116585


In [76]:
#Count all rows, and separately the rows that have a value in column 3.
#Every entry of data_basic is counted, so the header row is included in both totals.
totrows = fullrows = 0

for record in data_basic:
    field = record[3]
    if not field:    #Empty string: nothing to count for this row, so do nothing
        pass
    else:
        fullrows += 1
    totrows += 1

print("Number of rows:", totrows)
print("Full rows:", fullrows)

Number of rows: 52
Full rows: 32


<a id="file_output"></a>
### File Output

In [53]:
#First, with pandas:
#index=False leaves the row index out of the file; otherwise it is written as an extra
#unnamed first column, which comes back as 'Unnamed: 0' when the file is read again.
data_pandas.to_csv('resave_felon_disenfranchisement_pandas.csv', index=False)

#Now numpy:
#tofile is a method of the array itself (there is no np.tofile function). A non-empty
#sep makes it write text instead of raw binary. The file name carries the '_numpy'
#suffix so it matches the pandas and basic-list outputs.
#NOTE(review): tofile does not appear to write the column names- confirm the output
#is in the form you need.
data_numpy.tofile('resave_felon_disenfranchisement_numpy.csv', sep=',')

#Now basic list:
#newline='' lets the csv module control line endings (otherwise extra blank lines can
#appear between rows on Windows); plain 'w' is enough since the file is only written.
with open('resave_felon_disenfranchisement_basic.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, delimiter=',')
    wr.writerows(data_basic)