# Some real data wrangling

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive inside the Colab runtime at /content/gdrive so that the
# data files stored on Drive can be reached through the file system.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [43]:
# Where the raw data lives on the mounted Drive.
# The path is relative, so it only resolves while the working directory is
# /content (Drive is mounted at /content/gdrive).
dbdatapath = 'gdrive/My Drive/1_PythonCourse/Mines-ERDS-private/module_4/'
dbfilename = 'Rosie_data.txt'

# Short working names for the seven columns of the file.
column_names = ['th', 'tr', 'dist', 'formation', 'enviro', 'lith', 'confinement']

# header=1 makes pandas treat the second line of the file as the header row and
# skip the line before it; `names` then overrides whatever labels that row holds.
# NOTE(review): this assumes the file starts with two non-data lines - if it has
# only one header line, the first record is being dropped. Confirm against the file.
db = pd.read_csv(dbdatapath + dbfilename, header=1, names=column_names)

In [44]:
# clean db up

# Keep only rows whose thickness is finite and strictly positive, so that the
# log10 transform below is well defined.
# The comparison has to be parenthesised: `&` binds tighter than `>`, so
# `np.isfinite(x) & x > 0` is parsed as `(np.isfinite(x) & x) > 0`, which never
# applies the `> 0` test to the raw values (negative entries can slip through).
db = db[np.isfinite(db['th']) & (db['th'] > 0)]
# and now the same for thinning rate and distance
db = db[np.isfinite(db['tr']) & (db['tr'] > 0)]
# .copy() makes db an independent frame before new columns are added to it
db = db[np.isfinite(db['dist']) & (db['dist'] > 0)].copy()

# now add log10 columns (logth, logtr, logdist)
for col in ('th', 'tr', 'dist'):
    db['log' + col] = np.log10(db[col])

# replace names with just "Lobe"
db = db.replace(['Proximal Lobe', 'Distal Lobe'], 'Lobe')

# make an integer column for environment for easier indexing
env_codes = {'Basin Plain': 0, 'Lobe': 1, 'CL Transition': 2, 'Channel': 3, 'Levee': 4}
# Pad the column with 1 first, then overwrite it label by label.
# NOTE(review): any environment label missing from env_codes keeps the padding
# value 1 and is then indistinguishable from 'Lobe' - confirm that is intended.
db['env_num'] = np.ones(len(db))
for env_name, env_code in env_codes.items():
    db.loc[db['enviro'] == env_name, 'env_num'] = env_code

# now just rearrange the column order
db = db[['th','tr','dist','formation','enviro','env_num','lith','confinement','logth','logtr','logdist']]

In [45]:
# Descriptive replacements for the terse working column names, listed in the
# frame's current column order (the pairing below is purely positional).
new_names = [
    'thickness_m',
    'thinning_rate',
    'distance_m',
    'formation',
    'environment',
    'environment_numeric',
    'lithology_numeric',
    'confinement_numeric',
    'log10thickness_m',
    'log10thinning_rate',
    'log10distance_m',
]
old_names = db.columns
# pair each existing label with its replacement, position by position
name_dict = {old: new for old, new in zip(old_names, new_names)}
db.rename(columns=name_dict, inplace=True)

# summary statistics of the numeric columns, shown as the cell output
db.describe()

Unnamed: 0,thickness_m,thinning_rate,distance_m,environment_numeric,lithology_numeric,confinement_numeric,log10thickness_m,log10thinning_rate,log10distance_m
count,28524.0,28524.0,28524.0,28524.0,28524.0,28524.0,28524.0,28524.0,28524.0
mean,0.484318,0.006383587,2665.638472,1.358926,0.663266,2.872073,-0.614947,-3.127715,2.048124
std,0.730694,0.01861618,7210.794283,1.00974,0.515873,2.11137,0.548231,1.135749,1.104511
min,0.0001,9.531686e-09,1.760801,0.0,0.0,0.0,-4.0,-8.02083,0.24571
25%,0.1,0.0001427275,19.035688,1.0,0.0,1.0,-1.0,-3.845492,1.279569
50%,0.261,0.00112974,54.300688,1.0,1.0,2.0,-0.583359,-2.947022,1.734805
75%,0.62,0.004986389,425.233311,2.0,1.0,5.0,-0.207608,-2.302214,2.628627
max,23.175,1.255403,54749.015748,4.0,2.0,6.0,1.36502,0.098783,4.738376


In [46]:
# Translate the numeric lithology codes into text labels.
# 2 were debrites, but there's only a few beds, so let's just make them NaNs
lith_dict = {0: 'mud', 1: 'sand', 2: np.nan}
db['lithology'] = db.lithology_numeric.map(lith_dict)

# check it: beds per label (NaNs are left out of the groupby). This was a bare
# expression in the middle of the cell, so its result was never displayed.
print(db.groupby('lithology').size())

# the number of NaN labels should equal the number of beds coded 2
print('number of NaNs',db.lithology.isna().sum())
is_debrite = db.lithology_numeric == 2
print('number of 2s:',is_debrite.sum())

number of NaNs 610
number of 2s: 610


In [47]:
# count the beds that fall under each integer confinement code
confinement_groups = db.groupby('confinement_numeric')
confinement_groups.size()

confinement_numeric
0    6390
1    2479
2    5725
3     384
4    2362
5    9710
6    1474
dtype: int64

In [29]:
# now let's get rid of those zeros
# NOTE: this line is chained indexing on purpose - attribute access picks out the
# column first, and the boolean-mask assignment is then applied to that
# intermediate object. It is kept only to trigger the SettingWithCopyWarning
# shown in the output; use a single .loc[mask, column] assignment in real code.
db.confinement_numeric[db.confinement_numeric == 0] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Whoa! What's that warning? 
It is an insidious one, and you will run across it a LOT in pandas. There is an article [here](https://realpython.com/pandas-settingwithcopywarning/#example-of-a-settingwithcopywarning) that gives a good explanation of why this happens and what to do about it. The proper way to make the assignment is with `loc`, as shown below.

But we have already made the assignment, so to do it properly from scratch we would need to reload the data and skip the cell above. Instead, just run the cell below.

In [48]:
# Flag the rows whose confinement code is 0 ...
zeros_mask = db.confinement_numeric == 0

# ... make the column float so it can hold NaN (newer pandas versions deprecate
# the silent int -> float upcast that a NaN assignment would otherwise need) ...
db['confinement_numeric'] = db['confinement_numeric'].astype(float)

# ... and blank those rows out with one .loc call (row mask + column label),
# which writes into db itself instead of into a temporary copy.
db.loc[zeros_mask, 'confinement_numeric'] = np.nan

# check it: the number of True entries in the mask and the number of NaNs in the
# column. The value_counts() call used to be a bare mid-cell expression, so its
# result was never displayed.
print(zeros_mask.value_counts())
print(db.loc[:, 'confinement_numeric'].isna().sum())

6390


In [63]:
# an even better way to do it is to include the 0 --> NaN in the original conversion
conf_names = [
    np.nan,
    'Unconfined Proximal',
    'Unconfined Distal',
    'Confined Proximal',
    'Confined Distal',
    'Semiconfined Proximal',
    'Semiconfined Distal',
    'Point Loma Cabrillo',
]
# One integer code per name, taken from the list position. The hand-written code
# list used to stop at 6 while there are eight names, so zip() silently dropped
# the last one.
# NOTE(review): code 7 <-> 'Point Loma Cabrillo' is assumed from the list order -
# confirm against the data dictionary.
conf_ints = list(range(len(conf_names)))
conf_dict = dict(zip(conf_ints, conf_names))
db['confinement'] = db.confinement_numeric.map(conf_dict)

# check it
print(db.groupby('confinement').size()) # all values except NaNs
print('there are',db.confinement.count(),'non NaN values')  # count() skips NaNs
print('False values are non NaN:',zeros_mask.value_counts())

confinement
Confined Distal          2362
Confined Proximal         384
Semiconfined Distal      1474
Semiconfined Proximal    9710
Unconfined Distal        5725
Unconfined Proximal      2479
dtype: int64
there are 22134 non NaN values
False values are non NaN: False    22134
True      6390
Name: confinement_numeric, dtype: int64


In [64]:
# write the cleaned, relabelled table to a CSV in the same folder the raw data
# was read from
outfilename = 'Fryer_and_Jobe_2019_turbidite_beds.csv'
db.to_csv(dbdatapath + outfilename)