# Data Input-Output

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.signal
from scipy import interpolate
from copy import deepcopy
import time
import re
import random
import os

In [4]:
# Locate the project root by walking up from the current working directory
# until a folder whose path ends with 'Python_Coding' is reached.
_start_directory = os.getcwd()
while not os.getcwd().endswith('Python_Coding'):
    _previous_directory = os.getcwd()
    os.chdir('..')
    # At a filesystem root (a drive root on Windows, '/' elsewhere) moving up
    # is a no-op; stop there instead of looping forever.
    if os.getcwd() == _previous_directory:
        break

if os.getcwd().endswith('Python_Coding'):
    # Joining with '' appends the trailing path separator the rest of the
    # file expects on these folder addresses.
    Directory_address = os.path.join(os.getcwd(), '')
    Library_address = os.path.join(Directory_address, 'Library', '')
    print('Root Directory: \'%s\' \n  Library path: \'%s\' ' %(Directory_address, Library_address) )
else:
    # Do not leave the process stranded at the filesystem root on failure.
    os.chdir(_start_directory)
    raise ValueError("Could not find the path of 'Python_Coding' folder ")

# Default folders for reading and writing data files.
input_folder_address = os.path.join(Directory_address, 'Inputs', '')
output_folder_address = os.path.join(Directory_address, 'Outputs', '')

# Defaults shared by the import/export helpers.
def_file_name = "data"
file_labels = ""
data_type = ".txt"

Root Directory: 'C:\Users\Terabix\Google_Drive\Python_Coding\' 
  Library path: 'C:\Users\Terabix\Google_Drive\Python_Coding\Library\' 


# Import Data

In [1]:
def _sample_info_from_name(file_name):
    """Parse sample metadata out of a file name such as 'WSe2_3L_A6_2018-02-19'.

    Returns a tuple (date, layer_no, thickness, material, substrate).
    Missing pieces come back as '' (date, layer_no, material); thickness is
    str(layer_no) + 'L' and substrate falls back to 'Air'.
    """
    # single-digit layer count such as 3 from '_3L_'
    layer_match = re.search(r'_(\d)L_', file_name)
    layer_no = int(layer_match.group(1)) if layer_match else ''
    thickness = str(layer_no) + 'L'  # make 2 -> 2L

    # material such as 'WSe2' from 'WSe2_2L_A6_2018-02-19'
    material_match = re.search(r'(.*?2)_', file_name)
    material = material_match.group(1) if material_match else ''

    # date of the experiment (finds 2018-05-30 or 2018_05_30)
    date_match = re.search(r'201\d_\d\d_\d\d|201\d-\d\d-\d\d', file_name)
    date = date_match.group(0) if date_match else ''

    # first known substrate mentioned in the file name, otherwise 'Air'
    substrates = ['PETG', 'PDMS', 'Quartz', 'SiO2', 'PC']
    substrate = next((s for s in substrates if s in file_name), 'Air')

    return date, layer_no, thickness, material, substrate


def _sample_info_from_label(label):
    """Parse (layer, diameter, pressure, flake) from a single column label."""
    layer_match = re.search(r'(\d+)L_', label)
    layer = int(layer_match.group(1)) if layer_match else 1  # probably a 1L

    # matches both '_12_micron_' and '_12micron_'
    diameter_match = re.search(r'_(\d+)_?micron_', label)
    diameter = int(diameter_match.group(1)) if diameter_match else 0

    pressure_match = re.search(r'_(\d+)psi_', label)
    pressure = int(pressure_match.group(1)) if pressure_match else 0

    # flake label such as '#4a' from '_#4a'; '\w' also matches '_', so the
    # first match is trimmed again at the next underscore when there is one
    flake_match = re.search(r'_#\w+', label)
    if flake_match:
        flake = ''.join(re.findall(r'(?:_(#.*?)_|_(#.+))', flake_match.group(0))[0])
    else:
        flake = ''

    return layer, diameter, pressure, flake


def _tidy_label(label):
    """Strip file extensions and turn unit markup into matplotlib mathtext."""
    return (label.replace('.csv', '').replace('.asc', '').replace('.txt', '')
            .replace('micron', r'${\rm \mu}$m')
            .replace(r'\g(m)', r'${\rm \mu}$')
            .replace(r'R/R\-(quartz)', r'R/R$_{\rm Quartz}$')
            .replace('___', '_').replace('__', '_'))


def import_data(input_folder_address=input_folder_address, file_name=def_file_name ,file_labels=file_labels, data_type= ".txt", delimiter=',', header_rows=3, skip_columns=-1,
                numeric = 1, # importing all numeric data, they can be converted to float
               concatenate=1 # concatenate the files imported
               ):
    """Read one or several delimited text files and parse sample metadata.

    file_labels can be used to import multiple files and assign the data to
    dictionaries, or concatenate them: every label is appended to file_name to
    form the name of one file. With the default '' a single file is read.

    Parameters
    ----------
    input_folder_address : folder that contains the file(s).
    file_name : file name without extension; also parsed for date, layer
        number, material and substrate.
    file_labels : '' for a single file, otherwise an iterable of suffixes.
    data_type : file extension including the dot.
    delimiter : column delimiter passed to pandas.read_csv.
    header_rows : number of header rows (0 means no header).
    skip_columns : column position(s) to drop after reading; -1 keeps all.
    numeric : if truthy the values are converted to a float array.
    concatenate : if truthy and several files were read, they are merged
        with concatenate_pd.

    Returns
    -------
    Single file, no layer number in file_name:
        (data_pd, data, data_labels)
    Single file with a layer number, or several files with a layer number:
        (data_pd, data, data_labels, date, layer_no, thickness, material,
         t_sample, layers, diameters, pressures, flakes, substrate)
        For several files without concatenation the first three items are
        dictionaries keyed by file label and the per-column lists belong to
        the last file read.
    Several files, concatenate truthy: the 13-item tuple built from the
        output of concatenate_pd (t_sample is '' without a layer number).
    Several files, no concatenation, no layer number:
        (data_pd, data, data_labels) as dictionaries.
    """
    date, layer_no, thickness, material, substrate = _sample_info_from_name(file_name)

    # Stays '' when the file name has no layer number, so the concatenate
    # return below never hits an unbound name.
    t_sample = ''
    if layer_no:
        # import material thickness
        print('%s layer %s' %(layer_no, material))

        t_layered_pd = import_data(file_name='Layered_thickness_database', data_type= ".csv", delimiter=';', header_rows=1)[0]
        t_sample = t_layered_pd[material][layer_no-1]
        print('%s nm thick %s %s' %(t_sample, thickness, material))

    data_pd = {}; data = {}; data_labels = {}
    layers = {}; diameters = {}; pressures = {}; flakes = {}

    if file_labels == '': # single file to import
        file_labels = ['temp'] # just a dummy dictionary key

    # header argument for pandas: list of row indices for a multi-row header
    if header_rows > 1:
        header = list(range(header_rows))
    elif header_rows == 1:
        header = 0
    else:
        header = None

    for file_label in file_labels:
        # Locate the data folder,name,type
        suffix = '' if file_label == 'temp' else file_label
        data_address = os.path.join(input_folder_address, file_name + suffix + data_type)

        # Import the data
        frame = pd.read_csv(data_address, delimiter=delimiter, header=header)
        if skip_columns != -1:
            frame.drop(frame.columns[skip_columns], axis=1, inplace=True)

        # A trailing delimiter produces an all-NaN last column; drop it
        if pd.isnull(frame.iloc[0,-1]): # first row of the last column
            frame.drop(frame.keys()[-1], axis=1, inplace=True)

        if header_rows > 1:
            frame.rename(columns=lambda x: x.strip(), inplace=True) # ' s0 ' -> 's0'

        data_pd[file_label] = frame

        if numeric:
            # Convert the pd to an np array
            print(frame.values)
            # the builtin float replaces the np.float alias removed from NumPy
            data[file_label] = frame.values.astype(float)
        else:
            data[file_label] = frame.values

        # One label per column: the header levels joined with '_'.
        # str() keeps integer column names (header_rows=0) usable below.
        labels = [str(name) for name in frame.columns.get_level_values(0)]
        for header_row in range(1, header_rows):
            level_names = frame.columns.get_level_values(header_row)
            labels = [head + '_' + str(tail) for head, tail in zip(labels, level_names)]

        layers[file_label] = []; diameters[file_label] = []; pressures[file_label] = []; flakes[file_label] = []

        # Pressure measurements
        # Detect the layer, hole diameter, applied pressure and flake label per column
        for label in labels:
            if file_label != 'temp':
                # multiple files which share the same properties, but whose
                # columns are not labelled properly: fall back on the file name
                label += file_name

            layer, diameter, pressure, flake = _sample_info_from_label(label)
            layers[file_label].append(layer)
            diameters[file_label].append(diameter)
            pressures[file_label].append(pressure)
            flakes[file_label].append(flake)

        data_labels[file_label] = [_tidy_label(label) for label in labels]

    # file_label still names the last file handled by the loop above
    if layer_no:
        if file_label != 'temp':
            print('%s nm thick %s %s' %(t_sample, thickness, material))
        print ('Date: %s' %date)
        print ('Flakes: %s' %flakes[file_label])
        print ('Layers: %s' %layers[file_label])
        print ('Diameters: %s' %diameters[file_label])
        print ('Pressures: %s' %pressures[file_label])
        print ('Substrate: %s' %substrate)

    if file_label == 'temp': # single file imported, return the single element of the dictionaries
        if layer_no:
            return data_pd[file_label], data[file_label], data_labels[file_label], date, layer_no, thickness, material, t_sample, layers[file_label], diameters[file_label], pressures[file_label], flakes[file_label], substrate
        return data_pd[file_label], data[file_label], data_labels[file_label]

    # multiple files imported, return them all
    if concatenate:
        data_pd, data, data_labels, flakes, layers, diameters, pressures = concatenate_pd(data_pd, data_labels,flakes,layers,diameters,pressures)
        return data_pd, data, data_labels, date, layer_no, thickness, material, t_sample, layers, diameters, pressures, flakes, substrate

    if layer_no:
        return data_pd, data, data_labels, date, layer_no, thickness, material, t_sample, layers[file_label], diameters[file_label], pressures[file_label], flakes[file_label], substrate

    return data_pd, data, data_labels

NameError: name 'folder_address' is not defined

In [None]:
# MAY NOT BE IN USE
def import_data2(input_folder_address=input_folder_address, file_name=def_file_name ,file_labels=file_labels, data_type= ".txt", delimiter=',', header_rows=3):
    """Read a single delimited text file into a DataFrame and a float array.

    Parameters
    ----------
    input_folder_address : folder that contains the file.
    file_name : file name without extension.
    file_labels : accepted for signature compatibility; not used here.
    data_type : file extension including the dot.
    delimiter : column delimiter passed to pandas.read_csv.
    header_rows : number of header rows (0 means no header).

    Returns
    -------
    (data_pd, data, data_labels) where data is data_pd converted to float and
    data_labels are the column labels without the first (x-axis) column.
    """
    # Locate the data folder,name,type
    data_address = os.path.join(input_folder_address, file_name + data_type)

    # create a header array if more than 1 header rows
    if header_rows > 1:
        header = list(range(header_rows))
    elif header_rows == 1:
        # pandas counts header rows from 0; header=1 would throw away the
        # first line and use the second one as the header
        header = 0
    else:
        header = None

    # Import the data
    data_pd = pd.read_csv(data_address, delimiter=delimiter, header=header)

    # A trailing delimiter produces an all-NaN last column; drop it
    if pd.isnull(data_pd.iloc[0,-1]): # first row of the last column
        data_pd = data_pd.drop(data_pd.keys()[-1], axis=1)

    if header_rows > 1:
        data_pd.rename(columns=lambda x: x.strip(), inplace=True) # ' s0 ' -> 's0'

    # Convert the pd to a np array (builtin float replaces the removed np.float alias)
    data = data_pd.values.astype(float)

    if header_rows > 1:
        # first header level joined with the last one
        data_labels = list(data_pd.columns.get_level_values(0) + '_' + data_pd.columns.get_level_values(header_rows-1))
    else:
        data_labels = list(data_pd.columns.get_level_values(0))
    del data_labels[:1] # to remove the first column label (it is probably Energy or Wavelength)
    return data_pd, data, data_labels



# Concatenate Data

In [2]:
def concatenate_pd(data_pd,data_labels,flakes,layers,diameters,pressures) :
    """Merge several data sets that share one x-axis into a single table.

    Every argument is a dictionary with the same keys (one per imported
    file). The first column of each DataFrame in data_pd is taken as the
    x-axis, and the first entry of each per-column list belongs to it.

    Returns
    -------
    (data_pd_all, data_all, data_labels_all, flakes_all, layers_all,
     diameters_all, pressures_all): the x-axis followed by the remaining
    columns of every file in dictionary order, the same values as a float
    array, and the matching flattened per-column lists.

    Raises
    ------
    ValueError if data_pd is empty or the files differ in the first or last
    x-axis value.
    """
    if not data_pd:
        raise ValueError('No dataframes to concatenate')

    keys = list(data_pd)

    # do the data files share the same x-range? probably if the first and
    # last x-axis values are the same
    x_first = [data_pd[key].iloc[0,0] for key in keys]
    x_last = [data_pd[key].iloc[-1,0] for key in keys]
    if x_first.count(x_first[0]) != len(x_first) or x_last.count(x_last[0]) != len(x_last):
        raise ValueError('Dataframes do not have the same x-axis')

    # The x-axis column and its metadata are taken from the last file
    reference = keys[-1]
    data_labels_all = [data_labels[reference][0]]
    flakes_all = [flakes[reference][0]]
    layers_all = [layers[reference][0]]
    diameters_all = [diameters[reference][0]]
    pressures_all = [pressures[reference][0]]
    list_data_pd = [data_pd[reference].iloc[:,0]]

    # Iterate over the dictionaries that were passed in, not over the
    # module-level file_labels, which is unrelated to this call
    for key in keys:
        list_data_pd.append(data_pd[key].iloc[:,1:]) # excluding the x-axis
        data_labels_all.extend(data_labels[key][1:])
        flakes_all.extend(flakes[key][1:])
        layers_all.extend(layers[key][1:])
        diameters_all.extend(diameters[key][1:])
        pressures_all.extend(pressures[key][1:])

    data_pd_all = pd.concat(list_data_pd, axis=1) # concatenate
    # Convert the pd to a np array (builtin float replaces the removed np.float alias)
    data_all = data_pd_all.values.astype(float)

    return data_pd_all, data_all, data_labels_all, flakes_all, layers_all, diameters_all, pressures_all

# Exporting Data

In [4]:
import time
# Format the current local time as "_YYYY_MM_DD at HH-MM"; the value is not
# stored, so as a notebook cell this only echoes the resulting string.
time.strftime("_%Y_%m_%d at %H-%M")

'_2019_12_16 at 10-58'

In [21]:
# Export a pandas data frame
def export_pd(output_pd, file_name="out", output_folder_address=output_folder_address, data_type=data_type ,index=False , header=False, delimeter=',' , date_label=None ):
    """Write a DataFrame to a delimited text file without overwriting.

    Parameters
    ----------
    output_pd : DataFrame to write.
    file_name : file name without date label and extension.
    output_folder_address : destination folder.
    data_type : file extension including the dot.
    index, header : passed to DataFrame.to_csv.
    delimeter : column separator (passed to to_csv as sep).
    date_label : text inserted between file name and extension; None (the
        default) means today's date formatted as "_%Y_%m_%d".

    Returns
    -------
    The path that was written; it is useful for finding the file such as to
    attach to an email.
    """
    # Resolve the date at call time; a strftime() default argument would be
    # frozen at the moment the function was defined.
    if date_label is None:
        date_label = time.strftime("_%Y_%m_%d")

    base_name = file_name + date_label
    output_address = os.path.join(output_folder_address, base_name + data_type)

    # Make sure that the file name is unique to avoid overwriting another
    # file, otherwise add a random integer to the file name. The suffix goes
    # in front of the extension so the file type stays recognisable.
    while os.path.exists(output_address):
        print('\'' + output_address +'\' already exists, will try adding a random int to file name')
        random_tag = '_Random # ' + str(random.randint(1, 10000))
        output_address = os.path.join(output_folder_address, base_name + random_tag + data_type)

    output_pd.to_csv(path_or_buf=output_address, index=index, header=header, sep = delimeter)

    return output_address

In [1]:
# Export a numpy array
def export_np(output_np, file_name="out", output_folder_address=output_folder_address, delimiter=",", header=''):
    """Write a NumPy array to a delimited text file.

    Parameters
    ----------
    output_np : array to write.
    file_name : file name without extension; the module-level data_type is
        appended as the extension.
    output_folder_address : destination folder.
    delimiter : column separator passed to numpy.savetxt.
    header : header text written as-is at the top of the file (no comment
        prefix is added).

    Returns
    -------
    The path that was written, mirroring export_pd.
    """
    output_address = os.path.join(output_folder_address, file_name + data_type)
    np.savetxt(output_address, output_np, delimiter=delimiter, header=header, comments='')
    return output_address

NameError: name 'output_folder_address' is not defined

# Create Report in Pandas Dataframe Format

In [None]:
def create_pd(result, data_labels = '', header=''):
    """Arrange row-wise results in a DataFrame with one column per header entry.

    Parameters
    ----------
    result : sequence of rows; row i holds the values that belong to
        data_labels[i], and all rows are read up to the length of the first.
    data_labels : row labels used as the index; the default '' gives a
        plain integer index.
    header : column names, one per value in a row; '#' characters are
        removed from string names.

    Returns
    -------
    DataFrame with len(result) rows (empty when result is empty).

    Raises
    ------
    IndexError if header has fewer names than a row has values.
    """
    # An empty string is only the "no labels" placeholder and is not a valid
    # index for pandas
    index = None if isinstance(data_labels, str) and not data_labels else data_labels
    fit_report = pd.DataFrame(index=index)

    # Nothing to tabulate: return the (possibly labelled) empty frame
    if len(result) == 0:
        return fit_report

    column_count = len(result[0])
    if len(header) < column_count:
        raise IndexError('header has %d names but each result row has %d values' % (len(header), column_count))

    for j in range(column_count):
        fit_report[header[j]] = [item[j] for item in result]

    # Strip '#' from the names; non-string names are kept unchanged
    fit_report.columns = [name.replace('#', '') if isinstance(name, str) else name for name in fit_report.columns]
    return fit_report