# Mining - Drilling data pipeline solution - Sukari

This **solution** will:
1. Read the Mining and Drilling Excel files whose data is currently entered manually every day.
2. Do some data cleaning.
3. Store the cleaned data in a well-structured form in a database.

In [1]:
# import libraries

import glob
import numpy as np
import pandas as pd
from datetime import time
import sqlalchemy as sa
import os
import warnings

# Suppress UserWarning messages that originate from openpyxl modules so they
# do not clutter the notebook output.
# NOTE(review): presumably these are emitted while pandas reads the .xlsx
# workbooks below - confirm which warning is being hidden.
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')

### 1 - Read Excel Files

#### 1 - A - Mining Data
There are 3 excel sheets for Mining:
1. Daily Dispatch Master.
    - This is our main excel sheet, it has data for **Loads**, **Hours** and **SMU**.
    - Loads and Hours are stored as aggregates for every one hour interval, SMU is recorded at shift start and end.
2. Daily Production Report.
    - The monthly truck factor is extracted from it.
3. Daily Production Performance.
    - Has distances for excavators recorded as aggregates per shift.

In [16]:
# Load every dispatch workbook matched by the glob pattern and stack the rows
# of their "Fix Database" sheets into a single DataFrame.
dispatch_path = 'dispatch/*.xlsx'
dispatch_files = glob.glob(dispatch_path)
dispatch_df = pd.DataFrame()
print('Reading Dispatch Files...\n')

# Stop immediately when the pattern matched no workbook.
if not dispatch_files:
    raise ValueError('No files to read!')

print('dispatch files number =', len(dispatch_files))
for file in dispatch_files:
    # Only columns B..W (22 columns) of the sheet are read.
    df = pd.read_excel(io=file, sheet_name="Fix Database", usecols='B:W')
    dispatch_df = pd.concat([dispatch_df, df], ignore_index=True, axis=0)
    column_count = len(dispatch_df.columns)
    print("finished {}".format(file))
    print("Columns = {}".format(column_count))
    # concat keeps the union of column labels, so the accumulated frame grows
    # past 22 columns when a workbook brings headers the others do not have.
    if column_count > 22:
        raise ValueError('Cols are more than 22, check excel sheet format')

print('last dispatch date =', dispatch_df['DATE'].tail(1))
print('\nDispatch Files Reading Ended Successfully...')

Reading dispatch files...

dispatch files number = 1
finished dispatch/01. Database Master (January_2023 ).xlsx
Columns = 22
last dispatch date = 13140   2023-01-04
Name: DATE, dtype: datetime64[ns]

dispatch files reading successfully...


In [26]:
# Read performance master
# For every workbook matched by the glob pattern, two column blocks of the
# "Distance (dsns)" sheet are read: B:D is tagged as the day shift and O:Q as
# the night shift. Both blocks are stacked into performance_df with a 'shift'
# column telling them apart.
performance_path = 'performance/*.xlsx'
performance_files = glob.glob(performance_path)
performance_df = pd.DataFrame()
print('\nReading Performance Files...\n')

# Stop immediately when the pattern matched no workbook.
if not performance_files:
    raise ValueError('No files to read!')

print('performance files number =', len(performance_files))
for file in performance_files:
    # Open the workbook once and read both blocks from the same handle
    # instead of letting read_excel load the file from disk twice.
    with pd.ExcelFile(file) as workbook:
        df_ds = pd.read_excel(io=workbook, sheet_name="Distance (dsns)", skiprows=2, usecols='B,C,D', header=None, names=['DATE','EXC','distance'])
        df_ns = pd.read_excel(io=workbook, sheet_name="Distance (dsns)", skiprows=2, usecols='O,P,Q', header=None, names=['DATE','EXC','distance'])
    df_ds['shift'] = 'day'
    df_ns['shift'] = 'night'
    df = pd.concat([df_ds, df_ns], ignore_index=True, axis=0)
    performance_df = pd.concat([performance_df, df], ignore_index=True, axis=0)
    print("finished {}".format(file))
    print("Columns = {}".format(len(performance_df.columns)))
    # Expected layout: DATE, EXC, distance plus the added 'shift' column.
    if len(performance_df.columns) != 4:
        raise ValueError('Cols are not equal to 4, check excel sheet format')

print('last performance date =', performance_df['DATE'].tail(1))
print('\nPerformance Files Reading Ended Successfully...')

Reading Performance Files...

performance files number = 1
finished performance/DPR Daily Production Report January Performance.xlsx
Columns = 4
last performance date = 249   NaT
Name: DATE, dtype: datetime64[ns]

Performance Files Reading Ended Successfully...


In [39]:
# Read dpr
# Load every dpr workbook matched by the glob pattern and stack columns
# A, H, L, M and N of their 'Data Sheet' sheets into dpr_df.
dpr_path = 'dpr/*.xlsx'
dpr_files = glob.glob(dpr_path)
dpr_df = pd.DataFrame()
print('\nReading dpr Files...\n')

# Stop immediately when the pattern matched no workbook.
if not dpr_files:
    raise ValueError('No files to read!')

print('dpr files number =', len(dpr_files))
for file in dpr_files:
    df = pd.read_excel(file, sheet_name='Data Sheet', usecols="A,H,L,M,N")
    dpr_df = pd.concat([dpr_df, df], ignore_index=True, axis=0)
    print("finished {}".format(file))
    print("Columns = {}".format(len(dpr_df.columns)))
    # Five columns are requested above, so the accumulated frame must keep
    # exactly five; the message now states the number actually checked.
    if len(dpr_df.columns) != 5:
        raise ValueError('Cols are not equal to 5, check excel sheet format')

# These status lines previously said "performance" (copied from the
# performance cell); they now name the dpr data they report on.
print('last dpr date =', dpr_df['Date'].tail(1))
print('\ndpr Files Reading Ended Successfully...')

Reading dpr Files...

dpr files number = 1
finished dpr/DPR-Daily Production Report_January.xlsx
Columns = 5
last performance date = 7134   NaT
Name: Date, dtype: datetime64[ns]

Performance Files Reading Ended Successfully...


#### 1 - B - Drilling Data
There are 2 excel sheets for Drilling:
1. Daily Drilling Data.
    - This is our main excel sheet, it has meters drilled aggregated for each hole.
2. Rigs Data.
    - Has the rig hours aggregated per shift.

In [51]:
# Read daily drilling data

# Read Rigs data


### 2 - Data Cleaning

#### 2 - A - General data cleaning

In [59]:
# Gather the three frames so the same header normalisation runs on each one.
dfs = [dispatch_df, performance_df, dpr_df]

for df in dfs:
    # Lower-case the column labels, then swap spaces for underscores.
    # Assigning to .columns relabels the frame in place, so dispatch_df,
    # performance_df and dpr_df themselves end up with the cleaned names.
    lowered = df.columns.str.lower()
    df.columns = lowered.str.replace(' ', '_')
    print(df.columns)

Index(['date', 'shift', 'time', 'operator_name', 'crew', 'activity',
       'reason_activity', 'engine_state_code', 'equipment_id',
       'equipment_oem_model', 'related_equipment', 'location_pit',
       'material_code', 'material_type_code', 'cost_code', 'entry_type',
       'production_value', 'related_equipment_oem_model', 'destination',
       'activity_code', 'reason_code', 'dispatcher_name'],
      dtype='object')
Index(['date', 'exc', 'distance', 'shift'], dtype='object')
Index(['date', 'entry_type', 'amount', '#loads', 'truck_factor'], dtype='object')
