# Oil production

## Used libraries

In [1]:
# DATA MANIPULATION, Visualization, and stats
import pandas as pd
from glob import glob
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
import os

## Upload the data

In [2]:
path = 'data' # use your path
all_files = glob(os.path.join(path, "*.csv"))
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

df

ValueError: No objects to concatenate

## To Datetime format

In [None]:
df['Date'] = pd.to_datetime(df['Date'])
df['Date'].dtype

## Explore the dataset

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.duplicated().sum()

### Dealing with missing data

#### Check only NaN features

In [None]:
sorted_nan = df.isnull().sum().sort_values(ascending=False) 

In [None]:
n = 0
f = 0
features_drop = []
for feature, value in dict(sorted_nan).items():
    if value == len(df):
        n += 1
        features_drop.append(feature)
    if value == 0:
        f += 1
        
print(f'there are {n} features with only NaN values')
print('--------------------------------------------')
print(f'there are {f} features with no NaN values')
print('--------------------------------------------')
print(f'there are {df.shape[1] - n - f} features with some NaN values')

#### Check which features have values and NaN:

In [None]:
dic = {}
for feature, value in dict(sorted_nan).items():
    if value < len(df) and value > 0:
        dic[feature] = value
        
dic

#### Check which features with some NaN should be dropped

In [None]:
dic_2 = {}

for feature, value in dic.items():
    dic_2[feature] = round(value/len(df),2) 
    if dic_2[feature] > 0.3:
        features_drop.append(feature)
    

dic_2

##### Check if there's any feature with NaN ratio between 0.3 and 0.8 we can use

In [None]:
#list of features to check

feat_30_80 = []

for feature, value in dic_2.items():
    if value > 0.3 and value < 0.8:
        feat_30_80.append(feature)

print(len(feat_30_80))
feat_30_80

In [None]:
fig, axs = plt.subplots(nrows=8, ncols=3, figsize=(15, 30))

#dataframe with features with NaN ratio between 30% and 80%
df_30_80 = df[df.columns.intersection(feat_30_80)]

for i, col in enumerate(df_30_80.columns):
    row = i // 3
    co = i % 3
    axs[row, co].plot(df[col])
    axs[row, co].set_title(col)

fig.tight_layout()

plt.show()

Apparently no but brainstorm with group and TA

In [None]:
print(f'df should drop {len(features_drop)} features')
print('--------------------------------------------------')
print(f'Total number of features to work with in df: {df.shape[1]-len(features_drop)}')

In [None]:
df.drop(columns=features_drop, inplace = True)

In [None]:
df

#### Check features with <30% NaN -> SimpleImputer

In [None]:
features_to_impute_check = []

for feature, value in dic_2.items():
    if value < 0.3:
        features_to_impute_check.append(feature)

features_to_impute_check

In [None]:
len(features_to_impute_check)

In [None]:
fig, axs = plt.subplots(nrows=11, ncols=3, figsize=(15, 45))

#dataframe with only features to impute
df_to_impute = df[df.columns.intersection(features_to_impute_check)]

for i, col in enumerate(df_to_impute.columns):
    row = i // 3
    co = i % 3
    axs[row, co].plot(df[col])
    axs[row, co].set_title(col)

fig.suptitle('Check features to impute')

fig.tight_layout()

plt.show()

##### Features to be removed after the visual check: 
- Sand Rate
- MPFM NTotal Count Rate
- MPFM N81 Count Rate
- MPFM N356 Count Rate
- MPFM N32 Count Rate
- MPFM GOR
- Dowhole Gauge T
- Dowhole Gauge P

In [None]:
remove = {"Sand Rate", "MPFM NTotal Count Rate", "MPFM N81 Count Rate", "MPFM N356 Count Rate", "MPFM N32 Count Rate", "MPFM GOR", "Downhole Gauge T", "Downhole Gauge P"}
 
features_to_impute = [f for f in features_to_impute_check if f not in remove]
features_to_impute

In [None]:
len(features_to_impute)

#### Imputer... To be continued

#### Choosing the Target

In [None]:
plt.figure(figsize=(15, 8))
plt.plot(df['Date'], df['Qoil MPFM'], **{'color': 'blue', 'linestyle': '','marker': 'o', 'markersize': 3, 'label':'Qoil MPFM'})

plt.title('A well - Oil production ', fontsize=15, color='darkred', weight='bold')
# plt.ylim(top=3000)
plt.ylabel("Oil production", size=16, labelpad=8, color='darkred')
plt.show();

In [None]:
plt.figure(figsize=(8,5))
plt.hist(df['Qoil MPFM'], bins = 100, alpha = 0.5, color = 'blue', label = 'Oil MPFM')
plt.hist(df['Qoil MPFM corrected'], bins = 100, alpha = 0.5, color = 'lightsteelblue', label = 'Oil MPFM corrected')
plt.xlabel("Qoil", size=11)
plt.ylabel("Count", size=11)
plt.title("Target: Qoil MPFM or Oil MPFM corrected?")
plt.legend(loc='upper right')
plt.show();

-->Seems no difference which of the 'Qoil's to use as a *Target*

#### Check for outliers

In [None]:
df_to_impute = df[df.columns.intersection(features_to_impute)]
df_to_impute

In [None]:
variable = 'WHT'

fig, ax = plt.subplots(1,3,figsize=(15,5))

ax[0].set_title(f"Distribution of the {variable}")
sns.histplot(data = df_to_impute, x = f"{variable}", kde=True, ax = ax[0])

ax[1].set_title(f"Boxplot of the {variable}")
sns.boxplot(data = df_to_impute, x = f"{variable}", ax=ax[1])

ax[2].set_title(f"Gaussianity of:the {variable}")
qqplot(df_to_impute[f"{variable}"],line='s',ax=ax[2]);

In [None]:
variable = 'MPFM P'

fig, ax = plt.subplots(1,3,figsize=(15,5))

ax[0].set_title(f"Distribution of the {variable}")
sns.histplot(data = df_to_impute, x = f"{variable}", kde=True, ax = ax[0])

ax[1].set_title(f"Boxplot of the {variable}")
sns.boxplot(data = df_to_impute, x = f"{variable}", ax=ax[1])

ax[2].set_title(f"Gaussianity of:the {variable}")
qqplot(df_to_impute[f"{variable}"],line='s',ax=ax[2]);

#### Scaler... to be continued