In [1]:
import numpy as np
import csv
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 14})

from sys import stdout

from helpers import *

# Loading the data

In [49]:
ROOT_FOLDER = "../"
DATA_FOLDER = "data/processed/"

INPUT_LAB_SPEC = ROOT_FOLDER + DATA_FOLDER + "data_lab_spec_clean.csv"
INPUT_LAB_CONC = ROOT_FOLDER + DATA_FOLDER + "data_lab_conc_clean.csv"

INPUT_AMB_SPEC = ROOT_FOLDER + DATA_FOLDER + "data_amb_spec_clean.csv"
INPUT_AMB_CONC = ROOT_FOLDER + DATA_FOLDER + "data_amb_conc_clean.csv"

In [3]:
# Import data

file_amb_spec = pd.read_csv(INPUT_AMB_SPEC, index_col="Unnamed: 0")
file_amb_conc = pd.read_csv(INPUT_AMB_CONC, index_col="Unnamed: 0")

data_lab_spec = pd.DataFrame(file_lab_spec)
data_lab_conc = pd.DataFrame(file_lab_conc)

data_amb_spec = pd.DataFrame(file_amb_spec)
data_amb_conc = pd.DataFrame(file_amb_conc)

In [4]:
sites_spec = pd.DataFrame(data_lab_spec.columns, columns = ['Sites'])
sites_spec = sites_spec.drop(sites_spec.index[[0]]).reset_index(drop=True)

sites_conc_train = data_lab_conc[['Sites']].loc[data_lab_conc['TRset'] == 'calibration'].reset_index(drop=True)
sites_conc_test = data_lab_conc[['Sites']].loc[data_lab_conc['TRset'] == 'test'].reset_index(drop=True)

sites_both_train = pd.merge(sites_conc_train, sites_spec, how='inner').values.ravel()
sites_both_test = pd.merge(sites_conc_test, sites_spec, how='inner').values.ravel()

data_lab_spec_train = data_lab_spec[sites_both_train]
data_lab_spec_test = data_lab_spec[sites_both_test]

data_lab_conc_train = data_lab_conc[['Sites', '(NH4)SO4']].loc[data_lab_conc['Sites'].isin(sites_both_train)]
data_lab_conc_test = data_lab_conc[['Sites', '(NH4)SO4']].loc[data_lab_conc['Sites'].isin(sites_both_test)]

# Upsampling laboratory train set

In [9]:
print("Number of laboratory samples : ", len(data_lab_conc))
print("Number of laboratory samples : ", len(data_amb_conc))

Number of laboratory samples :  241
Number of laboratory samples :  4304


We first start to upsample laboratory samples to obtain approximately the same number of samples between lab and amb. We need to multiply by 10 the number of laboratory samples.

Idea : Replicate the concentrations and apply a randomly disturbance within 5% of the initial concentration.

In [32]:
ROOT_FOLDER = "../"
DATA_FOLDER = "data/processed/"

INPUT_LAB_SPEC_TRAIN = ROOT_FOLDER + DATA_FOLDER + "input_lab_spec_train.csv"
INPUT_LAB_SPEC_TEST = ROOT_FOLDER + DATA_FOLDER + "input_lab_spec_test.csv"

INPUT_LAB_CONC_TRAIN = ROOT_FOLDER + DATA_FOLDER + "input_lab_conc_train.csv"
INPUT_LAB_CONC_TEST = ROOT_FOLDER + DATA_FOLDER + "input_lab_conc_test.csv"

INPUT_AMB_SPEC = ROOT_FOLDER + DATA_FOLDER + "input_amb_spec.csv"
INPUT_AMB_CONC = ROOT_FOLDER + DATA_FOLDER + "input_amb_conc.csv"

In [54]:
# Import Laboratory data

file_lab_spec = pd.read_csv(INPUT_LAB_SPEC, header=None)
file_lab_conc = pd.read_csv(INPUT_LAB_CONC, header=None)

data_lab_spec = pd.DataFrame(file_lab_spec).values.T
data_lab_conc = pd.DataFrame(file_lab_conc).values

print(data_lab_spec.shape)
print(data_lab_conc.shape)

X_lab = pd.DataFrame(data_lab_spec)
X_lab['concentration'] = data_lab_conc[:,3]
X_lab['category'] = 'lab'

(240, 2785)
(242, 4)


ValueError: Length of values (242) does not match length of index (240)

In [40]:
# Import Ambient data

file_amb_spec = pd.read_csv(INPUT_AMB_SPEC, header=None)
file_amb_conc = pd.read_csv(INPUT_AMB_CONC, header=None)

X_amb = pd.DataFrame(file_amb_spec).values.T
y_amb = pd.DataFrame(file_amb_conc).values.ravel()

X_amb = pd.DataFrame(X_amb)
X_amb['concentration'] = y_amb
X_amb['category'] = 'amb'

In [55]:
data_lab_conc

array([[nan, 'Sites', 'TRset', '(NH4)SO4'],
       [0.0, 'X01012', 'test', '9.277933998139941'],
       [1.0, 'X01020', 'calibration', '18.385890579514484'],
       [2.0, 'X01021', 'calibration', '19.884996963438915'],
       [3.0, 'X01022', 'test', '22.6589339730599'],
       [4.0, 'X01027', 'calibration', '27.876296348605116'],
       [5.0, 'X01033', 'calibration', '0.0'],
       [6.0, 'X01034', 'test', '0.0'],
       [7.0, 'X01050', 'calibration', '0.0'],
       [8.0, 'X01092', 'calibration', '0.0'],
       [9.0, 'X01093', 'calibration', '0.0'],
       [10.0, 'X01094', 'test', '0.0'],
       [11.0, 'X01095', 'calibration', '0.0'],
       [12.0, 'X01096', 'calibration', '0.0'],
       [13.0, 'X01097', 'test', '0.0'],
       [14.0, 'X01098', 'calibration', '0.0'],
       [15.0, 'X01099', 'calibration', '0.0'],
       [16.0, 'X01101', 'test', '0.0'],
       [17.0, 'X01102', 'calibration', '0.0'],
       [18.0, 'X01103', 'calibration', '0.0'],
       [19.0, 'X01104', 'test', '0.0'],
   

In [31]:
p_data_lab_spec = data_lab_spec.pivot(columns='Wavenumber').reset_index()

MemoryError: Unable to allocate 13.7 GiB for an array with shape (2784, 662592) and data type float64

In [29]:
from imblearn.over_sampling import SMOTE

data_lab_conc['Category'] = 'Lab'
p_data_lab_spec = data_lab_spec.pivot(columns='Wavelength').reset_index()
print (df1)

X_lab = data_lab_conc[['(NH4)SO4']]
y_lab = data_lab_conc[['TRset']]

X_resampled, y_resampled = SMOTE().fit_resample(X, y)

# Count instances of each class
from collections import Counter
print(sorted(Counter(y_resampled).items()))

KeyError: 'searchTerm'

In [28]:
data_lab_spec.head()

Unnamed: 0,Wavenumber,X01012,X01020,X01021,X01022,X01027,X01033,X01034,X01050,X01092,...,X11054,X11055,X11056,X21046,X21047,X21048,X21049,X21050,X21051,X21052
1,3998.423206,-0.002246,-0.001828,-0.001196,-0.001546,-0.001464,-0.001665,-0.001718,-0.002252,-0.001366,...,-0.001412,-0.001581,-0.002158,-0.00059,-0.000876,-0.000956,-0.002222,-0.00169,-0.000941,-0.001959
2,3997.137539,-0.002221,-0.00181,-0.001186,-0.001526,-0.001444,-0.001646,-0.001702,-0.002252,-0.00136,...,-0.001394,-0.001561,-0.002118,-0.000591,-0.00087,-0.000974,-0.00222,-0.00168,-0.00094,-0.001955
3,3995.851872,-0.002192,-0.00179,-0.001173,-0.001497,-0.00142,-0.001626,-0.001678,-0.002241,-0.001346,...,-0.001371,-0.001536,-0.002075,-0.000591,-0.000865,-0.000986,-0.00221,-0.001665,-0.000933,-0.00195
4,3994.566206,-0.002168,-0.001773,-0.001165,-0.001473,-0.001404,-0.001608,-0.001656,-0.002227,-0.00132,...,-0.001354,-0.001513,-0.00204,-0.000587,-0.000858,-0.000985,-0.002188,-0.001644,-0.000916,-0.001939
5,3993.280539,-0.002151,-0.00176,-0.001161,-0.001459,-0.001397,-0.001592,-0.001643,-0.00221,-0.001286,...,-0.001347,-0.001499,-0.002014,-0.000578,-0.000848,-0.000969,-0.00215,-0.001617,-0.000891,-0.001911


In [11]:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=3,                            n_clusters_per_class=1, 
                           weights=[0.01, 0.05, 0.94],class_sep=0.8, random_state=0)

In [14]:
print(y)

AttributeError: 'numpy.ndarray' object has no attribute 'unique'

Sometimes we want to sample in an ublanaced way, so that we upsample datapoints of certain characteristic, and downsample the others. This can be achieved with weights parameter.

In [9]:
#make 10 samples without replacement
#sample1 = df.sample(n = 10, replace = False)

data_lab_conc_up = data_lab_conc.sample(frac=5, replace=True, random_state=1, \
                                        weights=data_lab_conc['(NH4)SO4'])

data_amb_conc_up = data_amb_conc.sample(frac=0.5, replace=True, random_state=1, \
                                        weights=data_amb_conc['(NH4)SO4'])

In [10]:
sample2

Unnamed: 0,Sites,TRset,(NH4)SO4
120,X01315,calibration,79.417226
90,X01285,calibration,25.522581
91,X01286,test,27.50329
117,X01312,calibration,62.608349
100,X01295,test,49.046039
122,X01317,calibration,127.511392
97,X01292,test,42.101517
1,X01020,calibration,18.385891
80,X01275,calibration,8.46582
96,X01291,calibration,37.340733


In [None]:
#on avergage, the samples in the sample produce with reveighting now have higher population, as we wanted!
print(sample1_counties['TotalPop'].mean())
print(sample2_counties['TotalPop'].mean())