In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Load every raw-material CSV file, add a 'Materials' column holding the
# material name (taken from the file name), and concatenate all of them
# into a single DataFrame named `materials`.

import glob
import re
from pathlib import Path

# Sorted so that the row order of `materials` does not depend on the order
# in which the file system happens to list the files.
files = sorted(glob.glob('data/raw materials/*.csv'))

mat_names = []
mat_list = []
for file in files:
    # Path.stem drops the directory and the '.csv' extension regardless of
    # the path separator. The previous regex required a backslash, so it
    # only produced names on Windows and yielded '' everywhere else.
    name_string = Path(file).stem
    mat_names.append(name_string)
    df = pd.read_csv(file, index_col=None, header=0)
    df['Materials'] = name_string
    mat_list.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
materials = pd.concat(mat_list, axis=0, ignore_index=True)
  

In [3]:
# Show the material names that were extracted from the CSV file names.
mat_names

['copper',
 'cotton',
 'gasnat_price',
 'gold',
 'petrol_brent',
 'rice',
 'soy',
 'wheat']

In [4]:
# Preview the first rows of the combined, still unparsed data.
materials.head()

Unnamed: 0,Fecha,Último,Apertura,Máximo,Mínimo,Vol.,% var.,Materials
0,23.10.2019,2671,2635,2675,2615,"75,68K","1,44%",copper
1,22.10.2019,2633,2645,2651,2630,"46,14K","-0,53%",copper
2,21.10.2019,2647,2639,2660,2631,"54,49K","0,42%",copper
3,18.10.2019,2636,2600,2646,2585,"71,81K","1,46%",copper
4,17.10.2019,2598,2587,2618,2577,"53,90K","0,31%",copper


In [5]:
# (rows, columns) of the combined frame.
materials.shape

(20985, 8)

In [6]:
# Replace the original Spanish headers with English names.
# The assignment is positional: one entry per existing column, in order.
materials.columns = [
    'Date',
    'Last',
    'Open',
    'Max',
    'Min',
    'Vol',
    'Var_%',
    'Materials',
]

In [7]:
# The raw dates are day.month.year strings (e.g. 23.10.2019);
# parse them into datetime values using an explicit format.
materials['Date'] = pd.to_datetime(
    materials['Date'],
    format='%d.%m.%Y',
)
materials.head()

Unnamed: 0,Date,Last,Open,Max,Min,Vol,Var_%,Materials
0,2019-10-23,2671,2635,2675,2615,"75,68K","1,44%",copper
1,2019-10-22,2633,2645,2651,2630,"46,14K","-0,53%",copper
2,2019-10-21,2647,2639,2660,2631,"54,49K","0,42%",copper
3,2019-10-18,2636,2600,2646,2585,"71,81K","1,46%",copper
4,2019-10-17,2598,2587,2618,2577,"53,90K","0,31%",copper


In [8]:
def number_formatting(x):
    """Convert a European-formatted number string to dot-decimal form.

    Every '.' (thousands separator) is removed and every ',' (decimal
    comma) is replaced by '.', e.g. '1.234,56' -> '1234.56'.

    Parameters
    ----------
    x : str or any
        Raw cell value.

    Returns
    -------
    str or any
        The reformatted string. Non-string values are returned unchanged,
        so missing values (NaN) and numbers that were already converted do
        not raise AttributeError, and the calling cell can be re-run.
    """
    if not isinstance(x, str):
        return x
    return x.replace('.', '').replace(',', '.')

# Reformat each price column value by value, then cast the column to a
# numeric dtype in the same step.
for col in ['Last', 'Open', 'Max', 'Min']:
    materials[col] = pd.to_numeric(materials[col].apply(number_formatting))

materials.head()

Unnamed: 0,Date,Last,Open,Max,Min,Vol,Var_%,Materials
0,2019-10-23,2.671,2.635,2.675,2.615,"75,68K","1,44%",copper
1,2019-10-22,2.633,2.645,2.651,2.63,"46,14K","-0,53%",copper
2,2019-10-21,2.647,2.639,2.66,2.631,"54,49K","0,42%",copper
3,2019-10-18,2.636,2.6,2.646,2.585,"71,81K","1,46%",copper
4,2019-10-17,2.598,2.587,2.618,2.577,"53,90K","0,31%",copper


In [9]:
# List the distinct raw volume strings before converting them to numbers.
materials.Vol.unique()

array(['75,68K', '46,14K', '54,49K', ..., '46,19K', '31,35K', '26,20K'],
      dtype=object)

In [10]:
def volume_change(x):
    """Convert a raw volume string to a dot-decimal value in thousands.

    '75,68K' -> '75.68' (the 'K' is stripped, the decimal comma becomes a dot).
    Values with an 'M' or 'B' suffix are scaled into thousands instead of
    being silently discarded: '1,5M' -> '1500.0'.
    NOTE(review): assumes 'M' means millions and 'B' means billions -
    confirm against the data source.

    Parameters
    ----------
    x : str or any
        Raw cell value.

    Returns
    -------
    str, None or any
        A numeric string for recognised suffixes, None for strings without
        a recognised suffix (these become NaN after `pd.to_numeric`).
        Non-string values are returned unchanged, so missing values or
        already-converted numbers do not raise TypeError.
    """
    if not isinstance(x, str):
        return x
    if 'K' in x:
        return x.replace('K', '').replace(',', '.')
    # Larger units are expressed in thousands so the column keeps one unit.
    for suffix, factor in (('M', 1e3), ('B', 1e6)):
        if suffix in x:
            number = x.replace(suffix, '').replace(',', '.')
            return str(float(number) * factor)
    return None
    
            
# Parse every raw volume string, then cast the column to a numeric dtype.
# Values for which volume_change returns None end up as NaN.
materials['Vol'] = pd.to_numeric(
    materials['Vol'].apply(volume_change)
)

In [11]:
# The 'K' suffix was stripped from the values, so keep the unit in the
# column name instead.
materials = materials.rename(
    columns={'Vol': 'Vol_K'},
)

In [12]:
# Preview the frame after the volume conversion and column rename.
materials.head()

Unnamed: 0,Date,Last,Open,Max,Min,Vol_K,Var_%,Materials
0,2019-10-23,2.671,2.635,2.675,2.615,75.68,"1,44%",copper
1,2019-10-22,2.633,2.645,2.651,2.63,46.14,"-0,53%",copper
2,2019-10-21,2.647,2.639,2.66,2.631,54.49,"0,42%",copper
3,2019-10-18,2.636,2.6,2.646,2.585,71.81,"1,46%",copper
4,2019-10-17,2.598,2.587,2.618,2.577,53.9,"0,31%",copper


In [13]:
# Count missing values per column. Vol_K is NaN wherever volume_change
# returned None for the raw volume string.
materials.isna().sum()

Date            0
Last            0
Open            0
Max             0
Min             0
Vol_K        8353
Var_%           0
Materials       0
dtype: int64

In [14]:
def var_reformatting(x):
    """Convert a percentage string such as '-0,53%' to '-0.53'.

    The decimal comma is replaced by a dot and the '%' sign is removed.

    Parameters
    ----------
    x : str or any
        Raw cell value.

    Returns
    -------
    str or any
        The reformatted string. Non-string values are returned unchanged,
        so missing values (NaN) and numbers that were already converted do
        not raise AttributeError, and the calling cell can be re-run.
    """
    if not isinstance(x, str):
        return x
    return x.replace(',', '.').replace('%', '')

# Strip the percent formatting value by value, then cast to a numeric dtype.
materials['Var_%'] = pd.to_numeric(
    materials['Var_%'].apply(var_reformatting)
)

In [15]:
# Preview the frame after the percentage column has been converted.
materials.head()

Unnamed: 0,Date,Last,Open,Max,Min,Vol_K,Var_%,Materials
0,2019-10-23,2.671,2.635,2.675,2.615,75.68,1.44,copper
1,2019-10-22,2.633,2.645,2.651,2.63,46.14,-0.53,copper
2,2019-10-21,2.647,2.639,2.66,2.631,54.49,0.42,copper
3,2019-10-18,2.636,2.6,2.646,2.585,71.81,1.46,copper
4,2019-10-17,2.598,2.587,2.618,2.577,53.9,0.31,copper


In [16]:
# List the distinct material labels present in the combined frame.
materials['Materials'].unique()

array(['copper', 'cotton', 'gasnat_price', 'gold', 'petrol_brent', 'rice',
       'soy', 'wheat'], dtype=object)

In [18]:
def _material_rows(name):
    """Return the rows of `materials` whose 'Materials' label equals `name`.

    The result is an independent copy that keeps the original row index,
    so later edits to a per-material frame do not trigger pandas'
    SettingWithCopyWarning.
    """
    return materials.loc[materials['Materials'] == name].copy()


# One frame per material. The labels are the CSV file names, which is why
# natural gas and Brent oil use 'gasnat_price' and 'petrol_brent'.
copper_df = _material_rows('copper')
cotton_df = _material_rows('cotton')
gasnat_df = _material_rows('gasnat_price')
gold_df = _material_rows('gold')
petrol_df = _material_rows('petrol_brent')
rice_df = _material_rows('rice')
soy_df = _material_rows('soy')
wheat_df = _material_rows('wheat')

In [19]:
# Preview the wheat subset; its rows keep their index from `materials`.
wheat_df.head()

Unnamed: 0,Date,Last,Open,Max,Min,Vol_K,Var_%,Materials
18244,2019-10-23,521.5,518.75,523.88,516.12,,0.77,wheat
18245,2019-10-22,517.5,523.62,532.12,517.5,,-1.29,wheat
18246,2019-10-21,524.25,530.38,534.88,522.88,,-1.22,wheat
18247,2019-10-18,530.75,525.62,532.38,521.12,,1.1,wheat
18248,2019-10-17,525.0,513.75,526.88,513.12,,2.09,wheat


In [20]:
# Export step, currently disabled: uncomment the lines below to write each
# per-material frame to its own CSV file without the index column.
#copper_df.to_csv('data/raw materials/Clean dfs/copper.csv', index=False)
#cotton_df.to_csv('data/raw materials/Clean dfs/cotton.csv', index=False)
#gasnat_df.to_csv('data/raw materials/Clean dfs/gasnat.csv', index=False)
#gold_df.to_csv('data/raw materials/Clean dfs/gold.csv', index=False)
#petrol_df.to_csv('data/raw materials/Clean dfs/petrol.csv', index=False)
#rice_df.to_csv('data/raw materials/Clean dfs/rice.csv', index=False)
#soy_df.to_csv('data/raw materials/Clean dfs/soy.csv', index=False)
#wheat_df.to_csv('data/raw materials/Clean dfs/wheat.csv', index=False)

